1 ;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; -*-
3 ;;; pegutils.lisp --- Utility functions for implementing PEG parsers
5 ;; Copyright (C) 2008 Utz-Uwe Haus <lisp@uuhaus.de>
8 ;; This code is free software; you can redistribute it and/or modify
9 ;; it under the terms of the version 2.1 of the GNU Lesser General
10 ;; Public License as published by the Free Software Foundation, as
11 ;; clarified by the lisp prequel found in LICENSE.
13 ;; This code is distributed in the hope that it will be useful, but
14 ;; without any warranty; without even the implied warranty of
15 ;; merchantability or fitness for a particular purpose. See the GNU
16 ;; Lesser General Public License for more details.
18 ;; Version 2.1 of the GNU Lesser General Public License is in the file
19 ;; LICENSE that was distributed with this file. If it is not
20 ;; present, you can access it from
21 ;; http://www.gnu.org/copyleft/lgpl.txt (until superseded by a
22 ;; newer version) or write to the Free Software Foundation, Inc., 59
23 ;; Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 ;; Some code here is inspired by the metapeg library of John Leuner
32 (eval-when (:load-toplevel
:compile-toplevel
:execute
)
33 (declaim (optimize (speed 0)
38 (in-package #:opossum
)
40 (defparameter *trace
* nil
"If non-nil, do extensive tracing of the parser functions.")
44 (;; these slots are copied when cloning a context for recursion
45 (input :accessor input
:initarg
:input
:initform nil
47 :documentation
"The input string being parsed.")
48 (dst-package :accessor dst-package
:initarg
:dst-package
:initform nil
50 :documentation
"The package into which symbols generated during the parse are interned.")
51 (memotab :accessor memotab
:initarg
:memotab
:initform
(make-hash-table :test
#'equal
)
53 :documentation
"Hash-table used to memoize parsing result. Keyed on (fun . offset) pairs.")
54 ;; these slots are shared by all cloned copies of a context -- use only STORE-ACTION to guarantee consistency
55 (actions :accessor actions
:initarg
:actions
:initform
(make-list 1 :initial-element NIL
)
57 :documentation
"The list of actions accumulated during the parse.")
58 (action-counter :accessor action-counter
:initarg
:action-counter
:initform
'(0)
59 :type
(cons (integer 0) null
)
60 :documentation
"The counter of actions.")
61 ;; these slots are what make a context unique
62 (parent :accessor parent
:initarg
:parent
:initform nil
63 :documentation
"Parent context of this context.")
64 (rule :accessor rule
:initarg
:rule
:initform nil
65 :documentation
"Rule name in this context.")
66 (children :accessor children
:initform nil
)
67 (value :accessor value
:initarg
:value
:initform nil
68 :documentation
"Accumulated value after successful matching of rule in this context.")
69 (start-index :accessor start-index
:initarg
:start-index
:initform nil
70 :documentation
"Position in INPUT where this context starts.")
71 (end-index :accessor end-index
:initarg
:end-index
:initform nil
72 :documentation
"Position in INPUT where this context's match ends.")
73 (depth :accessor depth
:initarg
:depth
:initform
0
74 :documentation
"How deep in the tree is this context?"))
75 (:documentation
"A parser context object."))
77 (defmethod print-object ((obj context
) stream
)
78 (print-unreadable-object (obj stream
:type T
:identity NIL
)
79 (format stream
"rule ~A (~S) value ~S (~D:~D)"
80 (rule obj
) (children obj
) (value obj
) (start-index obj
) (end-index obj
))))
82 (defmethod store-action ((ctx context
) action
)
83 "Store ACTION in context CTX."
84 (let ((a (actions ctx
)))
85 (rplacd a
(cons (car a
) (cdr a
)))
88 (defvar *context
* nil
"The current parser context.")
90 (defun clone-ctx (ctx rule
)
91 "Create clone context of CTX for rule RULE."
92 (make-instance 'context
94 :dst-package
(dst-package ctx
)
95 :memotab
(memotab ctx
)
96 :actions
(actions ctx
)
97 :action-counter
(action-counter ctx
)
100 :start-index
(end-index ctx
)
101 :depth
(1+ (depth ctx
))))
103 (defun ctx-failed-p (ctx)
104 "Check whether CTX failed to match."
105 (null (end-index ctx
)))
107 (defun succeed (ctx value start-index end-index
)
108 "Mark CTX as successful: set VALUE and matched region START-INDEX:END-INDEX."
109 (setf (value ctx
) value
)
110 (setf (start-index ctx
) start-index
)
111 (setf (end-index ctx
) end-index
)
113 ;; (format *trace-output* "Matched: ~A (~D:~D)~%"
114 ;; (rule ctx) (start-index ctx) (end-index ctx)))
118 "Return a failure context generate from *CONTEXT*."
119 (let ((ctx (make-instance 'context
120 :input
(input *context
*)
122 :value
(rule *context
*)
123 :start-index
(start-index *context
*)
124 :end-index
(end-index *context
*)
125 ;; probably some of these copies can be saved
126 :dst-package
(dst-package *context
*)
127 :actions
(actions *context
*)
128 :action-counter
(action-counter *context
*)
129 :depth
(1+ (depth *context
*)))))
131 ;; (format *trace-output* "(failed: ~A ~A ~A)~%"
132 ;; (value ctx) (start-index ctx) (end-index ctx)))
135 (defun find-memoized-value (name offset
&optional
(ctx *context
*))
136 "Return a memoized value for FUN at OFFSET, or NIL."
137 (let ((res (gethash `(,name .
,offset
) (memotab ctx
))))
139 (format *trace-output
* "~vT~A memoized result.~%" (depth ctx
) (if res
"Found" "No")))
142 (defun memoizing (name offset result-ctx
&optional
(ctx *context
*))
144 (format *trace-output
* "~vT(memoizing for ~A/~D)~%" (depth ctx
) name offset
))
145 (setf (gethash `(,name .
,offset
) (memotab ctx
))
150 (defun make-name (rule-string)
151 "Create a symbol suitable for naming the parser function for rule RULE-STRING."
152 (intern (concatenate 'string
"parse-" rule-string
)
153 (dst-package *context
*)))
155 (defun make-action-name (&key ctx
)
156 "Return a symbol suitable to name the next action in the current *CONTEXT*."
157 (incf (car (action-counter *context
*)))
159 (format nil
"opossum-action-~D-srcpos-~D-~D"
160 (car (action-counter *context
*))
163 (format nil
"opossum-action-~D"
164 (car (action-counter *context
*)))) ))
165 (intern aname
(dst-package *context
*))))
167 (defun char-list-to-string (char-list)
168 (coerce char-list
'string
))
170 (defmacro build-parser-function
(name parser
)
171 "Return a function of 1 argument, the offset in *CONTEXT*, parsing using the given PARSER."
173 ,(format nil
"Parse a ~A at the given OFFSET." name
)
174 (let ((indent (depth *context
*)))
175 (when *trace
* (format *trace-output
* "~vTTrying to parse a ~A at pos ~D~%" indent
,name offset
))
176 (or (find-memoized-value ,name offset
)
177 (let* ((*context
* (clone-ctx *context
* ,name
))
178 (result (funcall ,parser offset
)))
180 (error "Parser function ~A did not return a value" ,parser
))
181 (if (ctx-failed-p result
)
183 (when *trace
* (format *trace-output
* "~vT... no ~A at pos ~D~%" indent
,name offset
))
184 (memoizing ,name offset
(fail)))
186 (when *trace
* (format *trace-output
* "~vT... found ~A at ~D:~D~%"
188 ,name
(start-index result
) (end-index result
)))
189 (memoizing ,name offset
(succeed *context
* (value result
) (start-index result
) (end-index result
))))))))))
193 (defun match-string (string)
194 "Return a function of 1 argument, the offset in *CONTEXT*, that tries to match STRING at that position."
196 (let ((input (input *context
*))
197 (len (length string
)))
198 (if (and (>= (length input
) (+ offset len
))
199 (string= string input
:start2 offset
:end2
(+ offset len
)))
200 (succeed (clone-ctx *context
* 'opossum-string
) string offset
(+ offset
(length string
)))
203 (defun match-char (char-list)
205 (let ((input (input *context
*)))
207 ;; (format *trace-output* "match-char: looking for one of `~{~A~}'~%" char-list))
208 (if (and (> (length input
) offset
)
209 (member (char input offset
)
210 char-list
:test
#'char
=))
211 (succeed (clone-ctx *context
* 'opossum-char
) (char input offset
) offset
(+ offset
1))
214 (defun match-octal-char-code (i1 i2 i3
)
215 "Compare the character given by i3 + 8*i2 + 64*i1 to the next input character."
216 (let ((c (+ i3
(* 8 i2
) (* 64 i1
))))
218 (let ((input (input *context
*)))
220 ;; (format *trace-output* "match-octal-char-code: looking for ~D~%" c))
221 (if (and (> (length input
) offset
)
222 (= (char-int (char input offset
)) c
))
223 (succeed (clone-ctx *context
* 'opossum-char
) (char input offset
) offset
(+ offset
1))
226 (defun match-char-range (lower-char upper-char
)
227 "Match characters in the range between LOWER-CHAR and UPPER-CHAR (inclusive) as decided by CL:CHAR-CODE."
229 (let ((input (input *context
*)))
231 ;; (format *trace-output* "match-char-range: looking for ~A-~A~%" lower-char upper-char))
232 (if (and (> (length input
) offset
)
233 (let ((x (char-code (char input offset
))))
234 (and (>= x
(char-code lower-char
))
235 (<= x
(char-code upper-char
)))))
236 (succeed (clone-ctx *context
* 'opossum-char-range
) (char input offset
) offset
(+ offset
1))
239 (defun match-any-char (&optional ignored
)
240 (declare (ignore ignored
))
242 "Match any character at OFFSET, fail only on EOF."
243 (let ((input (input *context
*)))
245 ;; (format *trace-output* "match-any-char~%"))
246 (if (< (1+ offset
) (length input
))
247 (succeed (clone-ctx *context
* 'opossum-anychar
) (char input offset
) offset
(+ offset
1))
250 (defun match-char-class (char-class)
251 "Regexp matching of next input character against [CHAR-CLASS] using cl-ppcre:scan."
252 (declare (type string char-class
))
253 ;; FIXME: could use a pre-computed scanner
254 (let ((cc (format nil
"[~A]" char-class
)))
256 "Match next character at OFFSET against the characters in CHAR-CLASS."
257 (let ((input (input *context
*)))
259 ;; (format *trace-output* "match-char-class on ~A~%"))
260 (if (and (< (1+ offset
) (length input
))
261 (let ((c (char input offset
)))
262 (cl-ppcre:scan cc
(make-string 1 :initial-element c
))))
263 (succeed (clone-ctx *context
* 'opossum-charclass
)
264 (char input offset
) offset
(+ offset
1))
267 (defun fix-escape-sequences (char-list)
268 "Iterate over the list CHAR-LIST, glueing adjacent #\\ #\n and #\\ #\t chars into
269 #\Newline and #\Tab."
271 ((null char-list
) char-list
)
272 ((null (cdr char-list
)) char-list
)
274 (loop :with drop
:= nil
275 :for
(c1 c2
) :on char-list
279 :collect
(if (char= c1
#\\)
281 ((#\n) (setf drop T
) #\Newline
)
282 ((#\t) (setf drop T
) #\Tab
)
283 ((#\r) (setf drop T
) #\Linefeed
)
289 ;; parsing combinator functions cribbed from libmetapeg
291 (defun either (&rest parsers
)
292 "Produce a function that tries each of the functions in PARSERS sequentially until one succeeds and
293 returns the result of that function, or a failure context if none succeeded."
295 (let ((*context
* (clone-ctx *context
* 'opossum-either
)))
297 ;; (format *trace-output* "either: ~A ~A~%" *context* parsers))
298 (loop :for p
:in parsers
299 :as result
= (funcall p offset
)
300 :when
(not (ctx-failed-p result
))
301 :return
(succeed *context
* (value result
) offset
(end-index result
))
302 :finally
(return (fail))))))
305 (defun optional (parser)
307 (let ((*context
* (clone-ctx *context
* 'opossum-optional
)))
309 ;; (format *trace-output* "optional: ~A ~A~%" *context* parser))
310 (let ((result (funcall parser offset
)))
311 (if (ctx-failed-p result
)
312 (succeed *context
* 'optional offset offset
)
313 (succeed *context
* (value result
) offset
(end-index result
)))))))
315 (defun follow (parser)
317 (let ((*context
* (clone-ctx *context
* 'opossum-follow
)))
319 ;; (format *trace-output* "follow: ~A ~A~%" *context* parser))
320 (let ((result (funcall parser offset
)))
321 (if (ctx-failed-p result
)
323 (succeed *context
* (value result
)
324 ;; don't consume input
329 (let ((*context
* (clone-ctx *context
* 'opossum-many
))
330 (start-offset offset
)
333 ;; (format *trace-output* "many: ~A ~A~%" *context* parser))
334 (loop :as result
:= (funcall parser offset
)
335 :while
(not (ctx-failed-p result
))
336 :do
(progn (push (value result
) children
)
337 (setf offset
(end-index result
)))
338 :finally
(return (succeed *context
* (nreverse children
) start-offset offset
))))))
341 (defun many1 (parser)
343 (let* ((*context
* (clone-ctx *context
* 'opossum-many1
))
344 (result (funcall parser offset
)))
346 ;; (format *trace-output* "many1: ~A ~A~%" *context* parser))
347 (if (not (ctx-failed-p result
))
348 (let ((result2 (funcall (many parser
) (end-index result
))))
349 (if (end-index result2
)
350 (succeed *context
* (cons (value result
) (value result2
)) offset
(end-index result2
))
351 (succeed *context
* (value result
) offset
(end-index result
))))
355 (defun seq (&rest parsers
)
357 (assert (> (length parsers
) 0))
358 (let ((*context
* (clone-ctx *context
* 'opossum-seq
))
359 (start-offset offset
)
363 ;; (format *trace-output* "seq: ~A ~A~%" *context* parsers))
365 (loop :for p
:in parsers
366 ;; :do (when *trace* (format *trace-output* " (seq ~A) trying ~A~%" *context* p))
369 (push (succeed (clone-ctx *context
* 'action
) nil offset offset
) child-nodes
)
370 (push p child-values
)
371 (setf (children *context
*) (reverse child-nodes
)))
373 (let ((result (funcall p offset
)))
374 (if (end-index result
)
376 (push result child-nodes
)
377 (push (value result
) child-values
)
378 (setf offset
(end-index result
))
379 (setf (children *context
*) (reverse child-nodes
)))
381 :finally
(return (succeed *context
* (reverse child-values
) start-offset offset
))))))
383 (defun negate (parser)
385 "Return a successful context at OFFSET if PARSER succeeds, without advancing input position."
386 (let ((*context
* (clone-ctx *context
* 'opossum-negate
)))
387 (let ((result (funcall parser offset
)))
388 (if (ctx-failed-p result
)
389 (succeed *context
* 'negate offset offset
)
394 (defun read-stream (stream)
395 "Read STREAM and return a string of all its contents."
397 (loop :as line
:= (read-line stream nil nil
)
399 :do
(setq s
(concatenate 'string s line
)))
402 (defun read-file (file)
403 "Read FILE and return a string of all its contents."
404 (with-open-file (f file
:direction
:input
)
405 (let ((len (file-length f
)))
407 (let ((s (make-string len
)))
412 (defun make-default-dst-package (grammarfile)
413 (let ((pkg (make-package (gensym "opossum-parser"))))
414 (setf (documentation pkg
'cl
:package
)
415 (format T
"Opossum parser for grammar ~A" grammarfile
))
418 (defun get-iso-time ()
419 "Return a string in ISO format for the current time"
420 (multiple-value-bind (second minute hour date month year day daylight-p zone
)
422 (declare (ignore day
))
423 (format nil
"~4D-~2,'0D-~2,'0D-~2,'0D:~2,'0D:~2,'0D (UCT~@D)"
426 (if daylight-p
(1+ (- zone
)) (- zone
)))))
428 (defun cleanup-action-code (code)
429 "Remove trailing newlines so that the string CODE can be printed nicely."
431 (when (char= #\Newline
(char code
(1- (length code
))))
432 (1+ (position #\Newline code
:from-end T
:test
#'char
/=)))))
434 (defmacro checking-parse
(grammarfile parse-file-fun
)
435 (let ((res (gensym "resultctx")))
436 `(let ((,res
(funcall ,parse-file-fun
,grammarfile
*package
*)))
439 (format *error-output
* "Failed to parse PEG grammar ~A~%" ,grammarfile
)
440 (error "Parsing ~A failed: ~A" ,grammarfile
,res
))
441 ((< (end-index ,res
) (length (input ,res
)))
442 (format *error-output
* "Parsed only ~D characters of grammar ~A~%" (end-index ,res
) ,grammarfile
)
446 (defun generate-parser-file (grammarfile dst-package dst-file
&key start-rule
(parse-file-fun (symbol-function 'opossum
:parse-file
)))
447 "Create lisp code in DST-FILE that can be loaded to yield functions to parse using GRAMMARFILE in DST-PACKAGE.
448 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
449 (let* ((*package
* (find-package dst-package
))
450 (result (checking-parse grammarfile parse-file-fun
))
451 ;; FIXME: check for complete parse
452 (*context
* result
) ;; routines in pegutils.lisp expect *context* to be bound properly
453 (dpkg (intern (package-name dst-package
) :keyword
)))
454 (let ((forms (transform (value result
)))
455 (actions (actions result
)))
456 (with-open-file (s dst-file
:direction
:output
:if-exists
:supersede
)
457 (let ((*print-readably
* T
)
459 (*print-circle
* NIL
))
460 (format s
";; This is a Common Lisp peg parser automatically generated by OPOSSUM -*- mode:lisp -*-~%")
461 (format s
";; generated from ~A on ~A~%" grammarfile
(get-iso-time))
462 (prin1 `(eval-when (:load-toplevel
:compile-toplevel
:execute
)
463 (declaim (optimize (speed 0) (safety 3) (debug 3))))
466 (prin1 `(defpackage ,dpkg
468 (:export
:parse-string
:parse-file
:parse-stream
:*trace
*))
471 (prin1 `(in-package ,dpkg
) s
) (terpri s
)
472 ;; First form is taken to be the start rule
473 (let ((entryrule (or (and start-rule
(make-name start-rule
))
474 (and forms
(cadr (first forms
))))))
476 (format *error-output
* "Cannot find entry rule for parser")
479 (format *trace-output
* "Inserting definitions for parser entry points through ~A~%"
482 (prin1 `(defun parse-string (,(intern "s" dst-package
) dst-package
)
483 ,(format nil
"Parse S using grammar ~A starting at ~A" grammarfile entryrule
)
484 (let ((*context
* (make-instance 'opossum
:context
485 :dst-package dst-package
486 :input
,(intern "s" dst-package
))))
487 (funcall (,entryrule
) 0)))
490 (prin1 `(defun parse-file (,(intern "f" dst-package
) dst-package
)
491 ,(format nil
"Parse file F using grammar ~A starting at ~A" grammarfile entryrule
)
492 (parse-string (opossum:read-file
,(intern "f" dst-package
)) dst-package
))
495 (prin1 `(defun parse-stream (,(intern "stream" dst-package
) dst-package
)
496 ,(format nil
"Parse stream F using grammar ~A starting at ~A" grammarfile entryrule
)
497 (parse-string (opossum:read-stream
,(intern "stream" dst-package
)) dst-package
))
500 (loop :for aform
:in forms
501 :do
(when *trace
* (format *trace-output
* "Inserting form ~A~%" aform
))
507 `(defparameter ,(intern "*trace*" dst-package
) nil
508 "When non-nil, the generated parser function log to cl:*trace-output*.")
512 (loop :for
(sym code
) :in actions
513 :when sym
;; the final action is named NIL because we push a
514 ;; NIL ahead of us in store-actions
515 :do
(when *trace
* (format *trace-output
* "Inserting defun for ~A~%" sym
))
516 :and
:do
(format s
"~%(defun ~S (data)~% (declare (ignorable data) (type list data))~% ~A)~%"
517 sym
(cleanup-action-code code
))))))))
519 (defun generate-parser-package (grammarfile &key
(dst-package (make-package (gensym "opossum-parser-")))
520 start-rule
(parse-file-fun (symbol-function 'opossum
:parse-file
)))
521 "Create functions to parse using GRAMMARFILE in DST-PACKAGE, starting ar rule named HEAD.
522 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
523 (let* ((*package
* dst-package
)
524 (result (checking-parse grammarfile parse-file-fun
))
525 ;; FIXME: check for complete parse
526 (*context
* result
)) ;; routines in pegutils.lisp expect *context* to be bound properly)
527 (let ((forms (transform (value result
)))
528 (actions (actions result
)))
529 (format *trace-output
* "Injecting parser functions into ~A~%" dst-package
)
530 (break "~A, ~A" forms actions
)
531 (use-package '(:cl
:opossum
) dst-package
)
532 (let ((entryrule (or (and start-rule
(make-name start-rule
))
533 (and forms
(cadr (first forms
))))))
535 (format *error-output
* "Cannot find entry rule for parser")
538 (format *trace-output
* "Inserting definitions for parser entry points through ~A~%"
541 (compile 'parse-string
542 `(lambda (,(intern "s" dst-package
))
543 ,(format nil
"Parse S using grammar ~A starting at ~A" grammarfile entryrule
)
544 (let ((*context
* (make-instance 'opossum
:context
545 :dst-package
,dst-package
546 :input
,(intern "s" dst-package
))))
547 (funcall (,entryrule
) 0)))))
551 `(lambda (,(intern "f" dst-package
))
552 ,(format nil
"Parse file F using grammar ~A starting at ~A" grammarfile entryrule
)
553 (parse-string (opossum:read-file
,(intern "f" dst-package
))))))
557 (compile 'parse-stream
558 `(lambda (,(intern "stream" dst-package
))
559 ,(format nil
"Parse stream F using grammar ~A starting at ~A" grammarfile entryrule
)
560 (parse-string (opossum:read-stream
,(intern "stream" dst-package
))))))
562 (intern "*TRACE*" dst-package
)
563 (setf (documentation (find-symbol "*TRACE*" dst-package
) 'cl
:variable
)
564 "When non-nil, the generated parser function log to cl:*trace-output*.")
565 (export '(:parse-string
:parse-file
:parse-stream
:*trace
*) dst-package
)
567 (loop :for aform
:in forms
569 (format *trace-output
* "Injecting form ~A~%" aform
))
570 :do
(destructuring-bind (defun-sym name args
&rest body
)
572 (declare (ignore defun-sym
))
574 (compile name
`(lambda ,args
,@body
))) dst-package
)))
575 (loop :for
(sym code
) :in actions
577 :do
(when *trace
* (format *trace-output
* "Injecting definition for ~A~%" sym
))
578 :and
:do
(intern (symbol-name
579 (compile sym
`(lambda (data) (declare (ignorable data
)) ,code
))) dst-package
)))))
583 (defun transform (tree &optional
(depth 0))
586 (if (eq (first tree
) ':action
)
589 (format *trace-output
* "~AFound action ~A~%" (make-string depth
:initial-element
#\Space
) tree
))
591 (let ((data (mapcar #'(lambda (tr) (transform tr
(1+ depth
)))
593 (loop :for el
:in data
594 :when
(and (listp el
)
595 (eq (first el
) ':action
)
596 (symbolp (third el
)))
597 :do
(let ((*package
* (dst-package *context
*))
600 (format *trace-output
* "~&Applying action ~A to ~A~%" action data
))
602 (return-from transform
603 (funcall (symbol-function action
) data
))
604 (undefined-function (x)
606 (format *error-output
* "missing definition for ~A: ~A~%" action x
)
607 ; (break "~A in ~A" action *package*)