make-validating-source
[cxml-rng.git] / clex.lisp
blobe30492b5f4ec08665e152f9d441b747d64a06b22
1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CLEX; -*-
2 ;;; --------------------------------------------------------------------------------------
3 ;;; Title: A flex like scanner generator for Common LISP
4 ;;; Created: 1997-10-12
5 ;;; Author: Gilbert Baumann <gilbert@base-engineering.com>
6 ;;; License: LGPL (See file COPYING for details).
7 ;;; --------------------------------------------------------------------------------------
8 ;;; (c) copyright 1997-1999 by Gilbert Baumann
10 ;;; Permission is hereby granted, free of charge, to any person obtaining
11 ;;; a copy of this software and associated documentation files (the
12 ;;; "Software"), to deal in the Software without restriction, including
13 ;;; without limitation the rights to use, copy, modify, merge, publish,
14 ;;; distribute, sublicense, and/or sell copies of the Software, and to
15 ;;; permit persons to whom the Software is furnished to do so, subject to
16 ;;; the following conditions:
17 ;;;
18 ;;; The above copyright notice and this permission notice shall be
19 ;;; included in all copies or substantial portions of the Software.
20 ;;;
21 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 ;;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 ;;;
30 ;;; Changes
32 ;; When Who What
33 ;; ----------------------------------------------------------------------------
34 ;; 2007-04-29 DFL - Represent RANGE directly to cope with character
35 ;; set sizes typical for Unicode.
36 ;; - Disable *full-table-p* by default.
37 ;; - Added SBCL case to the CMUCL workarounds.
39 (defpackage :cxml-clex
40 (:use :cl :runes)
41 (:export
42 #:deflexer #:backup #:begin #:initial #:bag))
44 (in-package :cxml-clex)
46 ;;; NOTE -- It turns out that this code is a magintude slower under CMUCL
47 ;;; compared to CLISP or ACL. Probably they do not have a good implementation of
48 ;;; bit vectors.
50 ;;; We encode our FSA's directly as linked datastructures; A state is represented by:
52 (defstruct (state (:type vector))
53 (final 0)
54 transitions ;simple alist of (sigma . next-state)
55 id ;numeric id of state
56 eps-transitions) ;list of all states reached by epsilon (empty transitions)
58 (defun destructure-range (x)
59 (if (listp x)
60 (values (car x) (cadr x))
61 (values x x)))
63 (defun range- (a b)
64 (multiple-value-bind (amin amax) (destructure-range a)
65 (multiple-value-bind (bmin bmax) (destructure-range b)
66 (incf amax)
67 (incf bmax)
68 (let ((result nil))
69 (flet ((range* (min max)
70 (when (< min max)
71 (push (list min (1- max)) result))))
72 (range* amin (min bmin amax))
73 (range* (max amin bmax) amax))
74 result))))
76 (defun ranges-range (aa b)
77 (mapcan (lambda (a) (range- a b)) aa))
79 (defun ranges- (aa b)
80 (dolist (l b)
81 (setf aa (ranges-range aa l)))
82 aa)
84 (defun partition-range (a pos)
85 (multiple-value-bind (min max) (destructure-range a)
86 (if (and (< min pos) (<= pos max))
87 (list (list min (1- pos))
88 (list pos max))
89 (list a))))
91 (defun code (x)
92 (typecase x
93 (integer x)
94 (character (char-code x))))
96 (defun parse-range (range)
97 (if (listp range)
98 (list (code (car range)) (code (cadr range)))
99 (list (code range) (code range))))
101 (defun state-add-link (this range that)
102 "Add a transition to state `this'; reading `range' proceeds to `that'."
103 (cond ((eq range 'eps)
104 (pushnew that (state-eps-transitions this)))
106 (let ((new (list (parse-range range))))
107 (dolist (k (state-transitions this)
108 (push (cons new that) (state-transitions this)))
109 (when (eq (cdr k) that)
110 (setf new (ranges- new (car k))) ;avoid duplicates
111 (setf (car k) (append new (car k)))
112 (return nil)))
113 ;; split existing ranges to remove overlap
114 (dolist (k (state-transitions this))
115 (flet ((doit (pos)
116 (setf (car k)
117 (mapcan (lambda (l)
118 (partition-range l pos))
119 (car k)))))
120 (dolist (n new)
121 (doit (car n))
122 (doit (1+ (cadr n))))))))))
124 ;;; When constructing FSA's from regular expressions we abstract by the notation
125 ;;; of FSA's as boxen with an entry and an exit state.
127 (defstruct fsa
128 start ;entry state
129 end) ;exit state
131 (defun fsa-empty ()
132 "Accepts the empty word."
133 (let ((q (make-state)))
134 (make-fsa :start q :end q)))
136 (defun fsa-trivial (char)
137 "Accepts the trivial word consisting out of exactly one `char'."
138 (let ((q0 (make-state))
139 (q1 (make-state)))
140 (state-add-link q0 char q1)
141 (make-fsa :start q0 :end q1)))
143 (defun fsa-concat (a1 a2)
144 "Concatenation of `a1' and `a2'. Hence `a1 a2'."
145 (state-add-link (fsa-end a1) 'eps (fsa-start a2))
146 (make-fsa :start (fsa-start a1)
147 :end (fsa-end a2)))
149 (defun fsa-iterate (a)
150 "Iteration of `a'. Hence `a*'"
151 (let ((q0 (make-state))
152 (q1 (make-state)))
153 (state-add-link q0 'eps (fsa-start a))
154 (state-add-link q0 'eps q1)
155 (state-add-link q1 'eps q0)
156 (state-add-link (fsa-end a) 'eps q1)
157 (make-fsa :start q0 :end q1)))
159 (defun fsa-branch (&rest as)
160 "Alternation of a0..an; Hence `a0 | a1 | ... | an'."
161 (let ((q0 (make-state))
162 (q1 (make-state)))
163 (dolist (a as)
164 (state-add-link q0 'eps (fsa-start a))
165 (state-add-link (fsa-end a) 'eps q1))
166 (make-fsa :start q0 :end q1)))
168 ;;;; ----------------------------------------------------------------------------------------------------
169 ;;;; Converting regular expressions to (ND)FSA
170 ;;;;
172 ;;; However we choose here a Lispy syntax for regular expressions:
174 ;;; a singelton
175 ;;; (and a0 .. an) concatation
176 ;;; (or a0 .. an) alternation
177 ;;; (* a) iteration
179 ;;; Further the abbrevs.:
180 ;;; (+ a) == (and a (* a))
181 ;;; (? a) == (or a (and))
182 ;;; (a0 ... an) == (and a0 ... an)
184 ;;; When a string embeded into a regular expression is seen, the list
185 ;;; of characters is spliced in. So formally:
186 ;;; (a0 .. ai "xyz" aj .. an) == (a0 .. ai #\x #\y #\z aj .. an)
188 ;;; This is useful for matching words:
189 ;;; "foo" --> (and "foo") --> (and #\f #\o #\o) == The word 'foo'
190 ;;; or for denoting small sets:
191 ;;; (or "+-") --> (or #\+ #\-) == One of '+' or '-'
193 (defun loose-eq (x y)
194 (cond ((eq x y))
195 ((and (symbolp x) (symbolp y))
196 (string= (symbol-name x) (symbol-name y)))))
198 (defun regexp->fsa (term)
199 (setf term (regexp-expand-splicing term))
200 (cond ((and (atom term) (not (stringp term)))
201 (fsa-trivial term))
202 ((loose-eq (car term) 'RANGE)
203 (fsa-trivial (cdr term)))
204 ((loose-eq (car term) 'AND) (regexp/and->fsa term))
205 ((loose-eq (car term) 'OR) (regexp/or->fsa term))
206 ((loose-eq (car term) '*) (fsa-iterate (regexp->fsa (cadr term))))
207 ((loose-eq (car term) '+) (regexp->fsa `(AND ,(cadr term) (* ,(cadr term)))))
208 ((loose-eq (car term) '?) (regexp->fsa `(OR (AND) ,(cadr term))))
210 (regexp->fsa `(AND .,term))) ))
212 (defun regexp/or->fsa (term)
213 ;; I optimize here a bit: I recognized, that ORs are mainly just
214 ;; (large) sets of characters. The extra epsilon transitions are not
215 ;; neccessary on single atoms, so I omit them here. -- This reduces the
216 ;; number of states quite a bit in the first place.
217 (let ((q0 (make-state))
218 (q1 (make-state)))
219 (dolist (a (cdr term))
220 (cond ((atom a)
221 (state-add-link q0 a q1))
222 ((let ((a (regexp->fsa a)))
223 (state-add-link q0 'eps (fsa-start a))
224 (state-add-link (fsa-end a) 'eps q1)))))
225 (make-fsa :start q0 :end q1)))
227 (defun regexp/and->fsa (term)
228 (cond ((null (cdr term)) (fsa-empty))
229 ((null (cddr term)) (regexp->fsa (cadr term)))
230 ((fsa-concat (regexp->fsa (cadr term)) (regexp->fsa `(and .,(cddr term)))))))
232 (defun regexp-expand-splicing (term)
233 (cond ((consp term)
234 (mapcan #'(lambda (x)
235 (cond ((stringp x) (coerce x 'list))
236 ((list x))))
237 term))
238 (t term)))
240 ;;;; ----------------------------------------------------------------------------------------------------
241 ;;;; Converting a ND-FSA to a D-FSA
242 ;;;;
244 ;;; Since we have to compare and unionfy sets of states a lot, I use bit-vectors
245 ;;; to represent these sets for speed. However let me abstract that a bit:
247 ;;; (All these are defined as macros simply for speed. Inlining would be an
248 ;;; option here, when it would be reliable. With defining macros I enforce
249 ;;; inlining).
251 (defmacro make-empty-set (n)
252 "Create the empty set on the domain [0,n)."
253 `(make-array ,n :element-type 'bit :initial-element 0))
255 (defmacro nset-put (bag new)
256 "Destructively calculate bag = bag U {new}."
257 `(setf (sbit (the (simple-array bit (*)) ,bag) (the fixnum ,new)) 1))
259 (defmacro element-of-set-p (elm set)
260 "Determine whether `elm' is element of the set `set'."
261 `(eq 1 (sbit (the (simple-array bit (*)) ,set) (the fixnum ,elm))))
263 (defmacro set-size (set)
264 "Return the upper bound of the domain of `set'."
265 `(length ,set))
267 (defmacro do-bits ((var set &optional result) &body body)
268 "Iterate body with `var' over all elements of `set'."
269 (let ((g/set (gensym)))
270 `(let ((,g/set ,set))
271 (dotimes (,var (set-size ,g/set) ,result)
272 (when (element-of-set-p ,var ,g/set)
273 ,@body)))))
275 ;;; Since the sets we defined above only take non-negative integers, we have to
276 ;;; number our states. This is done once by NUMBER-STATES.
278 (defun number-states (starts)
279 "Number all state reachable form `starts', continuosly from 0. Each state got
280 it's number stuck into the `id' slot.
281 Returns two values: `n' the number of states and `tab' a table to lookup a
282 state given the number it got attached to."
283 (let ((n 0)
284 (tab (make-array 0 :adjustable t :fill-pointer 0 :initial-element nil)))
285 (labels ((walk (x)
286 (unless (state-id x)
287 (vector-push-extend x tab 300)
288 (setf (state-id x) (prog1 n (incf n)))
289 (dolist (tr (state-transitions x))
290 (walk (cdr tr)))
291 (dolist (y (state-eps-transitions x))
292 (walk y)))))
293 (dolist (s starts) (walk s))
294 (values n tab))))
296 ;;; We need to calculate the epsilon closure of a given state. Due to the
297 ;;; precise workings of our algorithm below, we only need this augmenting
298 ;;; version.
300 (defun fsa-epsilon-closure/set (x state-set)
301 "Augment the epsilon closure of the state `state' into `state-set'."
302 (unless (element-of-set-p (state-id x) state-set)
303 (nset-put state-set (state-id x))
304 (dolist (k (state-eps-transitions x))
305 (fsa-epsilon-closure/set k state-set))))
307 (defun ndfsa->dfsa (starts)
308 (let ((batch nil)
309 (known nil))
310 (multiple-value-bind (n tab) (number-states starts)
311 (labels ((name-state-set (state-set)
312 (or (cdr (assoc state-set known :test #'equal))
313 (let ((new (make-state)))
314 (push (cons state-set new) known)
315 (push state-set batch)
316 new)))
317 (add-state-set (state-set)
318 (let ((new-tr (make-hash-table :test 'equal))
319 (new-tr-real nil)
320 (name (name-state-set state-set))
321 (new-final 0))
322 (do-bits (s0 state-set)
323 (let ((s (aref tab s0)))
324 (setf new-final (max new-final (state-final s)))
325 (dolist (tr (state-transitions s))
326 (let ((to (cdr tr)))
327 (dolist (z (car tr))
328 (let ((looked (gethash z new-tr)))
329 (if looked
330 (fsa-epsilon-closure/set to looked)
331 (let ((sts (make-empty-set n)))
332 (fsa-epsilon-closure/set to sts)
333 (setf (gethash z new-tr) sts)))))))))
334 (do ((q (frob2 new-tr) (cddr q)))
335 ((null q))
336 (let ((z (car q))
337 (to (cadr q)))
338 (push (cons z (name-state-set to)) new-tr-real)))
339 (setf (state-transitions name) new-tr-real
340 (state-final name) new-final))))
341 (prog1
342 (mapcar #'(lambda (s)
343 (name-state-set (let ((sts (make-empty-set n)))
344 (fsa-epsilon-closure/set s sts)
345 sts)))
346 starts)
347 (do ()
348 ((null batch))
349 (add-state-set (pop batch)))) ))))
351 (defun frob2 (res &aux res2)
352 (maphash (lambda (z to)
353 (do ((p res2 (cddr p)))
354 ((null p)
355 (setf res2 (list* (list z) to res2)))
356 (when (equal to (cadr p))
357 (setf (car p) (cons z (car p)))
358 (return))))
359 res)
360 res2)
362 ;;;; ----------------------------------------------------------------------------------------------------
363 ;;;; API
364 ;;;;
366 ;;; Features to think about:
367 ;;; - case insensitive scanner
368 ;;; - compression of tables
369 ;;; - debugging aids
370 ;;; - non-interactive high speed scanning?
371 ;;; - make BAG a macro? So that non used bags are not considered?
372 ;;; - REJECT?
373 ;;; - support for include?
374 ;;; - support for putting back input?
375 ;;; - count lines/columns? Track source?
376 ;;; - richer set of regexp primitives e.g. "[a-z]" style sets
377 ;;; - could we offer complement regexp?
378 ;;; - trailing context
379 ;;; - sub-state stacks?
380 ;;; - user variables to include ['global' / 'lexical']
381 ;;; - identifing sub-expression of regexps (ala \(..\) and \n)
384 #-(OR CMU SBCL GCL)
385 (defun loadable-states-form (starts)
386 `',starts)
388 #+(OR CMU SBCL GCL)
389 ;; Leider ist das CMUCL so dumm, dass es scheinbar nicht faehig ist die
390 ;; selbstbezuegliche Structur ',starts in ein FASL file zu dumpen ;-(
391 ;; Deswegen hier dieser read-from-string Hack.
392 (defun loadable-states-form (starts)
393 `(LET ((*PACKAGE* (FIND-PACKAGE ',(package-name *package*))))
394 (READ-FROM-STRING ',(let ((*print-circle* t)
395 (*print-readably* t)
396 (*print-pretty* nil))
397 (prin1-to-string starts)))))
399 ;;;; ----------------------------------------------------------------------------------------------------
400 ;;;;
402 (defun parse-char-set (string i)
403 (let ((res nil)
404 (complement-p nil))
405 (incf i) ;skip '['
406 ;;the first char is special
407 (cond ((char= (char string i) #\]) (incf i) (push #\] res))
408 ((char= (char string i) #\^) (incf i) (setq complement-p t))
409 ((char= (char string i) #\-) (incf i) (push #\- res)))
410 (do ()
411 ((char= (char string i) #\])
412 (values (if complement-p (cons 'cset res) (cons 'set res)) (+ i 1)))
413 (cond ((char= (char string (+ i 1)) #\-)
414 ;;it's a range
415 (push (cons (char string i) (char string (+ i 2))) res)
416 (incf i 3))
418 ;;singleton
419 (push (char string i) res)
420 (incf i))))))
422 ;;;; ------------------------------------------------------------------------------------------
424 (defparameter *full-table-p* nil)
426 (defun mungle-transitions (trs)
427 (if *full-table-p*
428 (let ((res (make-array 256 :initial-element nil)))
429 (dolist (tr trs)
430 (dolist (range (car tr))
431 (loop
432 for code from (car range) to (cadr range)
433 do (setf (aref res code) (cdr tr)))))
434 res)
435 trs))
437 (defun over-all-states (fun starts)
438 ;; Apply `fun' to each state reachable from starts.
439 (let ((yet nil))
440 (labels ((walk (q)
441 (unless (member q yet)
442 (push q yet)
443 (let ((trs (state-transitions q)))
444 (funcall fun q)
445 (dolist (tr trs)
446 (walk (cdr tr)))))))
447 (mapc #'walk starts))))
449 (defmacro deflexer (name macro-defs &rest rule-defs)
450 (let ((macros nil) starts clauses (n-fin 0))
451 (dolist (k macro-defs)
452 (push (cons (car k) (sublis macros (cadr k))) macros))
453 ;;canon clauses -- each element of rule-defs becomes (start expr end action)
454 (setq rule-defs
455 (mapcar #'(lambda (x)
456 (cond ((and (consp (car x)) (string-equal (caar x) :in))
457 (list (cadar x) (sublis macros (caddar x)) (progn (incf n-fin) n-fin) (cdr x)))
458 ((list 'initial (sublis macros (car x)) (progn (incf n-fin) n-fin) (cdr x)))))
459 (reverse rule-defs)))
460 ;;collect all start states in alist (<name> . <state>)
461 (setq starts (mapcar #'(lambda (name)
462 (cons name (make-state)))
463 (remove-duplicates (mapcar #'car rule-defs))))
464 ;;build the nd-fsa's
465 (dolist (r rule-defs)
466 (destructuring-bind (start expr end action) r
467 (let ((q0 (cdr (assoc start starts)))
468 (fsa (regexp->fsa `(and ,expr))))
469 ;;link start state
470 (state-add-link q0 'eps (fsa-start fsa))
471 ;;mark final state
472 (setf (state-final (fsa-end fsa)) end)
473 ;; build a clause for CASE
474 (push `((,end) .,action) clauses))))
475 ;; hmm... we have to sort the final states after building the dfsa
476 ;; or introduce fixnum identifier and instead of union take the minimum
477 ;; above in ndfsa->dfsa.
478 (progn
479 (mapcar #'(lambda (x y) (setf (cdr x) y))
480 starts (ndfsa->dfsa (mapcar #'cdr starts))))
481 ;;(terpri)(princ `(,(number-states starts) states))(finish-output)
482 (let ((n 0))
483 (over-all-states (lambda (state)
484 (incf n)
485 (setf (state-transitions state)
486 (mungle-transitions (state-transitions state))))
487 (mapcar #'cdr starts))
488 (format T "~&~D states." n))
489 `(DEFUN ,(intern (format nil "MAKE-~A-LEXER" name)) (INPUT)
490 (LET* ((STARTS ,(loadable-states-form starts))
491 (SUB-STATE 'INITIAL)
492 (STATE NIL)
493 (LOOK-AHEAD NIL)
494 (BAGG/CH (MAKE-ARRAY 100 :FILL-POINTER 0 :ADJUSTABLE T
495 :ELEMENT-TYPE 'CHARACTER))
496 (BAGG/STATE (MAKE-ARRAY 100 :FILL-POINTER 0 :ADJUSTABLE T))
497 (CH NIL))
498 #'(LAMBDA ()
499 (BLOCK NIL
500 (LABELS ((BEGIN (X)
501 (SETQ SUB-STATE X))
502 (BACKUP (CH)
503 (COND ((STRINGP CH)
504 (WHEN (> (LENGTH CH) 0)
505 (PUSH (CONS 0 CH) LOOK-AHEAD)))
506 (T (PUSH CH LOOK-AHEAD))))
507 (PUSH* (CH STATE)
508 (VECTOR-PUSH-EXTEND CH BAGG/CH 10)
509 (VECTOR-PUSH-EXTEND STATE BAGG/STATE 10) )
510 (POP*/CH ()
511 (LET ((FP (LENGTH BAGG/CH)))
512 (PROG1 (CHAR BAGG/CH (1- FP))
513 (SETF (FILL-POINTER BAGG/STATE) (1- FP))
514 (SETF (FILL-POINTER BAGG/CH) (1- FP)))))
515 (TOS*/STATE ()
516 (AREF BAGG/STATE (1- (LENGTH BAGG/STATE))) )
517 (EMPTY*? ()
518 (= (LENGTH BAGG/CH) 0))
519 (REWIND* ()
520 (SETF (FILL-POINTER BAGG/CH) 0)
521 (SETF (FILL-POINTER BAGG/STATE) 0) )
522 (STRING* ()
523 (COPY-SEQ BAGG/CH))
524 (GETCH ()
525 (COND ((NULL LOOK-AHEAD) (READ-CHAR INPUT NIL NIL))
526 ((CONSP (CAR LOOK-AHEAD))
527 (LET ((S (CDAR LOOK-AHEAD)))
528 (PROG1
529 (CHAR S (CAAR LOOK-AHEAD))
530 (INCF (CAAR LOOK-AHEAD))
531 (WHEN (= (CAAR LOOK-AHEAD) (LENGTH S))
532 (POP LOOK-AHEAD)))))
533 (T (POP LOOK-AHEAD)) ))
534 ,(if *full-table-p*
535 `(FIND-NEXT-STATE (STATE CH)
536 (IF (CHARACTERP CH)
537 (SVREF (STATE-TRANSITIONS STATE) (CHAR-CODE CH))
538 NIL))
539 `(FIND-NEXT-STATE (STATE CH)
540 (WHEN ch
541 (BLOCK FOO
542 (DOLIST (K (STATE-TRANSITIONS STATE))
543 (DOLIST (Q (CAR K))
544 (WHEN (<= (CAR Q) (CHAR-CODE CH) (CADR q))
545 (RETURN-FROM FOO (CDR K))))))))) )
546 (DECLARE (INLINE BACKUP GETCH FIND-NEXT-STATE)
547 (IGNORABLE #'BEGIN))
548 (TAGBODY
549 START (SETQ STATE (CDR (ASSOC SUB-STATE STARTS)))
550 (WHEN (NULL STATE)
551 (ERROR "Sub-state ~S is not defined." SUB-STATE))
552 (REWIND*)
553 LOOP (SETQ CH (GETCH))
554 (LET ((NEXT-STATE (FIND-NEXT-STATE STATE CH)) )
555 (COND ((NULL NEXT-STATE)
556 (BACKUP CH)
557 (DO ()
558 ((OR (EMPTY*?) (NOT (EQ 0 (TOS*/STATE)))))
559 (BACKUP (POP*/CH)))
560 (COND ((AND (EMPTY*?) (NULL CH))
561 (RETURN :EOF))
562 ((EMPTY*?)
563 (ERROR "oops at ~A: ~S ~S"
564 (file-position (cxml-rng::stream-source INPUT))
566 (mapcar #'car (state-transitions state))))
568 (LET ((HALTING-STATE (TOS*/STATE)))
569 (LET ((BAG* NIL))
570 (SYMBOL-MACROLET ((BAG (IF BAG*
571 BAG*
572 (SETF BAG* (STRING*)))))
573 (CASE HALTING-STATE
574 ,@clauses)))
575 (GO START)))))
577 (PUSH* CH (STATE-FINAL NEXT-STATE))
578 (SETQ STATE NEXT-STATE)
579 (GO LOOP))))))))))))