1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CLEX; -*-
2 ;;; --------------------------------------------------------------------------------------
3 ;;; Title: A flex like scanner generator for Common LISP
4 ;;; Created: 1997-10-12
5 ;;; Author: Gilbert Baumann <gilbert@base-engineering.com>
6 ;;; License: LGPL (See file COPYING for details).
7 ;;; --------------------------------------------------------------------------------------
8 ;;; (c) copyright 1997-1999 by Gilbert Baumann
10 ;;; Permission is hereby granted, free of charge, to any person obtaining
11 ;;; a copy of this software and associated documentation files (the
12 ;;; "Software"), to deal in the Software without restriction, including
13 ;;; without limitation the rights to use, copy, modify, merge, publish,
14 ;;; distribute, sublicense, and/or sell copies of the Software, and to
15 ;;; permit persons to whom the Software is furnished to do so, subject to
16 ;;; the following conditions:
18 ;;; The above copyright notice and this permission notice shall be
19 ;;; included in all copies or substantial portions of the Software.
21 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 ;;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33 ;; ----------------------------------------------------------------------------
34 ;; 2007-04-29 DFL - Represent RANGE directly to cope with character
35 ;; set sizes typical for Unicode.
36 ;; - Disable *full-table-p* by default.
37 ;; - Added SBCL case to the CMUCL workarounds.
39 (defpackage :cxml-clex
42 #:deflexer
#:backup
#:begin
#:initial
#:bag
))
44 (in-package :cxml-clex
)
46 ;;; NOTE -- It turns out that this code is a magintude slower under CMUCL
47 ;;; compared to CLISP or ACL. Probably they do not have a good implementation of
50 ;;; We encode our FSA's directly as linked datastructures; A state is represented by:
52 (defstruct (state (:type vector
))
54 transitions
;simple alist of (sigma . next-state)
55 id
;numeric id of state
56 eps-transitions
) ;list of all states reached by epsilon (empty transitions)
58 (defun destructure-range (x)
60 (values (car x
) (cadr x
))
64 (multiple-value-bind (amin amax
) (destructure-range a
)
65 (multiple-value-bind (bmin bmax
) (destructure-range b
)
69 (flet ((range* (min max
)
71 (push (list min
(1- max
)) result
))))
72 (range* amin
(min bmin amax
))
73 (range* (max amin bmax
) amax
))
76 (defun ranges-range (aa b
)
77 (mapcan (lambda (a) (range- a b
)) aa
))
81 (setf aa
(ranges-range aa l
)))
84 (defun partition-range (a pos
)
85 (multiple-value-bind (min max
) (destructure-range a
)
86 (if (and (< min pos
) (<= pos max
))
87 (list (list min
(1- pos
))
94 (character (char-code x
))))
96 (defun parse-range (range)
98 (list (code (car range
)) (code (cadr range
)))
99 (list (code range
) (code range
))))
101 (defun state-add-link (this range that
)
102 "Add a transition to state `this'; reading `range' proceeds to `that'."
103 (cond ((eq range
'eps
)
104 (pushnew that
(state-eps-transitions this
)))
106 (let ((new (list (parse-range range
))))
107 (dolist (k (state-transitions this
)
108 (push (cons new that
) (state-transitions this
)))
109 (when (eq (cdr k
) that
)
110 (setf new
(ranges- new
(car k
))) ;avoid duplicates
111 (setf (car k
) (append new
(car k
)))
113 ;; split existing ranges to remove overlap
114 (dolist (k (state-transitions this
))
118 (partition-range l pos
))
122 (doit (1+ (cadr n
))))))))))
124 ;;; When constructing FSA's from regular expressions we abstract by the notation
125 ;;; of FSA's as boxen with an entry and an exit state.
132 "Accepts the empty word."
133 (let ((q (make-state)))
134 (make-fsa :start q
:end q
)))
136 (defun fsa-trivial (char)
137 "Accepts the trivial word consisting out of exactly one `char'."
138 (let ((q0 (make-state))
140 (state-add-link q0 char q1
)
141 (make-fsa :start q0
:end q1
)))
143 (defun fsa-concat (a1 a2
)
144 "Concatenation of `a1' and `a2'. Hence `a1 a2'."
145 (state-add-link (fsa-end a1
) 'eps
(fsa-start a2
))
146 (make-fsa :start
(fsa-start a1
)
149 (defun fsa-iterate (a)
150 "Iteration of `a'. Hence `a*'"
151 (let ((q0 (make-state))
153 (state-add-link q0
'eps
(fsa-start a
))
154 (state-add-link q0
'eps q1
)
155 (state-add-link q1
'eps q0
)
156 (state-add-link (fsa-end a
) 'eps q1
)
157 (make-fsa :start q0
:end q1
)))
159 (defun fsa-branch (&rest as
)
160 "Alternation of a0..an; Hence `a0 | a1 | ... | an'."
161 (let ((q0 (make-state))
164 (state-add-link q0
'eps
(fsa-start a
))
165 (state-add-link (fsa-end a
) 'eps q1
))
166 (make-fsa :start q0
:end q1
)))
168 ;;;; ----------------------------------------------------------------------------------------------------
169 ;;;; Converting regular expressions to (ND)FSA
172 ;;; However we choose here a Lispy syntax for regular expressions:
175 ;;; (and a0 .. an) concatation
176 ;;; (or a0 .. an) alternation
179 ;;; Further the abbrevs.:
180 ;;; (+ a) == (and a (* a))
181 ;;; (? a) == (or a (and))
182 ;;; (a0 ... an) == (and a0 ... an)
184 ;;; When a string embeded into a regular expression is seen, the list
185 ;;; of characters is spliced in. So formally:
186 ;;; (a0 .. ai "xyz" aj .. an) == (a0 .. ai #\x #\y #\z aj .. an)
188 ;;; This is useful for matching words:
189 ;;; "foo" --> (and "foo") --> (and #\f #\o #\o) == The word 'foo'
190 ;;; or for denoting small sets:
191 ;;; (or "+-") --> (or #\+ #\-) == One of '+' or '-'
193 (defun loose-eq (x y
)
195 ((and (symbolp x
) (symbolp y
))
196 (string= (symbol-name x
) (symbol-name y
)))))
198 (defun regexp->fsa
(term)
199 (setf term
(regexp-expand-splicing term
))
200 (cond ((and (atom term
) (not (stringp term
)))
202 ((loose-eq (car term
) 'RANGE
)
203 (fsa-trivial (cdr term
)))
204 ((loose-eq (car term
) 'AND
) (regexp/and-
>fsa term
))
205 ((loose-eq (car term
) 'OR
) (regexp/or-
>fsa term
))
206 ((loose-eq (car term
) '*) (fsa-iterate (regexp->fsa
(cadr term
))))
207 ((loose-eq (car term
) '+) (regexp->fsa
`(AND ,(cadr term
) (* ,(cadr term
)))))
208 ((loose-eq (car term
) '?
) (regexp->fsa
`(OR (AND) ,(cadr term
))))
210 (regexp->fsa
`(AND .
,term
))) ))
212 (defun regexp/or-
>fsa
(term)
213 ;; I optimize here a bit: I recognized, that ORs are mainly just
214 ;; (large) sets of characters. The extra epsilon transitions are not
215 ;; neccessary on single atoms, so I omit them here. -- This reduces the
216 ;; number of states quite a bit in the first place.
217 (let ((q0 (make-state))
219 (dolist (a (cdr term
))
221 (state-add-link q0 a q1
))
222 ((let ((a (regexp->fsa a
)))
223 (state-add-link q0
'eps
(fsa-start a
))
224 (state-add-link (fsa-end a
) 'eps q1
)))))
225 (make-fsa :start q0
:end q1
)))
227 (defun regexp/and-
>fsa
(term)
228 (cond ((null (cdr term
)) (fsa-empty))
229 ((null (cddr term
)) (regexp->fsa
(cadr term
)))
230 ((fsa-concat (regexp->fsa
(cadr term
)) (regexp->fsa
`(and .
,(cddr term
)))))))
232 (defun regexp-expand-splicing (term)
234 (mapcan #'(lambda (x)
235 (cond ((stringp x
) (coerce x
'list
))
240 ;;;; ----------------------------------------------------------------------------------------------------
241 ;;;; Converting a ND-FSA to a D-FSA
244 ;;; Since we have to compare and unionfy sets of states a lot, I use bit-vectors
245 ;;; to represent these sets for speed. However let me abstract that a bit:
247 ;;; (All these are defined as macros simply for speed. Inlining would be an
248 ;;; option here, when it would be reliable. With defining macros I enforce
251 (defmacro make-empty-set
(n)
252 "Create the empty set on the domain [0,n)."
253 `(make-array ,n
:element-type
'bit
:initial-element
0))
255 (defmacro nset-put
(bag new
)
256 "Destructively calculate bag = bag U {new}."
257 `(setf (sbit (the (simple-array bit
(*)) ,bag
) (the fixnum
,new
)) 1))
259 (defmacro element-of-set-p
(elm set
)
260 "Determine whether `elm' is element of the set `set'."
261 `(eq 1 (sbit (the (simple-array bit
(*)) ,set
) (the fixnum
,elm
))))
263 (defmacro set-size
(set)
264 "Return the upper bound of the domain of `set'."
267 (defmacro do-bits
((var set
&optional result
) &body body
)
268 "Iterate body with `var' over all elements of `set'."
269 (let ((g/set
(gensym)))
270 `(let ((,g
/set
,set
))
271 (dotimes (,var
(set-size ,g
/set
) ,result
)
272 (when (element-of-set-p ,var
,g
/set
)
275 ;;; Since the sets we defined above only take non-negative integers, we have to
276 ;;; number our states. This is done once by NUMBER-STATES.
278 (defun number-states (starts)
279 "Number all state reachable form `starts', continuosly from 0. Each state got
280 it's number stuck into the `id' slot.
281 Returns two values: `n' the number of states and `tab' a table to lookup a
282 state given the number it got attached to."
284 (tab (make-array 0 :adjustable t
:fill-pointer
0 :initial-element nil
)))
287 (vector-push-extend x tab
300)
288 (setf (state-id x
) (prog1 n
(incf n
)))
289 (dolist (tr (state-transitions x
))
291 (dolist (y (state-eps-transitions x
))
293 (dolist (s starts
) (walk s
))
296 ;;; We need to calculate the epsilon closure of a given state. Due to the
297 ;;; precise workings of our algorithm below, we only need this augmenting
300 (defun fsa-epsilon-closure/set
(x state-set
)
301 "Augment the epsilon closure of the state `state' into `state-set'."
302 (unless (element-of-set-p (state-id x
) state-set
)
303 (nset-put state-set
(state-id x
))
304 (dolist (k (state-eps-transitions x
))
305 (fsa-epsilon-closure/set k state-set
))))
307 (defun ndfsa->dfsa
(starts)
310 (multiple-value-bind (n tab
) (number-states starts
)
311 (labels ((name-state-set (state-set)
312 (or (cdr (assoc state-set known
:test
#'equal
))
313 (let ((new (make-state)))
314 (push (cons state-set new
) known
)
315 (push state-set batch
)
317 (add-state-set (state-set)
318 (let ((new-tr (make-hash-table :test
'equal
))
320 (name (name-state-set state-set
))
322 (do-bits (s0 state-set
)
323 (let ((s (aref tab s0
)))
324 (setf new-final
(max new-final
(state-final s
)))
325 (dolist (tr (state-transitions s
))
328 (let ((looked (gethash z new-tr
)))
330 (fsa-epsilon-closure/set to looked
)
331 (let ((sts (make-empty-set n
)))
332 (fsa-epsilon-closure/set to sts
)
333 (setf (gethash z new-tr
) sts
)))))))))
334 (do ((q (frob2 new-tr
) (cddr q
)))
338 (push (cons z
(name-state-set to
)) new-tr-real
)))
339 (setf (state-transitions name
) new-tr-real
340 (state-final name
) new-final
))))
342 (mapcar #'(lambda (s)
343 (name-state-set (let ((sts (make-empty-set n
)))
344 (fsa-epsilon-closure/set s sts
)
349 (add-state-set (pop batch
)))) ))))
351 (defun frob2 (res &aux res2
)
352 (maphash (lambda (z to
)
353 (do ((p res2
(cddr p
)))
355 (setf res2
(list* (list z
) to res2
)))
356 (when (equal to
(cadr p
))
357 (setf (car p
) (cons z
(car p
)))
362 ;;;; ----------------------------------------------------------------------------------------------------
366 ;;; Features to think about:
367 ;;; - case insensitive scanner
368 ;;; - compression of tables
370 ;;; - non-interactive high speed scanning?
371 ;;; - make BAG a macro? So that non used bags are not considered?
373 ;;; - support for include?
374 ;;; - support for putting back input?
375 ;;; - count lines/columns? Track source?
376 ;;; - richer set of regexp primitives e.g. "[a-z]" style sets
377 ;;; - could we offer complement regexp?
378 ;;; - trailing context
379 ;;; - sub-state stacks?
380 ;;; - user variables to include ['global' / 'lexical']
381 ;;; - identifing sub-expression of regexps (ala \(..\) and \n)
385 (defun loadable-states-form (starts)
389 ;; Leider ist das CMUCL so dumm, dass es scheinbar nicht faehig ist die
390 ;; selbstbezuegliche Structur ',starts in ein FASL file zu dumpen ;-(
391 ;; Deswegen hier dieser read-from-string Hack.
392 (defun loadable-states-form (starts)
393 `(LET ((*PACKAGE
* (FIND-PACKAGE ',(package-name *package
*))))
394 (READ-FROM-STRING ',(let ((*print-circle
* t
)
396 (*print-pretty
* nil
))
397 (prin1-to-string starts
)))))
399 ;;;; ----------------------------------------------------------------------------------------------------
402 (defun parse-char-set (string i
)
406 ;;the first char is special
407 (cond ((char= (char string i
) #\
]) (incf i
) (push #\
] res
))
408 ((char= (char string i
) #\^
) (incf i
) (setq complement-p t
))
409 ((char= (char string i
) #\-
) (incf i
) (push #\- res
)))
411 ((char= (char string i
) #\
])
412 (values (if complement-p
(cons 'cset res
) (cons 'set res
)) (+ i
1)))
413 (cond ((char= (char string
(+ i
1)) #\-
)
415 (push (cons (char string i
) (char string
(+ i
2))) res
)
419 (push (char string i
) res
)
422 ;;;; ------------------------------------------------------------------------------------------
424 (defparameter *full-table-p
* nil
)
426 (defun mungle-transitions (trs)
428 (let ((res (make-array 256 :initial-element nil
)))
430 (dolist (range (car tr
))
432 for code from
(car range
) to
(cadr range
)
433 do
(setf (aref res code
) (cdr tr
)))))
437 (defun over-all-states (fun starts
)
438 ;; Apply `fun' to each state reachable from starts.
441 (unless (member q yet
)
443 (let ((trs (state-transitions q
)))
447 (mapc #'walk starts
))))
449 (defmacro deflexer
(name macro-defs
&rest rule-defs
)
450 (let ((macros nil
) starts clauses
(n-fin 0))
451 (dolist (k macro-defs
)
452 (push (cons (car k
) (sublis macros
(cadr k
))) macros
))
453 ;;canon clauses -- each element of rule-defs becomes (start expr end action)
455 (mapcar #'(lambda (x)
456 (cond ((and (consp (car x
)) (string-equal (caar x
) :in
))
457 (list (cadar x
) (sublis macros
(caddar x
)) (progn (incf n-fin
) n-fin
) (cdr x
)))
458 ((list 'initial
(sublis macros
(car x
)) (progn (incf n-fin
) n-fin
) (cdr x
)))))
459 (reverse rule-defs
)))
460 ;;collect all start states in alist (<name> . <state>)
461 (setq starts
(mapcar #'(lambda (name)
462 (cons name
(make-state)))
463 (remove-duplicates (mapcar #'car rule-defs
))))
465 (dolist (r rule-defs
)
466 (destructuring-bind (start expr end action
) r
467 (let ((q0 (cdr (assoc start starts
)))
468 (fsa (regexp->fsa
`(and ,expr
))))
470 (state-add-link q0
'eps
(fsa-start fsa
))
472 (setf (state-final (fsa-end fsa
)) end
)
473 ;; build a clause for CASE
474 (push `((,end
) .
,action
) clauses
))))
475 ;; hmm... we have to sort the final states after building the dfsa
476 ;; or introduce fixnum identifier and instead of union take the minimum
477 ;; above in ndfsa->dfsa.
479 (mapcar #'(lambda (x y
) (setf (cdr x
) y
))
480 starts
(ndfsa->dfsa
(mapcar #'cdr starts
))))
481 ;;(terpri)(princ `(,(number-states starts) states))(finish-output)
483 (over-all-states (lambda (state)
485 (setf (state-transitions state
)
486 (mungle-transitions (state-transitions state
))))
487 (mapcar #'cdr starts
))
488 (format T
"~&~D states." n
))
489 `(DEFUN ,(intern (format nil
"MAKE-~A-LEXER" name
)) (INPUT)
490 (LET* ((STARTS ,(loadable-states-form starts
))
494 (BAGG/CH
(MAKE-ARRAY 100 :FILL-POINTER
0 :ADJUSTABLE T
495 :ELEMENT-TYPE
'CHARACTER
))
496 (BAGG/STATE
(MAKE-ARRAY 100 :FILL-POINTER
0 :ADJUSTABLE T
))
504 (WHEN (> (LENGTH CH
) 0)
505 (PUSH (CONS 0 CH
) LOOK-AHEAD
)))
506 (T (PUSH CH LOOK-AHEAD
))))
508 (VECTOR-PUSH-EXTEND CH BAGG
/CH
10)
509 (VECTOR-PUSH-EXTEND STATE BAGG
/STATE
10) )
511 (LET ((FP (LENGTH BAGG
/CH
)))
512 (PROG1 (CHAR BAGG
/CH
(1- FP
))
513 (SETF (FILL-POINTER BAGG
/STATE
) (1- FP
))
514 (SETF (FILL-POINTER BAGG
/CH
) (1- FP
)))))
516 (AREF BAGG
/STATE
(1- (LENGTH BAGG
/STATE
))) )
518 (= (LENGTH BAGG
/CH
) 0))
520 (SETF (FILL-POINTER BAGG
/CH
) 0)
521 (SETF (FILL-POINTER BAGG
/STATE
) 0) )
525 (COND ((NULL LOOK-AHEAD
) (READ-CHAR INPUT NIL NIL
))
526 ((CONSP (CAR LOOK-AHEAD
))
527 (LET ((S (CDAR LOOK-AHEAD
)))
529 (CHAR S
(CAAR LOOK-AHEAD
))
530 (INCF (CAAR LOOK-AHEAD
))
531 (WHEN (= (CAAR LOOK-AHEAD
) (LENGTH S
))
533 (T (POP LOOK-AHEAD
)) ))
535 `(FIND-NEXT-STATE (STATE CH
)
537 (SVREF (STATE-TRANSITIONS STATE
) (CHAR-CODE CH
))
539 `(FIND-NEXT-STATE (STATE CH
)
542 (DOLIST (K (STATE-TRANSITIONS STATE
))
544 (WHEN (<= (CAR Q
) (CHAR-CODE CH
) (CADR q
))
545 (RETURN-FROM FOO
(CDR K
))))))))) )
546 (DECLARE (INLINE BACKUP GETCH FIND-NEXT-STATE
)
549 START
(SETQ STATE
(CDR (ASSOC SUB-STATE STARTS
)))
551 (ERROR "Sub-state ~S is not defined." SUB-STATE
))
553 LOOP
(SETQ CH
(GETCH))
554 (LET ((NEXT-STATE (FIND-NEXT-STATE STATE CH
)) )
555 (COND ((NULL NEXT-STATE
)
558 ((OR (EMPTY*?
) (NOT (EQ 0 (TOS*/STATE
)))))
560 (COND ((AND (EMPTY*?
) (NULL CH
))
563 (ERROR "oops at ~A: ~S ~S"
564 (file-position (cxml-rng::stream-source INPUT
))
566 (mapcar #'car
(state-transitions state
))))
568 (LET ((HALTING-STATE (TOS*/STATE
)))
570 (SYMBOL-MACROLET ((BAG (IF BAG
*
572 (SETF BAG
* (STRING*)))))
577 (PUSH* CH
(STATE-FINAL NEXT-STATE
))
578 (SETQ STATE NEXT-STATE
)