1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CXML; readtable: runes; -*-
2 ;;; ---------------------------------------------------------------------------
4 ;;; Created: 1999-07-17
5 ;;; Author: Gilbert Baumann <unk6@rz.uni-karlsruhe.de>
6 ;;; Author: Henrik Motakef <hmot@henrik-motakef.de>
7 ;;; Author: David Lichteblau <david@lichteblau.com>
8 ;;; License: Lisp-LGPL (See file COPYING for details).
9 ;;; ---------------------------------------------------------------------------
10 ;;; (c) copyright 1999 by Gilbert Baumann
11 ;;; (c) copyright 2003 by Henrik Motakef
12 ;;; (c) copyright 2004 knowledgeTools Int. GmbH
13 ;;; (c) copyright 2004 David Lichteblau
14 ;;; (c) copyright 2005 David Lichteblau
16 ;;; This library is free software; you can redistribute it and/or
17 ;;; modify it under the terms of the GNU Library General Public
18 ;;; License as published by the Free Software Foundation; either
19 ;;; version 2 of the License, or (at your option) any later version.
21 ;;; This library is distributed in the hope that it will be useful,
22 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
23 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 ;;; Library General Public License for more details.
26 ;;; You should have received a copy of the GNU Library General Public
27 ;;; License along with this library; if not, write to the
28 ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
29 ;;; Boston, MA 02111-1307 USA.
35 ;; For reading runes, I defined my own streams, called xstreams,
36 ;; because we want to be fast. A function call or even a method call
37 ;; per character is not acceptable, instead of that we define a
38 ;; buffered stream with and advertised buffer layout, so that we
39 ;; could use the trick stdio uses: READ-RUNE and PEEK-RUNE are macros,
40 ;; directly accessing the buffer and only calling some underflow
41 ;; handler in case of stream underflows. This will yield to quite a
42 ;; performance boost vs calling READ-BYTE per character.
44 ;; Also we need to do encoding t conversion on ; this better done at large chunks of data rather than on a character
45 ;; by character basis. This way we need a dispatch on the active
46 ;; encoding only once in a while, instead of for each character. This
47 ;; allows us to use a CLOS interface to do the underflow handling.
51 ;; Now, for reading tokens, we define another kind of streams, called
52 ;; zstreams. These zstreams also maintain an input stack to implement
53 ;; inclusion of external entities. This input stack contains xstreams
54 ;; or the special marker :STOP. Such a :STOP marker indicates, that
55 ;; input should not continue there, but well stop; that is simulate an
56 ;; EOF. The user is then responsible to pop this marker off the input
59 ;; This input stack is also used to detect circular entity inclusion.
61 ;; The zstream tokenizer recognizes the following types of tokens and
62 ;; is controlled by the *DATA-BEHAVIOUR* flag. (Which should become a
63 ;; slot of zstreams instead).
66 ;; :xml-decl (<target> . <content>) ;processing-instruction starting with "<?xml"
67 ;; :pi (<target> . <content>) ;processing-instruction
68 ;; :stag (<name> . <atts>) ;start tag
69 ;; :etag (<name> . <atts>) ;end tag
70 ;; :ztag (<name> . <atts>) ;empty tag
79 ;; *data-behaviour* = :DTD
81 ;; :nmtoken <interned-rod>
87 ;; :\[ :\] :\( :\) :|\ :\> :\" :\' :\, :\? :\* :\+
89 ;; *data-behaviour* = :DOC
91 ;; :entity-ref <interned-rod>
97 ;; o provide for a faster DOM
99 ;; o morph zstream into a context object and thus also get rid of
100 ;; special variables. Put the current DTD there too.
103 ;; o the *scratch-pad* hack should become something much more
104 ;; reentrant, we could either define a system-wide resource
105 ;; or allocate some scratch-pads per context.
106 ;; [for thread-safety reasons the array are allocated per context now,
107 ;; reentrancy is still open]
109 ;; o CR handling in utf-16 deocders
113 ;; o max depth together with circle detection
114 ;; (or proof, that our circle detection is enough).
115 ;; [gemeint ist zstream-push--david]
117 ;; o better extensibility wrt character representation, one may want to
119 ;; - UCS-4 in vectoren
121 ;; o xstreams auslagern, documententieren und dann auch in SGML und
122 ;; CSS parser verwenden. (halt alles was zeichen liest).
123 ;; [ausgelagert sind sie; dokumentiert "so la la"; die Reintegration
124 ;; in Closure ist ein ganz anderes Thema]
126 ;; o recording of source locations for nodes.
128 ;; o based on the DTD and xml:space attribute implement HTML white
131 ;; o on a parser option, do not expand external entities.
133 ;;;; Validity constraints:
134 ;;;; (00) Root Element Type like (03), c.f. MAKE-ROOT-MODEL
135 ;;;; (01) Proper Declaration/PE Nesting P/MARKUP-DECL
136 ;;;; (02) Standalone Document Declaration all over the place [*]
137 ;;;; (03) Element Valid VALIDATE-*-ELEMENT, -CHARACTERS
138 ;;;; (04) Attribute Value Type VALIDATE-ATTRIBUTE
139 ;;;; (05) Unique Element Type Declaration DEFINE-ELEMENT
140 ;;;; (06) Proper Group/PE Nesting P/CSPEC
141 ;;;; (07) No Duplicate Types LEGAL-CONTENT-MODEL-P
142 ;;;; (08) ID VALIDATE-ATTRIBUTE
143 ;;;; (09) One ID per Element Type DEFINE-ATTRIBUTE
144 ;;;; (10) ID Attribute Default DEFINE-ATTRIBUTE
145 ;;;; (11) IDREF VALIDATE-ATTRIBUTE, P/DOCUMENT
146 ;;;; (12) Entity Name VALIDATE-ATTRIBUTE
147 ;;;; (13) Name Token VALIDATE-ATTRIBUTE
148 ;;;; (14) Notation Attributes VALIDATE-ATTRIBUTE, P/ATT-TYPE
149 ;;;; (15) One Notation Per Element Type DEFINE-ATTRIBUTE
150 ;;;; (16) No Notation on Empty Element DEFINE-ELEMENT, -ATTRIBUTE
151 ;;;; (17) Enumeration VALIDATE-ATTRIBUTE
152 ;;;; (18) Required Attribute PROCESS-ATTRIBUTES
153 ;;;; (19) Attribute Default Legal DEFINE-ATTRIBUTE
154 ;;;; (20) Fixed Attribute Default VALIDATE-ATTRIBUTE
155 ;;;; (21) Proper Conditional Section/PE Nesting P/CONDITIONAL-SECT, ...
156 ;;;; (22) Entity Declared [**]
157 ;;;; (23) Notation Declared P/ENTITY-DEF, P/DOCUMENT
158 ;;;; (24) Unique Notation Name DEFINE-NOTATION
160 ;;;; [*] Perhaps we could revert the explicit checks of (02), if we did
161 ;;;; _not_ read external subsets of standalone documents when parsing in
162 ;;;; validating mode. Violations of VC (02) constraints would then appear as
163 ;;;; wellformedness violations, right?
165 ;;;; [**] Although I haven't investigated this properly yet, I believe that
166 ;;;; we check this VC together with the WFC even in non-validating mode.
171 (setf (excl:named-readtable
:runes
) *readtable
*)
173 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
174 (defparameter *fast
* '(optimize (speed 3) (safety 0)))
175 ;;(defparameter *fast* '(optimize (speed 2) (safety 3)))
182 (defstruct (context (:conc-name nil
))
186 ;; xml:base machen wir fuer klacks mal gleich als expliziten stack:
188 (referenced-notations '())
189 (id-table (%make-rod-hash-table
))
190 ;; FIXME: Wofuer ist name-hashtable da? Will man das wissen?
191 (name-hashtable (make-rod-hashtable :size
2000))
193 (entity-resolver nil
)
194 (disallow-internal-subset nil
)
197 (defvar *expand-pe-p
* nil
)
199 (defparameter *initial-namespace-bindings
*
201 (#"xmlns" .
#"http://www.w3.org/2000/xmlns/")
202 (#"xml" .
#"http://www.w3.org/XML/1998/namespace")))
204 (defparameter *namespace-bindings
* *initial-namespace-bindings
*)
206 ;;;; ---------------------------------------------------------------------------
211 (defstruct (stream-name
212 (:print-function print-stream-name
))
217 (defun print-stream-name (object stream depth
)
218 (declare (ignore depth
))
219 (format stream
"[~A ~S ~A]"
220 (rod-string (stream-name-entity-name object
))
221 (stream-name-entity-kind object
)
222 (stream-name-uri object
)))
224 (deftype read-element
() 'rune
)
226 (defun call-with-open-xstream (fn stream
)
229 (close-xstream stream
)))
231 (defmacro with-open-xstream
((var value
) &body body
)
232 `(call-with-open-xstream (lambda (,var
) ,@body
) ,value
))
234 (defun call-with-open-xfile (continuation &rest open-args
)
235 (let ((input (apply #'open
(car open-args
) :element-type
'(unsigned-byte 8) (cdr open-args
))))
238 (funcall continuation
(make-xstream input
)))
241 (defmacro with-open-xfile
((stream &rest open-args
) &body body
)
242 `(call-with-open-xfile (lambda (,stream
) .
,body
) .
,open-args
))
244 ;;;; -------------------------------------------------------------------
245 ;;;; Rechnen mit Runen
248 ;; Let us first define fast fixnum arithmetric get rid of type
249 ;; checks. (After all we know what we do here).
251 (defmacro fx-op
(op &rest xs
)
252 `(the fixnum
(,op
,@(mapcar (lambda (x) `(the fixnum
,x
)) xs
))))
253 (defmacro fx-pred
(op &rest xs
)
254 `(,op
,@(mapcar (lambda (x) `(the fixnum
,x
)) xs
)))
256 (defmacro %
+ (&rest xs
) `(fx-op + ,@xs
))
257 (defmacro %-
(&rest xs
) `(fx-op -
,@xs
))
258 (defmacro %
* (&rest xs
) `(fx-op * ,@xs
))
259 (defmacro %
/ (&rest xs
) `(fx-op floor
,@xs
))
260 (defmacro %and
(&rest xs
) `(fx-op logand
,@xs
))
261 (defmacro %ior
(&rest xs
) `(fx-op logior
,@xs
))
262 (defmacro %xor
(&rest xs
) `(fx-op logxor
,@xs
))
263 (defmacro %ash
(&rest xs
) `(fx-op ash
,@xs
))
264 (defmacro %mod
(&rest xs
) `(fx-op mod
,@xs
))
266 (defmacro %
= (&rest xs
) `(fx-pred = ,@xs
))
267 (defmacro %
<= (&rest xs
) `(fx-pred <= ,@xs
))
268 (defmacro %
>= (&rest xs
) `(fx-pred >= ,@xs
))
269 (defmacro %
< (&rest xs
) `(fx-pred < ,@xs
))
270 (defmacro %
> (&rest xs
) `(fx-pred > ,@xs
))
272 ;;; XXX Geschwindigkeit dieser Definitionen untersuchen!
274 (defmacro rune-op
(op &rest xs
)
275 `(code-rune (,op
,@(mapcar (lambda (x) `(rune-code ,x
)) xs
))))
276 (defmacro rune-pred
(op &rest xs
)
277 `(,op
,@(mapcar (lambda (x) `(rune-code ,x
)) xs
)))
279 (defmacro %rune
+ (&rest xs
) `(rune-op + ,@xs
))
280 (defmacro %rune-
(&rest xs
) `(rune-op -
,@xs
))
281 (defmacro %rune
* (&rest xs
) `(rune-op * ,@xs
))
282 (defmacro %rune
/ (&rest xs
) `(rune-op floor
,@xs
))
283 (defmacro %rune-and
(&rest xs
) `(rune-op logand
,@xs
))
284 (defmacro %rune-ior
(&rest xs
) `(rune-op logior
,@xs
))
285 (defmacro %rune-xor
(&rest xs
) `(rune-op logxor
,@xs
))
286 (defmacro %rune-ash
(a b
) `(code-rune (ash (rune-code ,a
) ,b
)))
287 (defmacro %rune-mod
(&rest xs
) `(rune-op mod
,@xs
))
289 (defmacro %rune
= (&rest xs
) `(rune-pred = ,@xs
))
290 (defmacro %rune
<= (&rest xs
) `(rune-pred <= ,@xs
))
291 (defmacro %rune
>= (&rest xs
) `(rune-pred >= ,@xs
))
292 (defmacro %rune
< (&rest xs
) `(rune-pred < ,@xs
))
293 (defmacro %rune
> (&rest xs
) `(rune-pred > ,@xs
))
295 ;;;; ---------------------------------------------------------------------------
299 ;;; make-rod-hashtable
300 ;;; rod-hash-get hashtable rod &optional start end -> value ; successp
301 ;;; (setf (rod-hash-get hashtable rod &optional start end) new-value
304 (defstruct (rod-hashtable (:constructor make-rod-hashtable
/low
))
309 (defun make-rod-hashtable (&key
(size 200))
310 (setf size
(nearest-greater-prime size
))
311 (make-rod-hashtable/low
313 :table
(make-array size
:initial-element nil
)))
315 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
316 (defconstant +fixnum-bits
+
317 (1- (integer-length most-positive-fixnum
))
318 "Pessimistic approximation of the number of bits of fixnums.")
320 (defconstant +fixnum-mask
+
321 (1- (expt 2 +fixnum-bits
+))
322 "Pessimistic approximation of the largest bit-mask, still being a fixnum."))
324 (definline stir
(a b
)
326 (%xor
(%ior
(%ash
(%and a
#.
(ash +fixnum-mask
+ -
5)) 5)
327 (%ash a
#.
(- 5 +fixnum-bits
+)))
330 (definline rod-hash
(rod start end
)
331 "Compute a hash code out of a rod."
332 (let ((res (%- end start
)))
333 (do ((i start
(%
+ i
1)))
335 (declare (type fixnum i
))
336 (setf res
(stir res
(rune-code (%rune rod i
)))))
339 (definline rod
=* (x y
&key
(start1 0) (end1 (length x
))
340 (start2 0) (end2 (length y
)))
341 (and (%
= (%- end1 start1
) (%- end2 start2
))
342 (do ((i start1
(%
+ i
1))
346 (unless (rune= (%rune x i
) (%rune y j
))
349 (definline rod
=** (x y start1 end1 start2 end2
)
350 (and (%
= (%- end1 start1
) (%- end2 start2
))
351 (do ((i start1
(%
+ i
1))
355 (unless (rune= (%rune x i
) (%rune y j
))
358 (defun rod-hash-get (hashtable rod
&optional
(start 0) (end (length rod
)))
359 (declare (type (simple-array rune
(*)) rod
))
360 (let ((j (%mod
(rod-hash rod start end
)
361 (rod-hashtable-size hashtable
))))
362 (dolist (q (svref (rod-hashtable-table hashtable
) j
)
363 (values nil nil nil
))
364 (declare (type cons q
))
365 (when (rod=** (car q
) rod
0 (length (the (simple-array rune
(*)) (car q
))) start end
)
366 (return (values (cdr q
) t
(car q
)))))))
368 (defun rod-hash-set (new-value hashtable rod
&optional
(start 0) (end (length rod
)))
369 (let ((j (%mod
(rod-hash rod start end
)
370 (rod-hashtable-size hashtable
)))
372 (dolist (q (svref (rod-hashtable-table hashtable
) j
)
374 (setf key
(rod-subseq* rod start end
))
375 (push (cons key new-value
)
376 (aref (rod-hashtable-table hashtable
) j
))))
377 (when (rod=* (car q
) rod
:start2 start
:end2 end
)
379 (setf (cdr q
) new-value
)
381 (values new-value key
)))
384 (defun rod-subseq* (source start
&optional
(end (length source
)))
385 (unless (and (typep start
'fixnum
) (>= start
0))
386 (error "~S is not a non-negative fixnum." start
))
387 (unless (and (typep end
'fixnum
) (>= end start
))
388 (error "END argument, ~S, is not a fixnum no less than START, ~S." end start
))
389 (when (> start
(length source
))
390 (error "START argument, ~S, should be no greater than length of rod." start
))
391 (when (> end
(length source
))
392 (error "END argument, ~S, should be no greater than length of rod." end
))
394 (declare (type fixnum start end
))
395 (let ((res (make-rod (- end start
))))
396 (declare (type rod res
))
397 (do ((i (- (- end start
) 1) (the fixnum
(- i
1))))
399 (declare (type fixnum i
))
400 (setf (%rune res i
) (aref source
(the fixnum
(+ i start
))))))))
403 (defun rod-subseq* (source start
&optional
(end (length source
)))
404 (subseq source start end
))
406 (deftype ufixnum
() `(unsigned-byte ,(integer-length most-positive-fixnum
)))
409 (defun rod-subseq** (source start
&optional
(end (length source
)))
410 (declare (type (simple-array rune
(*)) source
)
413 (optimize (speed 3) (safety 0)))
414 (let ((res (make-array (%- end start
) :element-type
'rune
)))
415 (declare (type (simple-array rune
(*)) res
))
416 (let ((i (%- end start
)))
417 (declare (type ufixnum i
))
422 (setf (%rune res i
) (%rune source
(the ufixnum
(+ i start
))))))
426 (defun rod-subseq** (source start
&optional
(end (length source
)))
427 (subseq source start end
))
429 (defun (setf rod-hash-get
) (new-value hashtable rod
&optional
(start 0) (end (length rod
)))
430 (rod-hash-set new-value hashtable rod start end
))
432 (defun intern-name (rod &optional
(start 0) (end (length rod
)))
433 (multiple-value-bind (value successp key
) (rod-hash-get (name-hashtable *ctx
*) rod start end
)
434 (declare (ignore value
))
437 (nth-value 1 (rod-hash-set t
(name-hashtable *ctx
*) rod start end
)))))
439 ;;;; ---------------------------------------------------------------------------
444 (defvar *scratch-pad
*)
445 (defvar *scratch-pad-2
*)
446 (defvar *scratch-pad-3
*)
447 (defvar *scratch-pad-4
*)
449 (declaim (type (simple-array rune
(*))
450 *scratch-pad
* *scratch-pad-2
* *scratch-pad-3
* *scratch-pad-4
*))
452 (defmacro with-scratch-pads
((&optional
) &body body
)
453 `(let ((*scratch-pad
* (make-array 1024 :element-type
'rune
))
454 (*scratch-pad-2
* (make-array 1024 :element-type
'rune
))
455 (*scratch-pad-3
* (make-array 1024 :element-type
'rune
))
456 (*scratch-pad-4
* (make-array 1024 :element-type
'rune
)))
459 (defmacro %put-unicode-char
(code-var put
)
461 (cond #+rune-is-utf-16
462 ((%
> ,code-var
#xFFFF
)
463 (,put
(the rune
(code-rune (%
+ #xD7C0
(%ash
,code-var -
10)))))
464 (,put
(the rune
(code-rune (%ior
#xDC00
(%and
,code-var
#x03FF
))))))
466 (,put
(code-rune ,code-var
))))))
468 (defun adjust-array-by-copying (old-array new-size
)
469 "Adjust an array by copying and thus ensures, that result is a SIMPLE-ARRAY."
470 (let ((res (make-array new-size
:element-type
(array-element-type old-array
))))
471 (replace res old-array
472 :start1
0 :end1
(length old-array
)
473 :start2
0 :end2
(length old-array
))
476 (defmacro with-rune-collector-aux
(scratch collect body mode
)
481 `(let ((,n
(length ,scratch
))
484 (declare (type fixnum
,n
,i
))
490 (when (%
>= ,',i
,',n
)
491 (setf ,',n
(* 2 ,',n
))
494 (adjust-array-by-copying ,',scratch
,',n
))))
495 (setf (aref (the (simple-array rune
(*)) ,',b
) ,',i
) x
)
501 `(intern-name ,b
0 ,i
))
503 `(let ((,rod
(make-rod ,i
)))
504 (while (not (%
= ,i
0))
506 (setf (%rune
,rod
,i
)
507 (aref (the (simple-array rune
(*)) ,b
) ,i
)))
513 '(defmacro with-rune-collector-aux
(scratch collect body mode
)
518 `(let ((,n
(length ,scratch
))
520 (declare (type fixnum
,n
,i
))
526 (when (%
>= ,',i
,',n
)
527 (setf ,',n
(* 2 ,',n
))
530 (adjust-array-by-copying ,',scratch
,',n
))))
531 (setf (aref (the (simple-array rune
(*)) ,',scratch
) ,',i
) x
)
537 `(intern-name ,scratch
0 ,i
))
539 `(let ((,rod
(make-rod ,i
)))
542 (setf (%rune
,rod
,i
)
543 (aref (the (simple-array rune
(*)) ,scratch
) ,i
)))
546 `(values ,scratch
0 ,i
))
549 (defmacro with-rune-collector
((collect) &body body
)
550 `(with-rune-collector-aux *scratch-pad
* ,collect
,body
:copy
))
552 (defmacro with-rune-collector-2
((collect) &body body
)
553 `(with-rune-collector-aux *scratch-pad-2
* ,collect
,body
:copy
))
555 (defmacro with-rune-collector-3
((collect) &body body
)
556 `(with-rune-collector-aux *scratch-pad-3
* ,collect
,body
:copy
))
558 (defmacro with-rune-collector-4
((collect) &body body
)
559 `(with-rune-collector-aux *scratch-pad-4
* ,collect
,body
:copy
))
561 (defmacro with-rune-collector
/intern
((collect) &body body
)
562 `(with-rune-collector-aux *scratch-pad
* ,collect
,body
:intern
))
564 (defmacro with-rune-collector
/raw
((collect) &body body
)
565 `(with-rune-collector-aux *scratch-pad
* ,collect
,body
:raw
))
568 (defmacro while-reading-runes
((reader stream-in
) &rest body
)
569 ;; Thou shalt not leave body via a non local exit
570 (let ((stream (make-symbol "STREAM"))
571 (rptr (make-symbol "RPTR"))
572 (fptr (make-symbol "FPTR"))
573 (buf (make-symbol "BUF")) )
574 `(let* ((,stream
,stream-in
)
575 (,rptr
(xstream-read-ptr ,stream
))
576 (,fptr
(xstream-fill-ptr ,stream
))
577 (,buf
(xstream-buffer ,stream
)))
578 (declare (type fixnum
,rptr
,fptr
)
579 (type xstream
,stream
))
580 (macrolet ((,reader
(res-var)
581 `(cond ((%
= ,',rptr
,',fptr
)
582 (setf (xstream-read-ptr ,',stream
) ,',rptr
)
583 (setf ,res-var
(xstream-underflow ,',stream
))
584 (setf ,',rptr
(xstream-read-ptr ,',stream
))
585 (setf ,',fptr
(xstream-fill-ptr ,',stream
))
586 (setf ,',buf
(xstream-buffer ,',stream
)))
589 (aref (the (simple-array read-element
(*)) ,',buf
)
590 (the fixnum
,',rptr
)))
591 (setf ,',rptr
(%
+ ,',rptr
1))))))
594 (setf (xstream-read-ptr ,stream
) ,rptr
) )))))
597 ;;;; ---------------------------------------------------------------------------
601 (define-condition xml-parse-error
(simple-error) ()
603 "Superclass of all conditions signalled by the CXML parser."))
605 (define-condition well-formedness-violation
(xml-parse-error) ()
607 "This condition is signalled for all well-formedness violations.
609 Note for validating mode: Sometimes violations of well-formedness are
610 first detected as validity errors by the parser and signalled as
611 instances of @class{validity-error} rather
612 than well-formedness-violation."))
614 (define-condition validity-error
(xml-parse-error) ()
616 "Reports the violation of a validity constraint."))
618 ;; We make some effort to signal end of file as a special condition, but we
619 ;; don't actually try very hard. Not sure whether we should. Right now I
620 ;; would prefer not to document this class.
621 (define-condition end-of-xstream
(well-formedness-violation) ())
623 (defun describe-xstream (x s
)
624 (format s
" Line ~D, column ~D in ~A~%"
625 (xstream-line-number x
)
626 (xstream-column-number x
)
627 (let ((name (xstream-name x
)))
630 "<anonymous stream>")
631 ((eq :main
(stream-name-entity-kind name
))
632 (stream-name-uri name
))
636 (defun %error
(class stream message
)
637 (let* ((zmain (if *ctx
* (main-zstream *ctx
*) nil
))
638 (zstream (if (zstream-p stream
) stream zmain
))
639 (xstream (if (xstream-p stream
) stream nil
))
640 (s (make-string-output-stream)))
641 (write-line message s
)
643 (write-line "Location:" s
)
644 (describe-xstream xstream s
))
647 (remove xstream
(remove :stop
(zstream-input-stack zstream
)))))
649 (write-line "Context:" s
)
651 (describe-xstream x s
)))))
652 (when (and zmain
(not (eq zstream zmain
)))
654 (remove xstream
(remove :stop
(zstream-input-stack zmain
)))))
656 (write-line "Context in main document:" s
)
658 (describe-xstream x s
)))))
661 :format-arguments
(list (get-output-stream-string s
)))))
663 (defun validity-error (fmt &rest args
)
664 (%error
'validity-error
666 (format nil
"Document not valid: ~?" fmt args
)))
668 (defun wf-error (stream fmt
&rest args
)
669 (%error
'well-formedness-violation
671 (format nil
"Document not well-formed: ~?" fmt args
)))
673 (defun eox (stream &optional x
&rest args
)
674 (%error
'end-of-xstream
676 (format nil
"End of file~@[: ~?~]" x args
)))
678 (defclass cxml-parser
(sax:sax-parser
) ((ctx :initarg
:ctx
)))
680 (defun parser-xstream (parser)
681 (car (zstream-input-stack (main-zstream (slot-value parser
'ctx
)))))
683 (defun parser-stream-name (parser)
684 (let ((xstream (parser-xstream parser
)))
686 (xstream-name xstream
)
689 (defmethod sax:line-number
((parser cxml-parser
))
690 (let ((x (parser-xstream parser
)))
692 (xstream-line-number x
)
695 (defmethod sax:column-number
((parser cxml-parser
))
696 (let ((x (parser-xstream parser
)))
698 (xstream-column-number x
)
701 (defmethod sax:system-id
((parser cxml-parser
))
702 (let ((name (parser-stream-name parser
)))
704 (stream-name-uri name
)
707 (defmethod sax:xml-base
((parser cxml-parser
))
708 (let ((uri (car (base-stack (slot-value parser
'ctx
)))))
709 (if (or (null uri
) (stringp uri
))
711 (puri:render-uri uri nil
))))
713 (defvar *validate
* t
)
714 (defvar *external-subset-p
* nil
)
716 (defun validate-start-element (ctx name
)
718 (let* ((pair (car (model-stack ctx
)))
719 (newval (funcall (car pair
) name
)))
721 (validity-error "(03) Element Valid: ~A" (rod-string name
)))
722 (setf (car pair
) newval
)
723 (let ((e (find-element name
(dtd ctx
))))
725 (validity-error "(03) Element Valid: no definition for ~A"
727 (maybe-compile-cspec e
)
728 (push (copy-cons (elmdef-compiled-cspec e
)) (model-stack ctx
))))))
731 (cons (car x
) (cdr x
)))
733 (defun validate-end-element (ctx name
)
735 (let ((pair (car (model-stack ctx
))))
736 (unless (eq (funcall (car pair
) nil
) t
)
737 (validity-error "(03) Element Valid: ~A" (rod-string name
)))
738 (pop (model-stack ctx
)))))
740 (defun validate-characters (ctx rod
)
742 (let ((pair (car (model-stack ctx
))))
743 (unless (funcall (cdr pair
) rod
)
744 (validity-error "(03) Element Valid: unexpected PCDATA")))))
746 (defun standalone-check-necessary-p (def)
750 (elmdef (elmdef-external-p def
))
751 (attdef (attdef-external-p def
)))))
753 ;; attribute validation, defaulting, and normalization -- except for for
754 ;; uniqueness checks, which are done after namespaces have been declared
755 (defun process-attributes (ctx name attlist
)
756 (let ((e (find-element name
(dtd ctx
))))
759 (dolist (ad (elmdef-attributes e
)) ;handle default values
760 (unless (get-attribute (attdef-name ad
) attlist
)
761 (case (attdef-default ad
)
765 (validity-error "(18) Required Attribute: ~S not specified"
766 (rod-string (attdef-name ad
)))))
768 (when (standalone-check-necessary-p ad
)
769 (validity-error "(02) Standalone Document Declaration: missing attribute value"))
770 (push (sax:make-attribute
:qname
(attdef-name ad
)
771 :value
(cadr (attdef-default ad
))
774 (dolist (a attlist
) ;normalize non-CDATA values
775 (let* ((qname (sax:attribute-qname a
))
776 (adef (find-attribute e qname
)))
778 (when (and *validate
*
779 sax
:*namespace-processing
*
780 (eq (attdef-type adef
) :ID
)
781 (find #/: (sax:attribute-value a
)))
782 (validity-error "colon in ID attribute"))
783 (unless (eq (attdef-type adef
) :CDATA
)
784 (let ((canon (canon-not-cdata-attval (sax:attribute-value a
))))
785 (when (and (standalone-check-necessary-p adef
)
786 (not (rod= (sax:attribute-value a
) canon
)))
787 (validity-error "(02) Standalone Document Declaration: attribute value not normalized"))
788 (setf (sax:attribute-value a
) canon
))))))
789 (when *validate
* ;maybe validate attribute values
791 (validate-attribute ctx e a
))))
792 ((and *validate
* attlist
)
793 (validity-error "(04) Attribute Value Type: no definition for element ~A"
794 (rod-string name
)))))
797 (defun get-attribute (name attributes
)
798 (member name attributes
:key
#'sax
:attribute-qname
:test
#'rod
=))
800 (defun validate-attribute (ctx e a
)
801 (when (sax:attribute-specified-p a
) ;defaults checked by DEFINE-ATTRIBUTE
802 (let* ((qname (sax:attribute-qname a
))
804 (or (find-attribute e qname
)
805 (validity-error "(04) Attribute Value Type: not declared: ~A"
806 (rod-string qname
)))))
807 (validate-attribute* ctx adef
(sax:attribute-value a
)))))
809 (defun validate-attribute* (ctx adef value
)
810 (let ((type (attdef-type adef
))
811 (default (attdef-default adef
)))
812 (when (and (listp default
)
813 (eq (car default
) :FIXED
)
814 (not (rod= value
(cadr default
))))
815 (validity-error "(20) Fixed Attribute Default: expected ~S but got ~S"
816 (rod-string (cadr default
))
818 (ecase (if (listp type
) (car type
) type
)
820 (unless (valid-name-p value
)
821 (validity-error "(08) ID: not a name: ~S" (rod-string value
)))
822 (when (eq (gethash value
(id-table ctx
)) t
)
823 (validity-error "(08) ID: ~S not unique" (rod-string value
)))
824 (setf (gethash value
(id-table ctx
)) t
))
826 (validate-idref ctx value
))
828 (let ((names (split-names value
)))
830 (validity-error "(11) IDREF: malformed names"))
831 (mapc (curry #'validate-idref ctx
) names
)))
833 (validate-nmtoken value
))
835 (let ((tokens (split-names value
)))
837 (validity-error "(13) Name Token: malformed NMTOKENS"))
838 (mapc #'validate-nmtoken tokens
)))
840 (unless (member value
(cdr type
) :test
#'rod
=)
841 (validity-error "(17) Enumeration: value not declared: ~S"
842 (rod-string value
))))
844 (unless (member value
(cdr type
) :test
#'rod
=)
845 (validity-error "(14) Notation Attributes: ~S" (rod-string value
))))
847 (validate-entity value
))
849 (let ((names (split-names value
)))
851 (validity-error "(13) Name Token: malformed NMTOKENS"))
852 (mapc #'validate-entity names
)))
855 (defun validate-idref (ctx value
)
856 (unless (valid-name-p value
)
857 (validity-error "(11) IDREF: not a name: ~S" (rod-string value
)))
858 (unless (gethash value
(id-table ctx
))
859 (setf (gethash value
(id-table ctx
)) nil
)))
861 (defun validate-nmtoken (value)
862 (unless (valid-nmtoken-p value
)
863 (validity-error "(13) Name Token: not a NMTOKEN: ~S"
864 (rod-string value
))))
866 (defstruct (entdef (:constructor
)))
868 (defstruct (internal-entdef
870 (:constructor make-internal-entdef
(value))
871 (:conc-name
#:entdef-
))
872 (value (error "missing argument") :type rod
)
874 (external-subset-p *external-subset-p
*))
876 (defstruct (external-entdef
878 (:constructor make-external-entdef
(extid ndata
))
879 (:conc-name
#:entdef-
))
880 (extid (error "missing argument") :type extid
)
881 (ndata nil
:type
(or rod null
)))
883 (defun validate-entity (value)
884 (unless (valid-name-p value
)
885 (validity-error "(12) Entity Name: not a name: ~S" (rod-string value
)))
886 (let ((def (let ((*validate
*
887 ;; Similarly the entity refs are internal and
888 ;; don't need normalization ... the unparsed
889 ;; entities (and entities) aren't "references"
890 ;; -- sun/valid/sa03.xml
892 (get-entity-definition value
:general
(dtd *ctx
*)))))
893 (unless (and (typep def
'external-entdef
) (entdef-ndata def
))
895 (validity-error "(12) Entity Name: ~S" (rod-string value
)))))
897 (defun split-names (rod)
898 (flet ((whitespacep (x)
899 (or (rune= x
#/U
+0009)
902 (rune= x
#/U
+0020))))
903 (if (let ((n (length rod
)))
905 (or (whitespacep (rune rod
0))
906 (whitespacep (rune rod
(1- n
))))))
908 (split-sequence-if #'whitespacep rod
:remove-empty-subseqs t
))))
910 (defun zstream-base-sysid (zstream)
912 (dolist (k (zstream-input-stack zstream
))
913 (let ((base-sysid (stream-name-uri (xstream-name k
))))
914 (when base-sysid
(return base-sysid
))))))
917 (defun absolute-uri (sysid source-stream
)
918 (let ((base-sysid (zstream-base-sysid source-stream
)))
919 ;; XXX is the IF correct?
921 (puri:merge-uris sysid base-sysid
)
924 (defstruct (extid (:constructor make-extid
(public system
)))
925 (public nil
:type
(or rod null
))
926 (system (error "missing argument") :type
(or puri
:uri null
)))
928 (setf (documentation 'extid
'type
)
929 "Represents an External ID, consisting of a Public ID and a System ID.
931 @see-constructor{make-extiid}
932 @see-slot{exitid-system}
933 @see-slot{exitid-public}")
935 (setf (documentation #'make-extid
'function
)
936 "@arg[publicid]{string or nil}
937 @arg[systemid]{@class{puri:uri} or nil}
938 @return{an instance of @class{extid}}
940 Create an object representing the External ID composed
941 of the specified Public ID and System ID.")
943 (setf (documentation #'extid-public
'function
)
944 "@arg[extid]{A @class{extid}}
945 @return[publicid]{string or nil}
947 Returns the Public ID part of this External ID.")
949 (setf (documentation #'extid-system
'function
)
950 "@arg[extid]{A @class{extid}}
951 @return[sytemid]{puri:uri or nil}
953 Returns the System ID part of this External ID.")
955 (defun absolute-extid (source-stream extid
)
956 (let ((sysid (extid-system extid
))
957 (result (copy-extid extid
)))
958 (setf (extid-system result
) (absolute-uri sysid source-stream
))
961 (defun define-entity (source-stream name kind def
)
962 (setf name
(intern-name name
))
963 (when (and sax
:*namespace-processing
* (find #/: name
))
964 (wf-error source-stream
"colon in entity name"))
967 (:general
(dtd-gentities (dtd *ctx
*)))
968 (:parameter
(dtd-pentities (dtd *ctx
*))))))
969 (unless (gethash name table
)
970 (when (and source-stream
(handler *ctx
*))
971 (report-entity (handler *ctx
*) kind name def
))
972 (when (typep def
'external-entdef
)
973 (setf (entdef-extid def
)
974 (absolute-extid source-stream
(entdef-extid def
))))
975 (setf (gethash name table
)
976 (cons *external-subset-p
* def
)))))
978 (defun get-entity-definition (entity-name kind dtd
)
980 (wf-error nil
"entity not defined: ~A" (rod-string entity-name
)))
981 (destructuring-bind (extp &rest def
)
984 (:general
(dtd-gentities dtd
))
985 (:parameter
(dtd-pentities dtd
)))
987 (when (and *validate
* (standalone-p *ctx
*) extp
)
988 (validity-error "(02) Standalone Document Declaration: entity reference: ~S"
989 (rod-string entity-name
)))
992 (defun entity->xstream
(zstream entity-name kind
&optional internalp
)
993 ;; `zstream' is for error messages
994 (let ((def (get-entity-definition entity-name kind
(dtd *ctx
*))))
996 (wf-error zstream
"Entity '~A' is not defined." (rod-string entity-name
)))
1000 (when (and (standalone-p *ctx
*)
1001 (entdef-external-subset-p def
))
1004 "entity declared in external subset, but document is standalone"))
1005 (setf r
(make-rod-xstream (entdef-value def
)))
1006 (setf (xstream-name r
)
1007 (make-stream-name :entity-name entity-name
1013 "entity not internal: ~A" (rod-string entity-name
)))
1014 (when (entdef-ndata def
)
1016 "reference to unparsed entity: ~A"
1017 (rod-string entity-name
)))
1018 (setf r
(xstream-open-extid (extid-using-catalog (entdef-extid def
))))
1019 (setf (stream-name-entity-name (xstream-name r
)) entity-name
1020 (stream-name-entity-kind (xstream-name r
)) kind
)))
1023 (defun checked-get-entdef (name type
)
1024 (let ((def (get-entity-definition name type
(dtd *ctx
*))))
1026 (wf-error nil
"Entity '~A' is not defined." (rod-string name
)))
1029 (defun xstream-open-extid* (entity-resolver pubid sysid
)
1031 (or (funcall (or entity-resolver
(constantly nil
)) pubid sysid
)
1032 (open (uri-to-pathname sysid
)
1033 :element-type
'(unsigned-byte 8)
1034 :direction
:input
))))
1035 (make-xstream stream
1036 :name
(make-stream-name :uri sysid
)
1039 (defun xstream-open-extid (extid)
1040 (xstream-open-extid* (entity-resolver *ctx
*)
1041 (extid-public extid
)
1042 (extid-system extid
)))
1044 (defun call-with-entity-expansion-as-stream (zstream cont name kind internalp
)
1045 ;; `zstream' is for error messages
1046 (let ((in (entity->xstream zstream name kind internalp
)))
1047 (push (stream-name-uri (xstream-name in
)) (base-stack *ctx
*))
1050 (pop (base-stack *ctx
*))
1051 (close-xstream in
))))
1053 (defun ensure-dtd ()
1055 (setf (dtd *ctx
*) (make-dtd))
1056 (define-default-entities)))
1058 (defun define-default-entities ()
1059 (define-entity nil
#"lt" :general
(make-internal-entdef #"<"))
1060 (define-entity nil
#"gt" :general
(make-internal-entdef #">"))
1061 (define-entity nil
#"amp" :general
(make-internal-entdef #"&"))
1062 (define-entity nil
#"apos" :general
(make-internal-entdef #"'"))
1063 (define-entity nil
#"quot" :general
(make-internal-entdef #"\"")))
1066 ;; an attribute definition
1067 element
;name of element this attribute belongs to
1068 name
;name of attribute
1069 type
;type of attribute; either one of :CDATA, :ID, :IDREF, :IDREFS,
1070 ; :ENTITY, :ENTITIES, :NMTOKEN, :NMTOKENS, or
1071 ; (:NOTATION <name>*)
1072 ; (:ENUMERATION <name>*)
1073 default
;default value of attribute:
1074 ; :REQUIRED, :IMPLIED, (:FIXED content) or (:DEFAULT content)
1075 (external-p *external-subset-p
*)
1079 ;; an element definition
1080 name
;name of the element
1081 content
;content model [*]
1082 attributes
;list of defined attributes
1083 compiled-cspec
;cons of validation function for contentspec
1084 (external-p *external-subset-p
*)
1087 ;; [*] in XML it is possible to define attributes before the element
1088 ;; itself is defined and since we hang attribute definitions into the
1089 ;; relevant element definitions, the `content' slot indicates whether an
1090 ;; element was actually defined. It is NIL until set to a content model
1091 ;; when the element type declaration is processed.
1093 (defun %make-rod-hash-table
()
1094 ;; XXX with portable hash tables, this is the only way to case-sensitively
1095 ;; use rods. However, EQUALP often has horrible performance! Most Lisps
1096 ;; provide extensions for user-defined equality, we should use them! There
1097 ;; is also a home-made hash table for rods defined below, written by
1098 ;; Gilbert (I think). We could also use that one, but I would prefer the
1099 ;; first method, even if it's unportable.
1100 (make-hash-table :test
1101 #+rune-is-character
'equal
1102 #-rune-is-character
'equalp
))
1105 (elements (%make-rod-hash-table
)) ;elmdefs
1106 (gentities (%make-rod-hash-table
)) ;general entities
1107 (pentities (%make-rod-hash-table
)) ;parameter entities
1108 (notations (%make-rod-hash-table
)))
1110 (defun make-dtd-cache ()
1111 (puri:make-uri-space
))
1113 (defvar *cache-all-dtds
* nil
)
1114 (defvar *dtd-cache
* (make-dtd-cache))
1116 (defun remdtd (uri dtd-cache
)
1117 (setf uri
(puri:intern-uri uri dtd-cache
))
1119 (and (getf (puri:uri-plist uri
) 'dtd
) t
)
1120 (puri:unintern-uri uri dtd-cache
)))
1122 (defun clear-dtd-cache (dtd-cache)
1123 (puri:unintern-uri t dtd-cache
))
1125 (defun getdtd (uri dtd-cache
)
1126 (getf (puri:uri-plist
(puri:intern-uri uri dtd-cache
)) 'dtd
))
1128 (defun (setf getdtd
) (newval uri dtd-cache
)
1129 (setf (getf (puri:uri-plist
(puri:intern-uri uri dtd-cache
)) 'dtd
) newval
)
1135 (defun find-element (name dtd
)
1136 (gethash name
(dtd-elements dtd
)))
1138 (defun define-element (dtd element-name
&optional content-model
)
1139 (let ((e (find-element element-name dtd
)))
1143 (setf (gethash element-name
(dtd-elements dtd
))
1144 (make-elmdef :name element-name
:content content-model
))
1146 (sax:element-declaration
(handler *ctx
*) element-name content-model
))))
1147 ((null content-model
)
1151 (when (elmdef-content e
)
1152 (validity-error "(05) Unique Element Type Declaration"))
1153 (when (eq content-model
:EMPTY
)
1154 (dolist (ad (elmdef-attributes e
))
1155 (let ((type (attdef-type ad
)))
1156 (when (and (listp type
) (eq (car type
) :NOTATION
))
1157 (validity-error "(16) No Notation on Empty Element: ~S"
1158 (rod-string element-name
)))))))
1159 (sax:element-declaration
(handler *ctx
*) element-name content-model
)
1160 (setf (elmdef-content e
) content-model
)
1161 (setf (elmdef-external-p e
) *external-subset-p
*)
1164 (defvar *redefinition-warning
* nil
)
1166 (defun define-attribute (dtd element name type default
)
1167 (let ((adef (make-attdef :element element
1171 (e (or (find-element element dtd
)
1172 (define-element dtd element
))))
1173 (when (and *validate
* (listp default
))
1174 (unless (eq (attdef-type adef
) :CDATA
)
1175 (setf (second default
) (canon-not-cdata-attval (second default
))))
1176 (validate-attribute* *ctx
* adef
(second default
)))
1177 (cond ((find-attribute e name
)
1178 (when *redefinition-warning
*
1179 (warn "Attribute \"~A\" of \"~A\" not redefined."
1181 (rod-string element
))))
1185 (when (find :ID
(elmdef-attributes e
) :key
#'attdef-type
)
1186 (validity-error "(09) One ID per Element Type: element ~A"
1187 (rod-string element
)))
1188 (unless (member default
'(:REQUIRED
:IMPLIED
))
1189 (validity-error "(10) ID Attribute Default: ~A"
1190 (rod-string element
))))
1191 (flet ((notationp (type)
1192 (and (listp type
) (eq (car type
) :NOTATION
))))
1193 (when (notationp type
)
1194 (when (find-if #'notationp
(elmdef-attributes e
)
1196 (validity-error "(15) One Notation Per Element Type: ~S"
1197 (rod-string element
)))
1198 (when (eq (elmdef-content e
) :EMPTY
)
1199 (validity-error "(16) No Notation on Empty Element: ~S"
1200 (rod-string element
))))))
1201 (sax:attribute-declaration
(handler *ctx
*) element name type default
)
1202 (push adef
(elmdef-attributes e
))))))
1204 (defun find-attribute (elmdef name
)
1205 (find name
(elmdef-attributes elmdef
) :key
#'attdef-name
:test
#'rod
=))
1207 (defun define-notation (dtd name id
)
1208 (let ((ns (dtd-notations dtd
)))
1209 (when (gethash name ns
)
1210 (validity-error "(24) Unique Notation Name: ~S" (rod-string name
)))
1211 (setf (gethash name ns
) id
)))
1213 (defun find-notation (name dtd
)
1214 (gethash name
(dtd-notations dtd
)))
1216 ;;;; ---------------------------------------------------------------------------
1217 ;;;; z streams and lexer
1225 (defun call-with-zstream (fn zstream
)
1227 (funcall fn zstream
)
1228 (dolist (input (zstream-input-stack zstream
))
1229 (cond #-x
&y-streams-are-stream
1231 (close-xstream input
))
1232 #+x
&y-streams-are-stream
1236 (defmacro with-zstream
((zstream &rest args
) &body body
)
1237 `(call-with-zstream (lambda (,zstream
) ,@body
)
1238 (make-zstream ,@args
)))
1240 (defun read-token (input)
1241 (cond ((zstream-token-category input
)
1242 (multiple-value-prog1
1243 (values (zstream-token-category input
)
1244 (zstream-token-semantic input
))
1245 (setf (zstream-token-category input
) nil
1246 (zstream-token-semantic input
) nil
)))
1248 (read-token-2 input
))))
1250 (defun peek-token (input)
1251 (cond ((zstream-token-category input
)
1253 (zstream-token-category input
)
1254 (zstream-token-semantic input
)))
1256 (multiple-value-bind (c s
) (read-token input
)
1257 (setf (zstream-token-category input
) c
1258 (zstream-token-semantic input
) s
))
1259 (values (zstream-token-category input
)
1260 (zstream-token-semantic input
)))))
1262 (defun read-token-2 (input)
1263 (cond ((null (zstream-input-stack input
))
1266 (let ((c (peek-rune (car (zstream-input-stack input
)))))
1268 (cond ((eq (cadr (zstream-input-stack input
)) :stop
)
1271 (close-xstream (pop (zstream-input-stack input
)))
1272 (if (null (zstream-input-stack input
))
1274 (values :S nil
) ;fake #x20 after PE expansion
1277 (read-token-3 input
)))))))
1279 (defvar *data-behaviour
*
1280 ) ;either :DTD or :DOC
1282 (defun read-token-3 (zinput)
1283 (let ((input (car (zstream-input-stack zinput
))))
1285 (let ((c (read-rune input
)))
1287 ;; first the common tokens
1289 (read-token-after-|
<| zinput input
))
1292 (ecase *data-behaviour
*
1294 (cond ((rune= #/\
[ c
) :\
[)
1295 ((rune= #/\
] c
) :\
])
1296 ((rune= #/\
( c
) :\
()
1297 ((rune= #/\
) c
) :\
))
1298 ((rune= #/\| c
) :\|
)
1299 ((rune= #/\
> c
) :\
>)
1300 ((rune= #/\" c
) :\")
1301 ((rune= #/\' c
) :\')
1302 ((rune= #/\
, c
) :\
,)
1303 ((rune= #/\? c
) :\?)
1304 ((rune= #/\
* c
) :\
*)
1305 ((rune= #/\
+ c
) :\
+)
1307 (unread-rune c input
)
1308 (values :nmtoken
(read-name-token input
)))
1310 (let ((q (read-name-token input
)))
1311 (cond ((rod= q
'#.
(string-rod "REQUIRED")) :|
#REQUIRED|
)
1312 ((rod= q
'#.
(string-rod "IMPLIED")) :|
#IMPLIED|
)
1313 ((rod= q
'#.
(string-rod "FIXED")) :|
#FIXED|
)
1314 ((rod= q
'#.
(string-rod "PCDATA")) :|
#PCDATA|
)
1316 (wf-error zinput
"Unknown token: ~S." q
)))))
1317 ((or (rune= c
#/U
+0020)
1323 (cond ((name-start-rune-p (peek-rune input
))
1324 ;; an entity reference
1325 (read-pe-reference zinput
))
1329 (wf-error zinput
"Unexpected character ~S." c
))))
1333 (multiple-value-bind (kind data
) (read-entity-like input
)
1334 (cond ((eq kind
:ENTITY-REFERENCE
)
1335 (values :ENTITY-REF data
))
1336 ((eq kind
:CHARACTER-REFERENCE
)
1338 (with-rune-collector (collect)
1339 (%put-unicode-char data collect
)))))))
1341 (unread-rune c input
)
1342 (values :CDATA
(read-cdata input
)))))))))))
1344 (definline check-rune
(input actual expected
)
1345 (unless (eql actual expected
)
1346 (wf-error input
"expected #/~A but found #/~A"
1347 (rune-char expected
)
1348 (rune-char actual
))))
1350 (defun read-pe-reference (zinput)
1351 (let* ((input (car (zstream-input-stack zinput
)))
1352 (nam (read-name-token input
)))
1353 (check-rune input
#/\
; (read-rune input))
1354 (cond (*expand-pe-p
*
1355 ;; no external entities here!
1356 (let ((i2 (entity->xstream zinput nam
:parameter
)))
1357 (zstream-push i2 zinput
))
1358 (values :S nil
) ;space before inserted PE expansion.
1361 (values :PE-REFERENCE nam
)) )))
1363 (defun read-token-after-|
<|
(zinput input
)
1364 (let ((d (read-rune input
)))
1366 (eox input
"EOF after '<'"))
1368 (read-token-after-|
<!| input
))
1370 (multiple-value-bind (target content
) (read-pi input
)
1371 (cond ((rod= target
'#.
(string-rod "xml"))
1372 (values :xml-decl
(cons target content
)))
1373 ((rod-equal target
'#.
(string-rod "XML"))
1375 "You lost -- no XML processing instructions."))
1376 ((and sax
:*namespace-processing
* (position #/: target
))
1378 "Processing instruction target ~S is not a ~
1382 (values :PI
(cons target content
))))))
1383 ((eq *data-behaviour
* :DTD
)
1384 (unread-rune d input
)
1385 (unless (or (rune= #// d
) (name-start-rune-p d
))
1386 (wf-error zinput
"Expected '!' or '?' after '<' in DTD."))
1387 (values :seen-
< nil
))
1389 (let ((c (peek-rune input
)))
1390 (cond ((name-start-rune-p c
)
1391 (read-tag-2 zinput input
:etag
))
1394 "Expecting name start rune after \"</\".")))))
1395 ((name-start-rune-p d
)
1396 (unread-rune d input
)
1397 (read-tag-2 zinput input
:stag
))
1399 (wf-error zinput
"Expected '!' or '?' after '<' in DTD.")))))
1401 (defun read-token-after-|
<!|
(input)
1402 (let ((d (read-rune input
)))
1404 (eox input
"EOF after \"<!\"."))
1405 ((name-start-rune-p d
)
1406 (unread-rune d input
)
1407 (let ((name (read-name-token input
)))
1408 (cond ((rod= name
'#.
(string-rod "ELEMENT")) :|
<!ELEMENT|
)
1409 ((rod= name
'#.
(string-rod "ENTITY")) :|
<!ENTITY|
)
1410 ((rod= name
'#.
(string-rod "ATTLIST")) :|
<!ATTLIST|
)
1411 ((rod= name
'#.
(string-rod "NOTATION")) :|
<!NOTATION|
)
1412 ((rod= name
'#.
(string-rod "DOCTYPE")) :|
<!DOCTYPE|
)
1414 (wf-error input
"`<!~A' unknown." (rod-string name
))))))
1416 (values :|
<![| nil
))
1418 (setf d
(read-rune input
))
1419 (cond ((rune= #/- d
)
1422 (read-comment-content input
)))
1424 (wf-error input
"Bad character ~S after \"<!-\"" d
))))
1426 (wf-error input
"Bad character ~S after \"<!\"" d
)))))
1428 (definline read-S?
(input)
1429 (while (member (peek-rune input
) '(#/U
+0020 #/U
+0009 #/U
+000A
#/U
+000D
)
1431 (consume-rune input
)))
1433 (defun read-attribute-list (zinput input imagine-space-p
)
1434 (cond ((or imagine-space-p
1435 (let ((c (peek-rune input
)))
1436 (and (not (eq c
:eof
))
1439 (cond ((eq (peek-rune input
) :eof
)
1441 ((name-start-rune-p (peek-rune input
))
1442 (cons (read-attribute zinput input
)
1443 (read-attribute-list zinput input nil
)))
1449 (defun read-entity-like (input)
1450 "Read an entity reference off the xstream `input'. Returns two values:
1451 either :ENTITY-REFERENCE <interned-rod> in case of a named entity
1452 or :CHARACTER-REFERENCE <integer> in case of character references.
1453 The initial #\\& is considered to be consumed already."
1454 (let ((c (peek-rune input
)))
1456 (eox input
"EOF after '&'"))
1458 (values :CHARACTER-REFERENCE
(read-character-reference input
)))
1460 (unless (name-start-rune-p (peek-rune input
))
1461 (wf-error input
"Expecting name after &."))
1462 (let ((name (read-name-token input
)))
1463 (setf c
(read-rune input
))
1464 (unless (rune= c
#/\
;)
1465 (wf-error input
"Expected \";\"."))
1466 (values :ENTITY-REFERENCE name
))))))
1468 (defun read-tag-2 (zinput input kind
)
1469 (let ((name (read-name-token input
))
1471 (setf atts
(read-attribute-list zinput input nil
))
1473 ;; check for double attributes
1474 (do ((q atts
(cdr q
)))
1476 (cond ((find (caar q
) (cdr q
) :key
#'car
)
1477 (wf-error zinput
"Attribute ~S has two definitions in element ~S."
1478 (rod-string (caar q
))
1479 (rod-string name
)))))
1481 (cond ((eq (peek-rune input
) #/>)
1482 (consume-rune input
)
1483 (values kind
(cons name atts
)))
1484 ((eq (peek-rune input
) #//)
1485 (consume-rune input
)
1486 (check-rune input
#/> (read-rune input
))
1487 (values :ztag
(cons name atts
)))
1489 (wf-error zinput
"syntax error in read-tag-2.")) )))
1491 (defun read-attribute (zinput input
)
1492 (unless (name-start-rune-p (peek-rune input
))
1493 (wf-error zinput
"Expected name."))
1494 ;; arg thanks to the post mortem nature of name space declarations,
1495 ;; we could only process the attribute values post mortem.
1496 (let ((name (read-name-token input
)))
1497 (while (let ((c (peek-rune input
)))
1498 (and (not (eq c
:eof
))
1499 (or (rune= c
#/U
+0020)
1502 (rune= c
#/U
+000D
))))
1503 (consume-rune input
))
1504 (unless (eq (read-rune input
) #/=)
1505 (wf-error zinput
"Expected \"=\"."))
1506 (while (let ((c (peek-rune input
)))
1507 (and (not (eq c
:eof
))
1508 (or (rune= c
#/U
+0020)
1511 (rune= c
#/U
+000D
))))
1512 (consume-rune input
))
1513 (cons name
(read-att-value-2 input
))))
1515 (defun canon-not-cdata-attval (value)
1516 ;; | If the declared value is not CDATA, then the XML processor must
1517 ;; | further process the normalized attribute value by discarding any
1518 ;; | leading and trailing space (#x20) characters, and by replacing
1519 ;; | sequences of space (#x20) characters by a single space (#x20)
1521 (with-rune-collector (collect)
1522 (let ((gimme-20 nil
)
1523 (anything-seen-p nil
))
1524 (map nil
(lambda (c)
1525 (cond ((rune= c
#/u
+0020)
1528 (when (and anything-seen-p gimme-20
)
1531 (setf anything-seen-p t
)
1535 (definline data-rune-p
(rune)
1536 ;; Any Unicode character, excluding FFFE, and FFFF.
1537 ;; Allow surrogates if using UTF-16, else allow >= 0x10000.
1538 (let ((c (rune-code rune
)))
1539 (or (= c
#x9
) (= c
#xA
) (= c
#xD
)
1541 #+rune-is-utf-16
(<= #xD800 c
#xDFFF
)
1542 (<= #xE000 c
#xFFFD
)
1543 #-rune-is-utf-16
(<= #x10000 c
#x10FFFF
))))
1545 (defun read-att-value (zinput input mode
&optional canon-space-p
(delim nil
))
1546 (with-rune-collector-2 (collect)
1547 (labels ((muffle (input delim
)
1550 (setf c
(read-rune input
))
1551 (cond ((eql delim c
)
1556 (setf c
(peek-rune input
))
1560 (let ((c (read-character-reference input
)))
1561 (%put-unicode-char c collect
)))
1563 (unless (name-start-rune-p (peek-rune input
))
1564 (wf-error zinput
"Expecting name after &."))
1565 (let ((name (read-name-token input
)))
1566 (setf c
(read-rune input
))
1567 (check-rune input c
#/\
;)
1571 zinput name
:general
1573 (muffle (car (zstream-input-stack zinput
))
1577 ;; bypass, but never the less we
1578 ;; need to check for legal
1580 ;; Must it be defined?
1581 ;; allerdings: unparsed sind verboten
1583 (map nil
(lambda (x) (collect x
)) name
)
1584 (collect #/\
; )))))))
1585 ((and (eq mode
:ENT
) (rune= c
#/%
))
1586 (let ((d (peek-rune input
)))
1589 (unless (name-start-rune-p d
)
1590 (wf-error zinput
"Expecting name after %.")))
1591 (let ((name (read-name-token input
)))
1592 (setf c
(read-rune input
))
1593 (check-rune input c
#/\
;)
1594 (cond (*expand-pe-p
*
1596 zinput name
:parameter
1598 (muffle (car (zstream-input-stack zinput
))
1601 (wf-error zinput
"No PE here.")))))
1602 ((and (eq mode
:ATT
) (rune= c
#/<))
1603 (wf-error zinput
"unexpected #\/<"))
1604 ((and canon-space-p
(space-rune-p c
))
1606 ((not (data-rune-p c
))
1607 (wf-error zinput
"illegal char: ~S." c
))
1610 (declare (dynamic-extent #'muffle
))
1611 (muffle input
(or delim
1612 (let ((delim (read-rune input
)))
1613 (unless (member delim
'(#/\" #/\') :test
#'eql
)
1614 (wf-error zinput
"invalid attribute delimiter"))
1617 (defun read-character-reference (input)
1618 ;; The #/& is already read
1620 (let ((c (read-rune input
)))
1621 (check-rune input c
#/#)
1622 (setq c
(read-rune input
))
1627 (setq c
(read-rune input
))
1630 (unless (digit-rune-p c
16)
1631 (wf-error input
"garbage in character reference"))
1634 (with-output-to-string (sink)
1635 (write-char (rune-char c
) sink
)
1637 (setq c
(read-rune input
))
1640 (digit-rune-p c
16))
1641 (write-char (rune-char c
) sink
)))
1643 (check-rune input c
#/\
;)))
1648 (with-output-to-string (sink)
1649 (write-char (rune-char c
) sink
)
1651 (setq c
(read-rune input
))
1655 (write-char (rune-char c
) sink
)))
1657 (check-rune input c
#/\
;)))
1659 (wf-error input
"Bad char in numeric character entity."))))))
1660 (unless (code-data-char-p res
)
1663 "expansion of numeric character reference (#x~X) is no data char."
1667 (defun read-pi (input)
1668 ;; "<?" is already read
1670 (let ((c (peek-rune input
)))
1671 (unless (name-start-rune-p c
)
1672 (wf-error input
"Expecting name after '<?'"))
1673 (setf name
(read-name-token input
)))
1675 ((member (peek-rune input
) '(#/U
+0020 #/U
+0009 #/U
+000A
#/U
+000D
)
1677 (values name
(read-pi-content input
)))
1679 (unless (and (eql (read-rune input
) #/?
)
1680 (eql (read-rune input
) #/>))
1681 (wf-error input
"malformed processing instruction"))
1682 (values name
"")))))
1684 (defun read-pi-content (input)
1687 (with-rune-collector (collect)
1691 (setf d
(read-rune input
))
1694 (unless (data-rune-p d
)
1695 (wf-error input
"Illegal char: ~S." d
))
1696 (when (rune= d
#/?
) (go state-2
))
1700 (setf d
(read-rune input
))
1703 (unless (data-rune-p d
)
1704 (wf-error input
"Illegal char: ~S." d
))
1705 (when (rune= d
#/>) (return))
1713 (defun read-comment-content (input &aux d
)
1714 (with-rune-collector (collect)
1718 (setf d
(read-rune input
))
1721 (unless (data-rune-p d
)
1722 (wf-error input
"Illegal char: ~S." d
))
1723 (when (rune= d
#/-
) (go state-2
))
1727 (setf d
(read-rune input
))
1730 (unless (data-rune-p d
)
1731 (wf-error input
"Illegal char: ~S." d
))
1732 (when (rune= d
#/-
) (go state-3
))
1736 state-3
;; #/- #/- seen
1737 (setf d
(read-rune input
))
1740 (unless (data-rune-p d
)
1741 (wf-error input
"Illegal char: ~S." d
))
1742 (when (rune= d
#/>) (return))
1743 (wf-error input
"'--' not allowed in a comment")
1752 (defun read-cdata-sect (input &aux d
)
1753 ;; <![CDATA[ is already read
1754 ;; read anything up to ]]>
1755 (with-rune-collector (collect)
1759 (setf d
(read-rune input
))
1762 (unless (data-rune-p d
)
1763 (wf-error input
"Illegal char: ~S." d
))
1764 (when (rune= d
#/\
]) (go state-2
))
1768 (setf d
(read-rune input
))
1771 (unless (data-rune-p d
)
1772 (wf-error input
"Illegal char: ~S." d
))
1773 (when (rune= d
#/\
]) (go state-3
))
1777 state-3
;; #/\] #/\] seen
1778 (setf d
(read-rune input
))
1781 (unless (data-rune-p d
)
1782 (wf-error input
"Illegal char: ~S." d
))
1785 (when (rune= d
#/\
])
1793 ;; some character categories
1795 (defun space-rune-p (rune)
1796 (declare (type rune rune
))
1797 (or (rune= rune
#/U
+0020)
1798 (rune= rune
#/U
+0009)
1799 (rune= rune
#/U
+000A
)
1800 (rune= rune
#/U
+000D
)))
1802 (defun code-data-char-p (c)
1803 ;; Any Unicode character, excluding FFFE, and FFFF.
1804 ;; Allow surrogates if using UTF-16, else allow >= 0x10000.
1805 (or (= c
#x9
) (= c
#xA
) (= c
#xD
)
1807 #+rune-is-utf-16
(<= #xD800 c
#xDFFF
)
1808 (<= #xE000 c
#xFFFD
)
1809 #-rune-is-utf-16
(<= #x10000 c
#x10FFFF
)))
1811 (defun pubid-char-p (c)
1812 (or (rune= c
#/u
+0020) (rune= c
#/u
+000D
) (rune= c
#/u
+000A
)
1816 (member c
'(#/-
#/' #/\
( #/\
) #/+ #/, #/.
#//
1817 #/: #/= #/?
#/\
; #/! #/* #/#
1821 (defun expect (input category
)
1822 (multiple-value-bind (cat sem
) (read-token input
)
1823 (unless (eq cat category
)
1824 (wf-error input
"Expected ~S saw ~S [~S]" category cat sem
))
1827 (defun consume-token (input)
1830 ;;;; ---------------------------------------------------------------------------
1835 ;; S ::= (#x20 | #x9 | #xD | #xA)+
1837 (while (eq (peek-token input
) :S
)
1838 (consume-token input
)))
1841 ;; S ::= (#x20 | #x9 | #xD | #xA)+
1842 (while (eq (peek-token input
) :S
)
1843 (consume-token input
)))
1845 (defun p/nmtoken
(input)
1846 (nth-value 1 (expect input
:nmtoken
)))
1848 (defun p/name
(input)
1849 (let ((result (p/nmtoken input
)))
1850 (unless (name-start-rune-p (elt result
0))
1851 (wf-error input
"Expected name."))
1854 (defun p/attlist-decl
(input)
1855 ;; [52] AttlistDecl ::= '<!ATTLIST' S Name (S AttDef)* S? '>'
1857 (expect input
:|
<!ATTLIST|
)
1859 (setf elm-name
(p/nmtoken input
))
1861 (let ((tok (read-token input
)))
1865 (cond ((eq (peek-token input
) :>)
1866 (consume-token input
)
1869 (multiple-value-bind (name type default
) (p/attdef input
)
1870 (define-attribute (dtd *ctx
*) elm-name name type default
)) )))
1875 "Expected either another AttDef or end of \"<!ATTLIST\". -- saw ~S."
1878 (defun p/attdef
(input)
1879 ;; [53] AttDef ::= Name S AttType S DefaultDecl
1880 (let (name type default
)
1881 (setf name
(p/nmtoken input
))
1883 (setf type
(p/att-type input
))
1885 (setf default
(p/default-decl input
))
1886 (values name type default
)))
1888 (defun p/list
(input item-parser delimiter
)
1889 ;; Parse something like S? <item> (S? <delimiter> <item>)* S?
1891 (declare (type function item-parser
))
1894 (setf res
(list (funcall item-parser input
)))
1897 (cond ((eq (peek-token input
) delimiter
)
1898 (consume-token input
)
1900 (push (funcall item-parser input
) res
))
1906 (defun p/att-type
(input)
1907 ;; [54] AttType ::= StringType | TokenizedType | EnumeratedType
1908 ;; [55] StringType ::= 'CDATA'
1909 ;; [56] TokenizedType ::= 'ID' /*VC: ID */
1910 ;; /*VC: One ID per Element Type */
1911 ;; /*VC: ID Attribute Default */
1912 ;; | 'IDREF' /*VC: IDREF */
1913 ;; | 'IDREFS' /*VC: IDREF */
1914 ;; | 'ENTITY' /*VC: Entity Name */
1915 ;; | 'ENTITIES' /*VC: Entity Name */
1916 ;; | 'NMTOKEN' /*VC: Name Token */
1917 ;; | 'NMTOKENS' /*VC: Name Token */
1918 ;; [57] EnumeratedType ::= NotationType | Enumeration
1919 ;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
1920 ;; /* VC: Notation Attributes */
1921 ;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' /* VC: Enumeration */
1922 (multiple-value-bind (cat sem
) (read-token input
)
1923 (cond ((eq cat
:nmtoken
)
1924 (cond ((rod= sem
'#.
(string-rod "CDATA")) :CDATA
)
1925 ((rod= sem
'#.
(string-rod "ID")) :ID
)
1926 ((rod= sem
'#.
(string-rod "IDREF")) :IDREFS
)
1927 ((rod= sem
'#.
(string-rod "IDREFS")) :IDREFS
)
1928 ((rod= sem
'#.
(string-rod "ENTITY")) :ENTITY
)
1929 ((rod= sem
'#.
(string-rod "ENTITIES")) :ENTITIES
)
1930 ((rod= sem
'#.
(string-rod "NMTOKEN")) :NMTOKEN
)
1931 ((rod= sem
'#.
(string-rod "NMTOKENS")) :NMTOKENS
)
1932 ((rod= sem
'#.
(string-rod "NOTATION"))
1936 (setf names
(p/list input
#'p
/nmtoken
:\|
))
1939 (setf (referenced-notations *ctx
*)
1940 (append names
(referenced-notations *ctx
*))))
1941 (cons :NOTATION names
)))
1943 (wf-error input
"In p/att-type: ~S ~S." cat sem
))))
1945 ;; XXX Die Nmtoken-Syntax pruefen wir derzeit nur beim Validieren.
1947 ;;(expect input :\()
1948 (setf names
(p/list input
#'p
/nmtoken
:\|
))
1950 (cons :ENUMERATION names
)))
1952 (wf-error input
"In p/att-type: ~S ~S." cat sem
)) )))
1954 (defun p/default-decl
(input)
1955 ;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1956 ;; | (('#FIXED' S)? AttValue) /* VC: Required Attribute */
1958 ;; /* VC: Attribute Default Legal */
1959 ;; /* WFC: No < in Attribute Values */
1960 ;; /* VC: Fixed Attribute Default */
1961 (multiple-value-bind (cat sem
) (peek-token input
)
1962 (cond ((eq cat
:|
#REQUIRED|
)
1963 (consume-token input
) :REQUIRED
)
1964 ((eq cat
:|
#IMPLIED|
)
1965 (consume-token input
) :IMPLIED
)
1967 (consume-token input
)
1969 (list :FIXED
(p/att-value input
)))
1970 ((or (eq cat
:\') (eq cat
:\"))
1971 (list :DEFAULT
(p/att-value input
)))
1973 (wf-error input
"p/default-decl: ~S ~S." cat sem
)) )))
1976 ;; [70] EntityDecl ::= GEDecl | PEDecl
1977 ;; [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
1978 ;; [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
1979 ;; [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
1980 ;; [74] PEDef ::= EntityValue | ExternalID
1981 ;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1982 ;; | 'PUBLIC' S PubidLiteral S SystemLiteral
1983 ;; [76] NDataDecl ::= S 'NDATA' S Name /* VC: Notation Declared */
1985 (defun p/entity-decl
(input)
1986 (let (name def kind
)
1987 (expect input
:|
<!ENTITY|
)
1989 (cond ((eq (peek-token input
) :%
)
1990 (setf kind
:parameter
)
1991 (consume-token input
)
1994 (setf kind
:general
)))
1995 (setf name
(p/name input
))
1997 (setf def
(p/entity-def input kind
))
1998 (define-entity input name kind def
)
2000 (expect input
:\
>)))
2002 (defun report-entity (h kind name def
)
2005 (let ((extid (entdef-extid def
))
2006 (ndata (entdef-ndata def
)))
2008 (sax:unparsed-entity-declaration h
2010 (extid-public extid
)
2011 (uri-rod (extid-system extid
))
2013 (sax:external-entity-declaration h
2016 (extid-public extid
)
2017 (uri-rod (extid-system extid
))))))
2019 (sax:internal-entity-declaration h kind name
(entdef-value def
)))))
2021 (defun p/entity-def
(input kind
)
2022 (multiple-value-bind (cat sem
) (peek-token input
)
2023 (cond ((member cat
'(:\" :\'))
2024 (make-internal-entdef (p/entity-value input
)))
2025 ((and (eq cat
:nmtoken
)
2026 (or (rod= sem
'#.
(string-rod "SYSTEM"))
2027 (rod= sem
'#.
(string-rod "PUBLIC"))))
2029 (setf extid
(p/external-id input nil
))
2030 (when (eq kind
:general
) ;NDATA allowed at all?
2031 (cond ((eq (peek-token input
) :S
)
2033 (when (and (eq (peek-token input
) :nmtoken
)
2034 (rod= (nth-value 1 (peek-token input
))
2035 '#.
(string-rod "NDATA")))
2036 (consume-token input
)
2038 (setf ndata
(p/nmtoken input
))
2040 (push ndata
(referenced-notations *ctx
*)))))))
2041 (make-external-entdef extid ndata
)))
2043 (wf-error input
"p/entity-def: ~S / ~S." cat sem
)) )))
2045 (defun p/entity-value
(input)
2046 (let ((delim (if (eq (read-token input
) :\") #/\" #/\')))
2047 (read-att-value input
2048 (car (zstream-input-stack input
))
2053 (defun p/att-value
(input)
2054 (let ((delim (if (eq (read-token input
) :\") #/\" #/\')))
2055 (read-att-value input
2056 (car (zstream-input-stack input
))
2061 (defun p/external-id
(input &optional
(public-only-ok-p nil
))
2062 ;; xxx public-only-ok-p
2063 (multiple-value-bind (cat sem
) (read-token input
)
2064 (cond ((and (eq cat
:nmtoken
) (rod= sem
'#.
(string-rod "SYSTEM")))
2066 (make-extid nil
(p/system-literal input
)))
2067 ((and (eq cat
:nmtoken
) (rod= sem
'#.
(string-rod "PUBLIC")))
2070 (setf pub
(p/pubid-literal input
))
2071 (when (eq (peek-token input
) :S
)
2073 (when (member (peek-token input
) '(:\" :\'))
2074 (setf sys
(p/system-literal input
))))
2075 (when (and (not public-only-ok-p
)
2077 (wf-error input
"System identifier needed for this PUBLIC external identifier."))
2078 (make-extid pub sys
)))
2080 (wf-error input
"Expected external-id: ~S / ~S." cat sem
)))))
2083 ;; [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2084 ;; [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2085 ;; [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9]
2086 ;; | [-'()+,./:=?;!*#@$_%]
2089 (multiple-value-bind (cat) (read-token input
)
2090 (cond ((member cat
'(:\" :\'))
2091 (let ((delim (if (eq cat
:\") #/\" #/\')))
2092 (with-rune-collector (collect)
2094 (let ((c (read-rune (car (zstream-input-stack input
)))))
2096 (eox input
"EOF in system literal."))
2102 (wf-error input
"Expect either \" or \'.")))))
2104 ;; it is important to cache the orginal URI rod, since the re-serialized
2105 ;; uri-string can be different from the one parsed originally.
2106 (defun uri-rod (uri)
2108 (or (getf (puri:uri-plist uri
) 'original-rod
)
2109 (rod (puri:render-uri uri nil
)))
2112 (defun safe-parse-uri (str)
2113 ;; puri doesn't like strings starting with file:///, although that is a very
2114 ;; common is practise. Cut it away, we don't distinguish between scheme
2115 ;; :FILE and NIL anway.
2116 (when (eql (search "file://" str
) 0)
2117 (setf str
(subseq str
(length "file://"))))
2118 (puri:parse-uri
(coerce str
'simple-string
)))
2120 (defun p/system-literal
(input)
2121 (let* ((rod (p/id input
))
2122 (result (safe-parse-uri (rod-string rod
))))
2123 (setf (getf (puri:uri-plist result
) 'original-rod
) rod
)
2126 (defun p/pubid-literal
(input)
2127 (let ((result (p/id input
)))
2128 (unless (every #'pubid-char-p result
)
2129 (wf-error input
"Illegal pubid: ~S." (rod-string result
)))
2135 (defun p/element-decl
(input)
2137 (expect input
:|
<!ELEMENT|
)
2139 (setf name
(p/nmtoken input
))
2141 (setf content
(normalize-mixed-cspec (p/cspec input
)))
2142 (unless (legal-content-model-p content
*validate
*)
2143 (wf-error input
"Malformed or invalid content model: ~S." (mu content
)))
2146 (define-element (dtd *ctx
*) name content
)
2147 (list :element name content
)))
2149 (defun maybe-compile-cspec (e)
2150 (or (elmdef-compiled-cspec e
)
2151 (setf (elmdef-compiled-cspec e
)
2152 (let ((cspec (elmdef-content e
)))
2154 (validity-error "(03) Element Valid: no definition for ~A"
2155 (rod-string (elmdef-name e
))))
2156 (multiple-value-call #'cons
2157 (compile-cspec cspec
(standalone-check-necessary-p e
)))))))
2159 (defun make-root-model (name)
2160 (cons (lambda (actual-name)
2161 (if (rod= actual-name name
)
2166 ;;; content spec validation:
2168 ;;; Given a `contentspec', COMPILE-CSPEC returns as multiple values two
2169 ;;; functions A and B of one argument to be called for every
2170 ;;; A. child element
2171 ;;; B. text child node
2173 ;;; Function A will be called with
2174 ;;; - the element name rod as its argument. If that element may appear
2175 ;;; at the current position, a new function to be called for the next
2176 ;;; child is returned. Otherwise NIL is returned.
2177 ;;; - argument NIL at the end of the element, it must then return T or NIL
2178 ;;; to indicate whether the end tag is valid.
2180 ;;; Function B will be called with the character data rod as its argument, it
2181 ;;; returns a boolean indicating whether this text node is allowed.
2183 ;;; That is, if one of the functions ever returns NIL, the node is
2184 ;;; rejected as invalid.
2186 (defun cmodel-done (actual-value)
2187 (null actual-value
))
2189 (defun compile-cspec (cspec &optional standalone-check
)
2193 (:EMPTY
(values #'cmodel-done
(constantly nil
)))
2194 (:PCDATA
(values #'cmodel-done
(constantly t
)))
2196 (values (labels ((doit (name) (if name
#'doit t
))) #'doit
)
2198 ((and (eq (car cspec
) '*)
2199 (let ((subspec (second cspec
)))
2200 (and (eq (car subspec
) 'or
) (eq (cadr subspec
) :PCDATA
))))
2201 (values (compile-mixed (second cspec
))
2204 (values (compile-content-model cspec
)
2206 (when standalone-check
2207 (validity-error "(02) Standalone Document Declaration: whitespace"))
2208 (every #'white-space-rune-p rod
))))))
2210 (defun compile-mixed (cspec)
2211 ;; das koennten wir theoretisch auch COMPILE-CONTENT-MODEL erledigen lassen
2212 (let ((allowed-names (cddr cspec
)))
2213 (labels ((doit (actual-name)
2215 ((null actual-name
) t
)
2216 ((member actual-name allowed-names
:test
#'rod
=) #'doit
)
2220 (defun compile-content-model (cspec &optional
(continuation #'cmodel-done
))
2222 (lambda (actual-name)
2223 (if (and actual-name
(rod= cspec actual-name
))
2228 (labels ((traverse (seq)
2229 (compile-content-model (car seq
)
2231 (traverse (cdr seq
))
2233 (traverse (cdr cspec
))))
2235 (let ((options (mapcar (rcurry #'compile-content-model continuation
)
2237 (lambda (actual-name)
2238 (some (rcurry #'funcall actual-name
) options
))))
2240 (let ((maybe (compile-content-model (second cspec
) continuation
)))
2241 (lambda (actual-name)
2242 (or (funcall maybe actual-name
)
2243 (funcall continuation actual-name
)))))
2245 (let (maybe-continuation)
2246 (labels ((recurse (actual-name)
2247 (if (null actual-name
)
2248 (funcall continuation actual-name
)
2249 (or (funcall maybe-continuation actual-name
)
2250 (funcall continuation actual-name
)))))
2251 (setf maybe-continuation
2252 (compile-content-model (second cspec
) #'recurse
))
2255 (let ((it (cadr cspec
)))
2256 (compile-content-model `(and ,it
(* ,it
)) continuation
))))))
2258 (defun setp (list &key
(test 'eql
))
2259 (equal list
(remove-duplicates list
:test test
)))
2261 (defun legal-content-model-p (cspec &optional validate
)
2262 (or (eq cspec
:PCDATA
)
2267 (consp (cadr cspec
))
2268 (eq (car (cadr cspec
)) 'or
)
2269 (eq (cadr (cadr cspec
)) :PCDATA
)
2270 (every #'vectorp
(cddr (cadr cspec
)))
2271 (if (and validate
(not (setp (cddr (cadr cspec
)) :test
#'rod
=)))
2272 (validity-error "VC: No Duplicate Types (07)")
2275 (cond ((member x
'(:PCDATA
:ANY
:EMPTY
))
2278 ((and (walk (car x
))
2282 ;; wir fahren besser, wenn wir machen:
2284 ;; cspec ::= 'EMPTY' | 'ANY' | '#PCDATA'
2287 ;; cs ::= '(' S? cspec ( S? '|' S? cspec)* S? ')' ('?' | '*' | '+')?
2288 ;; und eine post factum analyse
2290 (defun p/cspec
(input &optional recursivep
)
2292 (let ((names nil
) op-cat op res stream
)
2293 (multiple-value-bind (cat sem
) (peek-token input
)
2294 (cond ((eq cat
:nmtoken
)
2295 (consume-token input
)
2296 (cond ((rod= sem
'#.
(string-rod "EMPTY"))
2298 ((rod= sem
'#.
(string-rod "ANY"))
2301 (wf-error input
"invalid content spec"))
2305 (consume-token input
)
2308 (setf stream
(car (zstream-input-stack input
)))
2309 (consume-token input
)
2311 (setq names
(list (p/cspec input t
)))
2313 (cond ((member (peek-token input
) '(:\|
:\
,))
2314 (setf op-cat
(peek-token input
))
2315 (setf op
(if (eq op-cat
:\
,) 'and
'or
))
2316 (while (eq (peek-token input
) op-cat
)
2317 (consume-token input
)
2319 (push (p/cspec input t
) names
)
2321 (setf res
(cons op
(reverse names
))))
2323 (setf res
(cons 'and names
))))
2327 (unless (eq stream
(car (zstream-input-stack input
)))
2328 (validity-error "(06) Proper Group/PE Nesting")))
2331 (wf-error input
"p/cspec - ~s / ~s" cat sem
)))))))
2332 (cond ((eq (peek-token input
) :?
) (consume-token input
) (list '? term
))
2333 ((eq (peek-token input
) :+) (consume-token input
) (list '+ term
))
2334 ((eq (peek-token input
) :*) (consume-token input
) (list '* term
))
2338 (defun normalize-mixed-cspec (cspec)
2339 ;; der Parser oben funktioniert huebsch fuer die children-Regel, aber
2340 ;; fuer Mixed ist das Ergebnis nicht praktisch, denn dort wollen wir
2341 ;; eigentlich auf eine Liste von Namen in einheitlichem Format hinaus.
2342 ;; Dazu normalisieren wir einfach in eine der beiden folgenden Formen:
2343 ;; (* (or :PCDATA ...rods...)) -- und zwar exakt so!
2344 ;; :PCDATA -- sonst ganz trivial
2345 (flet ((trivialp (c)
2347 (and (eq (car c
) 'and
)
2348 (eq (cadr c
) :PCDATA
)
2350 (if (or (trivialp cspec
) ;(and PCDATA)
2351 (and (consp cspec
) ;(* (and PCDATA))
2352 (and (eq (car cspec
) '*)
2354 (trivialp (cadr cspec
)))))
2358 ;; [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
2361 ;; [52] AttlistDecl ::= '<!ATTLIST' S Name AttDefs S? '>'
2362 ;; [52] AttlistDecl ::= '<!ATTLIST' S Name S? '>'
2363 ;; [53] AttDefs ::= S Name S AttType S DefaultDecl AttDefs
2366 (defun p/notation-decl
(input)
2368 (expect input
:|
<!NOTATION|
)
2370 (setf name
(p/name input
))
2372 (setf id
(p/external-id input t
))
2375 (sax:notation-declaration
(handler *ctx
*)
2377 (if (extid-public id
)
2378 (normalize-public-id (extid-public id
))
2380 (uri-rod (extid-system id
)))
2381 (when (and sax
:*namespace-processing
* (find #/: name
))
2382 (wf-error input
"colon in notation name"))
2384 (define-notation (dtd *ctx
*) name id
))
2385 (list :notation-decl name id
)))
2387 (defun normalize-public-id (rod)
2388 (with-rune-collector (collect)
2389 (let ((gimme-20 nil
)
2390 (anything-seen-p nil
))
2391 (map nil
(lambda (c)
2393 ((or (rune= c
#/u
+0009)
2399 (when (and anything-seen-p gimme-20
)
2402 (setf anything-seen-p t
)
2408 (defun p/conditional-sect
(input)
2409 (expect input
:<!\
[ )
2410 (let ((stream (car (zstream-input-stack input
))))
2412 (multiple-value-bind (cat sem
) (read-token input
)
2413 (cond ((and (eq cat
:nmtoken
)
2414 (rod= sem
'#.
(string-rod "INCLUDE")))
2415 (p/include-sect input stream
))
2416 ((and (eq cat
:nmtoken
)
2417 (rod= sem
'#.
(string-rod "IGNORE")))
2418 (p/ignore-sect input stream
))
2420 (wf-error input
"Expected INCLUDE or IGNORE after \"<![\"."))))))
2422 (defun p/cond-expect
(input cat initial-stream
)
2425 (unless (eq (car (zstream-input-stack input
)) initial-stream
)
2426 (validity-error "(21) Proper Conditional Section/PE Nesting"))))
2428 (defun p/include-sect
(input initial-stream
)
2429 ;; <![INCLUDE is already read.
2431 (p/cond-expect input
:\
[ initial-stream
)
2432 (p/ext-subset-decl input
)
2433 (p/cond-expect input
:\
] initial-stream
)
2434 (p/cond-expect input
:\
] initial-stream
)
2435 (p/cond-expect input
:\
> initial-stream
))
2437 (defun p/ignore-sect
(input initial-stream
)
2438 ;; <![IGNORE is already read.
2439 ;; XXX Is VC 21 being checked for nested sections?
2441 (p/cond-expect input
:\
[ initial-stream
)
2442 (let ((input (car (zstream-input-stack input
))))
2444 (do ((c1 (read-rune input
) (read-rune input
))
2448 (declare (type fixnum level
))
2450 (eox input
"EOF in <![IGNORE ... >")))
2451 (cond ((and (rune= c3
#/<) (rune= c2
#/!) (rune= c1
#/\
[))
2453 (cond ((and (rune= c3
#/\
]) (rune= c2
#/\
]) (rune= c1
#/>))
2455 (unless (eq (car (zstream-input-stack input
)) initial-stream
)
2456 (validity-error "(21) Proper Conditional Section/PE Nesting")))
2458 (defun p/ext-subset-decl
(input)
2459 ;; ( markupdecl | conditionalSect | S )*
2461 (case (let ((*expand-pe-p
* nil
)) (peek-token input
))
2462 (:|
<![|
(let ((*expand-pe-p
* t
)) (p/conditional-sect input
)))
2463 (:S
(consume-token input
))
2465 ((:|
<!ELEMENT|
:|
<!ATTLIST|
:|
<!ENTITY|
:|
<!NOTATION|
:PI
:COMMENT
)
2466 (let ((*expand-pe-p
* t
)
2467 (*external-subset-p
* t
))
2468 (p/markup-decl input
)))
2470 (let ((name (nth-value 1 (read-token input
))))
2471 (recurse-on-entity input name
:parameter
2473 (etypecase (checked-get-entdef name
:parameter
)
2475 (p/ext-subset input
))
2477 (p/ext-subset-decl input
)))
2478 (unless (eq :eof
(peek-token input
))
2479 (wf-error input
"Trailing garbage."))))))
2480 (otherwise (return)))) )
2482 (defun p/markup-decl
(input)
2484 (let ((stream (car (zstream-input-stack input
))))
2485 (multiple-value-prog1
2486 (p/markup-decl-unsafe input
)
2488 (unless (eq stream
(car (zstream-input-stack input
)))
2489 (validity-error "(01) Proper Declaration/PE Nesting"))))))
2491 (defun p/markup-decl-unsafe
(input)
2492 ;; markupdecl ::= elementdecl | AttlistDecl /* VC: Proper Declaration/PE Nesting */
2493 ;; | EntityDecl | NotationDecl
2494 ;; | PI | Comment /* WFC: PEs in Internal Subset */
2495 (let ((token (peek-token input
))
2496 (*expand-pe-p
* (and *expand-pe-p
* *external-subset-p
*)))
2498 (:|
<!ELEMENT|
(p/element-decl input
))
2499 (:|
<!ATTLIST|
(p/attlist-decl input
))
2500 (:|
<!ENTITY|
(p/entity-decl input
))
2501 (:|
<!NOTATION|
(p/notation-decl input
))
2503 (let ((sem (nth-value 1 (read-token input
))))
2504 (sax:processing-instruction
(handler *ctx
*) (car sem
) (cdr sem
))))
2505 (:COMMENT
(consume-token input
))
2507 (wf-error input
"p/markup-decl ~S" (peek-token input
))))))
2509 (defun setup-encoding (input xml-header
)
2510 (when (xml-header-encoding xml-header
)
2511 (let ((enc (find-encoding (xml-header-encoding xml-header
))))
2513 (setf (xstream-encoding (car (zstream-input-stack input
)))
2516 (warn "There is no such encoding: ~S." (xml-header-encoding xml-header
)))))))
2518 (defun set-full-speed (input)
2519 (let ((xstream (car (zstream-input-stack input
))))
2521 (set-to-full-speed xstream
))))
2523 (defun p/ext-subset
(input)
2524 (cond ((eq (peek-token input
) :xml-decl
)
2525 (let ((hd (parse-text-decl (cdr (nth-value 1 (peek-token input
))))))
2526 (setup-encoding input hd
))
2527 (consume-token input
)))
2528 (set-full-speed input
)
2529 (p/ext-subset-decl input
)
2530 (unless (eq (peek-token input
) :eof
)
2531 (wf-error input
"Trailing garbage - ~S." (peek-token input
))))
2533 (defvar *catalog
* nil
)
2535 (defun extid-using-catalog (extid)
2538 (resolve-extid (extid-public extid
)
2539 (extid-system extid
)
2542 (make-extid nil sysid
)
2546 (defun p/doctype-decl
(input &optional dtd-extid
)
2548 (let ((*expand-pe-p
* nil
)
2550 (expect input
:|
<!DOCTYPE|
)
2552 (setq name
(p/nmtoken input
))
2554 (setf (model-stack *ctx
*) (list (make-root-model name
))))
2555 (when (eq (peek-token input
) :S
)
2557 (unless (or (eq (peek-token input
) :\
[ )
2558 (eq (peek-token input
) :\
> ))
2559 (setf extid
(p/external-id input t
))))
2561 (setf extid dtd-extid
))
2563 (sax:start-dtd
(handler *ctx
*)
2565 (and extid
(extid-public extid
))
2566 (and extid
(uri-rod (extid-system extid
))))
2567 (when (eq (peek-token input
) :\
[ )
2568 (when (disallow-internal-subset *ctx
*)
2569 (wf-error input
"document includes an internal subset"))
2571 (consume-token input
)
2572 (sax:start-internal-subset
(handler *ctx
*))
2573 (while (progn (p/S? input
)
2574 (not (eq (peek-token input
) :\
] )))
2575 (if (eq (peek-token input
) :PE-REFERENCE
)
2576 (let ((name (nth-value 1 (read-token input
))))
2577 (recurse-on-entity input name
:parameter
2579 (etypecase (checked-get-entdef name
:parameter
)
2581 (p/ext-subset input
))
2583 (p/ext-subset-decl input
)))
2584 (unless (eq :eof
(peek-token input
))
2585 (wf-error input
"Trailing garbage.")))))
2586 (let ((*expand-pe-p
* t
))
2587 (p/markup-decl input
))))
2588 (consume-token input
)
2589 (sax:end-internal-subset
(handler *ctx
*))
2593 (let* ((effective-extid
2594 (extid-using-catalog (absolute-extid input extid
)))
2595 (sysid (extid-system effective-extid
))
2596 (fresh-dtd-p (null (dtd *ctx
*)))
2599 (not (standalone-p *ctx
*))
2600 (getdtd sysid
*dtd-cache
*))))
2603 (setf (dtd *ctx
*) cached-dtd
)
2604 (report-cached-dtd cached-dtd
))
2606 (let ((xi2 (xstream-open-extid effective-extid
)))
2607 (with-zstream (zi2 :input-stack
(list xi2
))
2610 (when (and fresh-dtd-p
2613 (not (standalone-p *ctx
*)))
2614 (setf (getdtd sysid
*dtd-cache
*) (dtd *ctx
*)))))))))
2615 (sax:end-dtd
(handler *ctx
*))
2616 (let ((dtd (dtd *ctx
*)))
2617 (sax:entity-resolver
2619 (lambda (name handler
) (resolve-entity name handler dtd
)))
2620 (sax::dtd
(handler *ctx
*) dtd
))
2621 (list :DOCTYPE name extid
))))
2623 (defun report-cached-dtd (dtd)
2624 (maphash (lambda (k v
)
2625 (report-entity (handler *ctx
*) :general k
(cdr v
)))
2626 (dtd-gentities dtd
))
2627 (maphash (lambda (k v
)
2628 (report-entity (handler *ctx
*) :parameter k
(cdr v
)))
2629 (dtd-pentities dtd
))
2630 (maphash (lambda (k v
)
2631 (sax:notation-declaration
2634 (if (extid-public v
)
2635 (normalize-public-id (extid-public v
))
2637 (uri-rod (extid-system v
))))
2638 (dtd-notations dtd
)))
2640 (defun p/misc
*-
2 (input)
2642 (while (member (peek-token input
) '(:COMMENT
:PI
:S
))
2643 (case (peek-token input
)
2645 (sax:comment
(handler *ctx
*) (nth-value 1 (peek-token input
))))
2647 (sax:processing-instruction
2649 (car (nth-value 1 (peek-token input
)))
2650 (cdr (nth-value 1 (peek-token input
))))))
2651 (consume-token input
)))
2655 &key validate dtd root entity-resolver disallow-internal-subset
2657 ;; check types of user-supplied arguments for better error messages:
2658 (check-type validate boolean
)
2659 (check-type recode boolean
)
2660 (check-type dtd
(or null extid
))
2661 (check-type root
(or null rod
))
2662 (check-type entity-resolver
(or null function symbol
))
2663 (check-type disallow-internal-subset boolean
)
2666 (setf handler
(make-recoder handler
#'rod-to-utf8-string
)))
2667 (let* ((xstream (car (zstream-input-stack input
)))
2668 (name (xstream-name xstream
))
2669 (base (when name
(stream-name-uri name
)))
2671 (make-context :handler handler
2673 :entity-resolver entity-resolver
2674 :base-stack
(list (or base
""))
2675 :disallow-internal-subset disallow-internal-subset
))
2676 (*validate
* validate
)
2677 (*namespace-bindings
* *initial-namespace-bindings
*))
2678 (sax:register-sax-parser handler
(make-instance 'cxml-parser
:ctx
*ctx
*))
2679 (sax:start-document handler
)
2680 ;; document ::= XMLDecl? Misc* (doctypedecl Misc*)? element Misc*
2681 ;; Misc ::= Comment | PI | S
2682 ;; xmldecl::='<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
2683 ;; sddecl::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
2684 (let ((*data-behaviour
* :DTD
))
2685 ;; optional XMLDecl?
2689 ;; (doctypedecl Misc*)?
2691 ((eq (peek-token input
) :<!DOCTYPE
)
2692 (p/doctype-decl input dtd
)
2695 (synthesize-doctype dtd input
))
2696 ((and validate
(not dtd
))
2697 (validity-error "invalid document: no doctype")))
2699 ;; Override expected root element if asked to
2701 (setf (model-stack *ctx
*) (list (make-root-model root
))))
2703 (let ((*data-behaviour
* :DOC
))
2709 (sax:end-document handler
))))
2711 (defun synthesize-doctype (dtd input
)
2712 (let ((dummy (string->xstream
"<!DOCTYPE dummy>")))
2713 (setf (xstream-name dummy
)
2715 :entity-name
"dummy doctype"
2717 :uri
(zstream-base-sysid input
)))
2718 (with-zstream (zstream :input-stack
(list dummy
))
2719 (p/doctype-decl zstream dtd
))))
2721 (defun fix-seen-< (input)
2722 (when (eq (peek-token input
) :seen-
<)
2723 (multiple-value-bind (c s
)
2724 (read-token-after-|
<| input
(car (zstream-input-stack input
)))
2725 (setf (zstream-token-category input
) c
2726 (zstream-token-semantic input
) s
))))
2728 (defun p/xmldecl
(input)
2729 ;; we will use the attribute-value parser for the xml decl.
2731 (when (eq (peek-token input
) :xml-decl
)
2732 (let ((hd (parse-xml-decl (cdr (nth-value 1 (peek-token input
))))))
2733 (setf (standalone-p *ctx
*) (eq (xml-header-standalone-p hd
) :yes
))
2734 (setup-encoding input hd
)
2737 (set-full-speed input
)))
2739 (defun p/eof
(input)
2740 (unless (eq (peek-token input
) :eof
)
2741 (wf-error input
"Garbage at end of document."))
2743 (maphash (lambda (k v
)
2745 (validity-error "(11) IDREF: ~S not defined" (rod-string k
))))
2748 (dolist (name (referenced-notations *ctx
*))
2749 (unless (find-notation name
(dtd *ctx
*))
2750 (validity-error "(23) Notation Declared: ~S" (rod-string name
))))))
2752 (defun p/element
(input)
2753 (multiple-value-bind (cat n-b new-b uri lname qname attrs
) (p/sztag input
)
2754 (sax:start-element
(handler *ctx
*) uri lname qname attrs
)
2755 (when (eq cat
:stag
)
2756 (let ((*namespace-bindings
* n-b
))
2758 (p/etag input qname
))
2759 (sax:end-element
(handler *ctx
*) uri lname qname
)
2760 (undeclare-namespaces new-b
)
2761 (pop (base-stack *ctx
*))
2762 (validate-end-element *ctx
* qname
)))
2764 (defun p/sztag
(input)
2765 (multiple-value-bind (cat sem
) (read-token input
)
2769 (t (wf-error input
"element expected")))
2770 (destructuring-bind (&optional name
&rest raw-attrs
) sem
2771 (validate-start-element *ctx
* name
)
2773 (process-attributes *ctx
* name
(build-attribute-list raw-attrs
)))
2774 (*namespace-bindings
* *namespace-bindings
*)
2776 (when sax
:*namespace-processing
*
2777 (setf new-namespaces
(declare-namespaces attrs
))
2778 (mapc #'set-attribute-namespace attrs
))
2779 (push (compute-base attrs
) (base-stack *ctx
*))
2780 (multiple-value-bind (uri prefix local-name
)
2781 (if sax
:*namespace-processing
*
2783 (values nil nil nil
))
2784 (declare (ignore prefix
))
2785 (check-attribute-uniqueness attrs
)
2786 (unless (or sax
:*include-xmlns-attributes
*
2787 (null sax
:*namespace-processing
*))
2789 (remove-if (compose #'xmlns-attr-p
#'sax
:attribute-qname
)
2792 *namespace-bindings
*
2794 uri local-name name attrs
))))))
2796 (defun p/etag
(input qname
)
2797 (multiple-value-bind (cat2 sem2
) (read-token input
)
2798 (unless (and (eq cat2
:etag
)
2799 (eq (car sem2
) qname
))
2800 (wf-error input
"Bad nesting. ~S / ~S"
2802 (mu (cons cat2 sem2
))))
2804 (wf-error input
"no attributes allowed in end tag"))))
2806 ;; copy&paste from cxml-rng
2807 (defun escape-uri (string)
2808 (with-output-to-string (out)
2809 (loop for c across
(cxml::rod-to-utf8-string string
) do
2810 (let ((code (char-code c
)))
2811 ;; http://www.w3.org/TR/xlink/#link-locators
2812 (if (or (>= code
127) (<= code
32) (find c
"<>\"{}|\\^`"))
2813 (format out
"%~2,'0X" code
)
2814 (write-char c out
))))))
2816 (defun compute-base (attrs)
2817 (let ((new (sax:find-attribute
#"xml:base" attrs
))
2818 (current (car (base-stack *ctx
*))))
2820 (puri:merge-uris
(escape-uri (sax:attribute-value new
)) current
)
2823 (defun process-characters (input sem
)
2824 (consume-token input
)
2825 (when (search #"]]>" sem
)
2826 (wf-error input
"']]>' not allowed in CharData"))
2827 (validate-characters *ctx
* sem
))
2829 (defun process-cdata-section (input)
2830 (consume-token input
)
2831 (let ((input (car (zstream-input-stack input
))))
2832 (unless (and (rune= #/C
(read-rune input
))
2833 (rune= #/D
(read-rune input
))
2834 (rune= #/A
(read-rune input
))
2835 (rune= #/T
(read-rune input
))
2836 (rune= #/A
(read-rune input
))
2837 (rune= #/\
[ (read-rune input
)))
2838 (wf-error input
"After '<![', 'CDATA[' is expected."))
2839 (validate-characters *ctx
* #"hack") ;anything other than whitespace
2840 (read-cdata-sect input
)))
2842 (defun p/content
(input)
2843 ;; [43] content ::= (element | CharData | Reference | CDSect | PI | Comment)*
2845 (multiple-value-bind (cat sem
) (peek-token input
)
2850 (process-characters input sem
)
2851 (sax:characters
(handler *ctx
*) sem
))
2854 (consume-token input
)
2855 (recurse-on-entity input name
:general
2858 (etypecase (checked-get-entdef name
:general
)
2859 (internal-entdef (p/content input
))
2860 (external-entdef (p/ext-parsed-ent input
)))
2861 (unless (eq (peek-token input
) :eof
)
2862 (wf-error input
"Trailing garbage. - ~S"
2863 (peek-token input
))))))))
2865 (let ((data (process-cdata-section input
)))
2866 (sax:start-cdata
(handler *ctx
*))
2867 (sax:characters
(handler *ctx
*) data
)
2868 (sax:end-cdata
(handler *ctx
*))))
2870 (consume-token input
)
2871 (sax:processing-instruction
(handler *ctx
*) (car sem
) (cdr sem
)))
2873 (consume-token input
)
2874 (sax:comment
(handler *ctx
*) sem
))
2878 ;; [78] extParsedEnt ::= TextDecl? contentw
2879 ;; [79] extPE ::= TextDecl? extSubsetDecl
2881 (defstruct xml-header
2886 (defun p/ext-parsed-ent
(input)
2887 ;; [78] extParsedEnt ::= '<?xml' VersionInfo? EncodingDecl S? '?>' content
2888 (when (eq (peek-token input
) :xml-decl
)
2889 (let ((hd (parse-text-decl (cdr (nth-value 1 (peek-token input
))))))
2890 (setup-encoding input hd
))
2891 (consume-token input
))
2892 (set-full-speed input
)
2895 (defun parse-xml-decl (content)
2896 (let* ((res (make-xml-header))
2897 (i (make-rod-xstream content
)))
2898 (with-zstream (z :input-stack
(list i
))
2899 (let ((atts (read-attribute-list z i t
)))
2900 (unless (eq (peek-rune i
) :eof
)
2901 (wf-error i
"Garbage at end of XMLDecl."))
2902 ;; versioninfo muss da sein
2903 ;; dann ? encodingdecl
2906 (unless (eq (caar atts
) (intern-name '#.
(string-rod "version")))
2907 (wf-error i
"XMLDecl needs version."))
2908 (unless (and (>= (length (cdar atts
)) 1)
2910 (or (rune<= #/a x
#/z
)
2918 (wf-error i
"Bad XML version number: ~S." (rod-string (cdar atts
))))
2919 (setf (xml-header-version res
) (rod-string (cdar atts
)))
2921 (when (eq (caar atts
) (intern-name '#.
(string-rod "encoding")))
2922 (unless (and (>= (length (cdar atts
)) 1)
2924 (or (rune<= #/a x
#/z
)
2932 (or (rune<= #/a x
#/z
)
2933 (rune<= #/A x
#/Z
)))
2934 (aref (cdar atts
) 0)))
2935 (wf-error i
"Bad XML encoding name: ~S." (rod-string (cdar atts
))))
2936 (setf (xml-header-encoding res
) (rod-string (cdar atts
)))
2938 (when (eq (caar atts
) (intern-name '#.
(string-rod "standalone")))
2939 (unless (or (rod= (cdar atts
) '#.
(string-rod "yes"))
2940 (rod= (cdar atts
) '#.
(string-rod "no")))
2941 (wf-error i
"XMLDecl's 'standalone' attribute must be exactly \"yes\" or \"no\" and not ~S."
2942 (rod-string (cdar atts
))))
2943 (setf (xml-header-standalone-p res
)
2944 (if (rod-equal '#.
(string-rod "yes") (cdar atts
))
2949 (wf-error i
"Garbage in XMLDecl: ~A" (rod-string content
)))
2952 (defun parse-text-decl (content)
2953 (let* ((res (make-xml-header))
2954 (i (make-rod-xstream content
)))
2955 (with-zstream (z :input-stack
(list i
))
2956 (let ((atts (read-attribute-list z i t
)))
2957 (unless (eq (peek-rune i
) :eof
)
2958 (wf-error i
"Garbage at end of TextDecl"))
2959 ;; versioninfo optional
2960 ;; encodingdecl muss da sein
2962 (when (eq (caar atts
) (intern-name '#.
(string-rod "version")))
2963 (unless (and (>= (length (cdar atts
)) 1)
2965 (or (rune<= #/a x
#/z
)
2973 (wf-error i
"Bad XML version number: ~S." (rod-string (cdar atts
))))
2974 (setf (xml-header-version res
) (rod-string (cdar atts
)))
2976 (unless (eq (caar atts
) (intern-name '#.
(string-rod "encoding")))
2977 (wf-error i
"TextDecl needs encoding."))
2978 (unless (and (>= (length (cdar atts
)) 1)
2980 (or (rune<= #/a x
#/z
)
2988 (or (rune<= #/a x
#/z
)
2990 (rune<= #/0 x
#/9)))
2991 (aref (cdar atts
) 0)))
2992 (wf-error i
"Bad XML encoding name: ~S." (rod-string (cdar atts
))))
2993 (setf (xml-header-encoding res
) (rod-string (cdar atts
)))
2996 (wf-error i
"Garbage in TextDecl: ~A" (rod-string content
)))))
2999 ;;;; ---------------------------------------------------------------------------
3004 (cond ((stringp x
) x
)
3005 ((vectorp x
) (rod-string x
))
3007 (cons (mu (car x
)) (mu (cdr x
))))
3010 ;;;; ---------------------------------------------------------------------------
3011 ;;;; User interface ;;;;
3013 #-cxml-system
::uri-is-namestring
3014 (defun specific-or (component &optional
(alternative nil
))
3015 (if (eq component
:unspecific
)
3019 (defun string-or (str &optional
(alternative nil
))
3020 (if (zerop (length str
))
3024 #-cxml-system
::uri-is-namestring
3025 (defun make-uri (&rest initargs
&key path query
&allow-other-keys
)
3026 (apply #'make-instance
3028 :path
(and path
(escape-path path
))
3029 :query
(and query
(escape-query query
))
3032 #-cxml-system
::uri-is-namestring
3033 (defun escape-path (list)
3034 (puri::render-parsed-path list t
))
3036 #-cxml-system
::uri-is-namestring
3037 (defun escape-query (pairs)
3038 (flet ((escape (str)
3039 (puri::encode-escaped-encoding str puri
::*reserved-characters
* t
)))
3041 (with-output-to-string (s)
3042 (dolist (pair pairs
)
3046 (write-string (escape (car pair
)) s
)
3048 (write-string (escape (cdr pair
)) s
))))))
3050 #-cxml-system
::uri-is-namestring
3051 (defun uri-parsed-query (uri)
3052 (flet ((unescape (str)
3053 (puri::decode-escaped-encoding str t puri
::*reserved-characters
*)))
3054 (let ((str (puri:uri-query uri
)))
3058 (dolist (s (split-sequence-if (lambda (x) (eql x
#\
&)) str
))
3059 (destructuring-bind (name value
)
3060 (split-sequence-if (lambda (x) (eql x
#\
=)) s
)
3061 (push (cons (unescape name
) (unescape value
)) pairs
)))
3066 #-cxml-system
::uri-is-namestring
3067 (defun query-value (name alist
)
3068 (cdr (assoc name alist
:test
#'equal
)))
3070 #-cxml-system
::uri-is-namestring
3071 (defun pathname-to-uri (pathname)
3073 ;; FIXME: should we really leave ".." in base URIs?
3074 (append (mapcar (lambda (x)
3075 (cond ((member x
'(:up
:back
)) "..")
3077 (pathname-directory pathname
))
3079 (if (specific-or (pathname-type pathname
))
3080 (concatenate 'string
3081 (pathname-name pathname
)
3083 (pathname-type pathname
))
3084 (pathname-name pathname
))))))
3085 (if (eq (car path
) :relative
)
3086 (make-uri :path path
)
3087 (make-uri :scheme
:file
3088 :host
(concatenate 'string
3089 (string-or (host-namestring pathname
))
3091 (specific-or (pathname-device pathname
)))
3094 #+cxml-system
::uri-is-namestring
3095 (defun pathname-to-uri (pathname)
3096 (puri:parse-uri
(namestring pathname
)))
3098 #-cxml-system
::uri-is-namestring
3099 (defun parse-name.type
(str)
3101 (let ((i (position #\. str
:from-end t
)))
3103 (values (subseq str
0 i
) (subseq str
(1+ i
)))
3107 #-cxml-system
::uri-is-namestring
3108 (defun uri-to-pathname (uri)
3109 (let ((scheme (puri:uri-scheme uri
))
3110 (path (loop for e in
(puri:uri-parsed-path uri
)
3111 collect
(if (stringp e
)
3112 (puri::decode-escaped-encoding e t nil
)
3114 (unless (member scheme
'(nil :file
))
3115 (error 'xml-parse-error
3116 :format-control
"URI scheme ~S not supported"
3117 :format-arguments
(list scheme
)))
3118 (if (eq (car path
) :relative
)
3119 (multiple-value-bind (name type
)
3120 (parse-name.type
(car (last path
)))
3121 (make-pathname :directory
(butlast path
)
3124 (multiple-value-bind (name type
)
3125 (parse-name.type
(car (last (cdr path
))))
3126 (destructuring-bind (host device
)
3127 (split-sequence-if (lambda (x) (eql x
#\
+))
3128 (or (puri:uri-host uri
) "+"))
3129 (make-pathname :host
(string-or host
)
3130 :device
(string-or device
)
3131 :directory
(cons :absolute
(butlast (cdr path
)))
3134 #+cxml-system
::uri-is-namestring
3135 (defun uri-to-pathname (uri)
3136 (let ((pathname (puri:render-uri uri nil
)))
3137 (when (equalp (pathname-host pathname
) "+")
3138 (setf (slot-value pathname
'lisp
::host
) "localhost"))
3142 (input handler
&rest args
3143 &key validate dtd root entity-resolver disallow-internal-subset
3145 "@arg[input]{A string, pathname, octet vector, or stream.}
3146 @arg[handler]{A @class{SAX handler}}
3147 @arg[validate]{Boolean. Defaults to @code{nil}. If true, parse in
3148 validating mode, i.e. assert that the document contains a DOCTYPE
3149 declaration and conforms to the DTD declared.}
3150 @arg[dtd]{unless @code{nil}, an extid instance specifying the external
3151 subset to load. This options overrides the extid specified in the
3152 document type declaration, if any. See below for @fun{make-extid}.
3153 This option is useful for verification purposes together with the
3154 @var{root} and @var{disallow-internal-subset} arguments.}
3155 @arg[root]{The expected root element name, or @code{nil} (the default).
3156 If specified, this argument overrides the name stated in the input's
3158 @arg[entity-resolver]{@code{nil} or a function of two arguments which
3159 is invoked for every entity referenced by the document with the
3160 entity's Public ID (a rod) and System ID (an URI object) as arguments.
3161 The function may either return nil, CXML will then try to resolve the
3162 entity as usual. Alternatively it may return a Common Lisp stream
3163 specialized on @code{(unsigned-byte 8)} which will be used instead.
3164 (It may also signal an error, of course, which can be useful to prohibit
3165 parsed XML documents from including arbitrary files readable by
3167 @arg[disallow-internal-subset]{Boolean. If true, signal
3168 an error if the document contains an internal subset.}
3169 @arg[recode]{Boolean. (Ignored on Lisps with Unicode
3170 support.) Recode rods to UTF-8 strings. Defaults to true.
3171 Make sure to use @fun{utf8-dom:make-dom-builder} if this
3172 option is enabled and @fun{rune-dom:make-dom-builder}
3174 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3176 Parse an XML document from @var{input}, which can be a string, pathname,
3177 octet vector, or stream.
3179 Return values from this function depend on the SAX handler used.
3180 This is an old-style convenience wrapper around the new-style interface
3183 Parse an XML document from @var{filename}, and signal SAX events to
3184 @var{handler} while doing so.
3186 All SAX parsing functions share the same keyword arguments. Refer to
3187 @fun{parse} for details on keyword arguments."
3188 (declare (ignore validate dtd root entity-resolver disallow-internal-subset
3192 for
(name value
) on args by
#'cddr
3193 unless
(eq name
:pathname
)
3194 append
(list name value
))))
3196 (xstream (apply #'parse-xstream input handler args
))
3197 (pathname (apply #'parse-file input handler args
))
3198 (rod (apply #'parse-rod input handler args
))
3199 (array (apply #'parse-octets input handler args
))
3201 (let ((xstream (make-xstream input
:speed
8192)))
3202 (setf (xstream-name xstream
)
3204 :entity-name
"main document"
3207 (pathname-to-uri (merge-pathnames pathname
))
3208 (safe-stream-sysid input
))))
3209 (apply #'parse-xstream xstream handler args
))))))
3211 (defun parse-xstream (xstream handler
&rest args
)
3214 (with-zstream (zstream :input-stack
(list xstream
))
3216 (with-scratch-pads ()
3217 (apply #'p
/document zstream handler args
)))
3218 (runes-encoding:encoding-error
(c)
3219 (wf-error xstream
"~A" c
)))))
3221 (defun parse-file (filename handler
&rest args
)
3222 "@arg[filename]{An pathname designator.}
3223 @arg[handler]{A @class{SAX handler}}
3224 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3226 This is an old-style convenience wrapper around the new-style interface
3229 Parse an XML document from @var{filename}, and signal SAX events to
3230 @var{handler} while doing so.
3232 All SAX parsing functions share the same keyword arguments. Refer to
3233 @fun{parse} for details on keyword arguments."
3234 (with-open-xfile (input filename
)
3235 (setf (xstream-name input
)
3237 :entity-name
"main document"
3239 :uri
(pathname-to-uri (merge-pathnames filename
))))
3240 (apply #'parse-xstream input handler args
)))
3242 (defun resolve-synonym-stream (stream)
3243 (while (typep stream
'synonym-stream
)
3244 (setf stream
(symbol-value (synonym-stream-symbol stream
))))
3247 (defun safe-stream-sysid (stream)
3248 (if (and (typep (resolve-synonym-stream stream
) 'file-stream
)
3249 ;; ignore-errors, because sb-bsd-sockets creates instances of
3250 ;; FILE-STREAMs that aren't
3251 (ignore-errors (pathname stream
)))
3252 (pathname-to-uri (merge-pathnames (pathname stream
)))
3255 (deftype |SAX HANDLER|
()
3256 'sax
:abstract-handler
3257 "Historically, any object has been usable as a SAX handler with CXML,
3258 as long as it implemented all SAX events, i.e. had methods
3259 for the generic functions defined in the SAX package.
3261 While this approach still works, it is now recommended that SAX handlers
3262 should be implemented by subclassing @class{abstract-handler} or one
3263 of its subclasses. Useful subclasses are @class{content-handler}
3264 and @class{default-handler}.
3266 (In addition, the value @code{nil} is valid SAX handler, which discards
3267 all events it receives.)
3269 As a rule of thumb, write a subclass of @class{default-handler} if
3270 you want to handle only a few special SAX events and ignore the rest,
3271 because this class has no-op default methods for all events.
3273 If, however, you want to make certain that your class implements all
3274 important SAX events explicitly, a good choice is @class{content-handler},
3275 which has no-op default methods only for less important, DTD-related
3276 events, and requires subclasses to implement all events related to the
3279 In some cases, it might be helpful to implement @class{abstract-handler}
3280 directly, which has no default event methods at all.")
3282 (defun parse-stream (stream handler
&rest args
)
3283 "@arg[stream]{An (unsigned-byte 8) stream}
3284 @arg[handler]{A @class{SAX handler}}
3285 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3287 This is an old-style convenience wrapper around the new-style interface
3290 Parse an XML document from @var{stream}, and signal SAX events to
3291 @var{handler} while doing so.
3293 All SAX parsing functions share the same keyword arguments. Refer to
3294 @fun{parse} for details on keyword arguments."
3298 :name
(make-stream-name
3299 :entity-name
"main document"
3301 :uri
(safe-stream-sysid stream
))
3303 (apply #'parse-xstream xstream handler args
)))
3305 (defun parse-empty-document
3306 (uri qname handler
&key public-id system-id entity-resolver
(recode t
))
3307 "@arg[uri]{a string or nil}
3308 @arg[qname]{a string or nil}
3309 @arg[handler]{a @class{SAX handler}}
3310 @arg[public-id]{a string or nil}
3311 @arg[system-id]{a @type{puri:uri} or nil}
3312 @arg[entity-resolver]{@code{nil} or a function of two arguments which
3313 is invoked for every entity referenced by the document with the
3314 entity's Public ID (a rod) and System ID (an URI object) as arguments.
3315 The function may either return nil, CXML will then try to resolve the
3316 entity as usual. Alternatively it may return a Common Lisp stream
3317 specialized on @code{(unsigned-byte 8)} which will be used instead.
3318 (It may also signal an error, of course, which can be useful to prohibit
3319 parsed XML documents from including arbitrary files readable by
3321 @arg[recode]{Boolean. (Ignored on Lisps with Unicode
3322 support.) Recode rods to UTF-8 strings. Defaults to true.
3323 Make sure to use @fun{utf8-dom:make-dom-builder} if this
3324 option is enabled and @fun{rune-dom:make-dom-builder}
3326 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3328 Simulate parsing of a document with a document element @var{qname}
3329 having no attributes except for an optional namespace
3330 declaration to @var{uri}. If an external ID is specified
3331 (@var{system-id}, @var{public-id}), find, parse, and report
3332 this DTD as if with @fun{parse-file}, using the specified
3334 (check-type uri
(or null rod
))
3335 (check-type qname
(or null rod
))
3336 (check-type public-id
(or null rod
))
3337 (check-type system-id
(or null puri
:uri
))
3338 (check-type entity-resolver
(or null function symbol
))
3339 (check-type recode boolean
)
3342 (setf handler
(make-recoder handler
#'rod-to-utf8-string
)))
3344 (make-context :handler handler
:entity-resolver entity-resolver
))
3347 (when (or public-id system-id
)
3348 (extid-using-catalog (make-extid public-id system-id
)))))
3349 (sax:start-document handler
)
3351 (sax:start-dtd handler
3354 (and system-id
(uri-rod system-id
)))
3355 (setf (dtd *ctx
*) (getdtd (extid-system extid
) *dtd-cache
*))
3357 (with-scratch-pads ()
3358 (let ((*data-behaviour
* :DTD
))
3359 (let ((xi2 (xstream-open-extid extid
)))
3360 (with-zstream (zi2 :input-stack
(list xi2
))
3362 (p/ext-subset zi2
))))))
3363 (sax:end-dtd handler
)
3364 (let ((dtd (dtd *ctx
*)))
3365 (sax:entity-resolver handler
(lambda (n h
) (resolve-entity n h dtd
)))
3366 (sax::dtd handler dtd
)))
3368 (when (or uri qname
)
3371 (list (sax:make-attribute
:qname
#"xmlns"
3374 (*namespace-bindings
* *namespace-bindings
*)
3376 (when sax
:*namespace-processing
*
3377 (setf new-namespaces
(declare-namespaces attrs
))
3378 (mapc #'set-attribute-namespace attrs
))
3379 (multiple-value-bind (uri prefix local-name
)
3380 (if sax
:*namespace-processing
* (decode-qname qname
) nil
)
3381 (declare (ignore prefix
))
3382 (unless (or sax
:*include-xmlns-attributes
*
3383 (null sax
:*namespace-processing
*))
3385 (sax:start-element
(handler *ctx
*) uri local-name qname attrs
)
3386 (sax:end-element
(handler *ctx
*) uri local-name qname
))
3387 (undeclare-namespaces new-namespaces
)))
3388 (sax:end-document handler
)))
3390 (defun parse-dtd-file (filename &optional handler
)
3391 "@arg[filename]{An pathname designator.}
3392 @arg[handler]{A @class{SAX handler}}
3393 @return{A @class{dtd} instance.}
3395 Parse @a[http://www.w3.org/TR/2000/REC-xml-20001006#NT-extSubset]{declarations}
3396 from @var{filename} and return an object representing the DTD,
3397 suitable as an argument to @code{validate} with @fun{parse}."
3398 (with-open-file (s filename
:element-type
'(unsigned-byte 8))
3399 (parse-dtd-stream s handler
)))
3401 (defun parse-dtd-stream (stream &optional handler
)
3402 "@arg[stream]{An (unsigned-byte 8) stream.}
3403 @arg[handler]{A @class{SAX handler}}
3404 @return{A @class{dtd} instance.}
3406 Parse @a[http://www.w3.org/TR/2000/REC-xml-20001006#NT-extSubset]{declarations}
3407 from @var{stream} and return an object representing the DTD,
3408 suitable as an argument to @code{validate} with @fun{parse}."
3409 (let ((input (make-xstream stream
)))
3410 (setf (xstream-name input
)
3414 :uri
(safe-stream-sysid stream
)))
3415 (let ((*ctx
* (make-context :handler handler
))
3417 (*data-behaviour
* :DTD
))
3418 (with-zstream (zstream :input-stack
(list input
))
3419 (with-scratch-pads ()
3422 (p/ext-subset zstream
)
3425 (defun parse-rod (string handler
&rest args
)
3426 "@arg[string]{An string of unicode characters.}
3427 @arg[handler]{A @class{SAX handler}}
3428 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3430 This is an old-style convenience wrapper around the new-style interface
3433 Parse an XML document from @var{string}, and signal SAX events to
3434 @var{handler} while doing so.
3436 Note: This function assumes that @var{string} has already been decoded into
3437 Unicode runes and ignores the encoding specified in the XML declaration,
3440 All SAX parsing functions share the same keyword arguments. Refer to
3441 @fun{parse} for details on keyword arguments."
3442 (let ((xstream (string->xstream string
)))
3443 (setf (xstream-name xstream
)
3445 :entity-name
"main document"
3448 (apply #'parse-xstream xstream handler args
)))
3450 (defun string->xstream
(string)
3451 (make-rod-xstream (string-rod string
)))
3453 (defun parse-octets (octets handler
&rest args
)
3454 "@arg[octets]{An (unsigned-byte 8) vector.}
3455 @arg[handler]{A @class{SAX handler}}
3456 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3458 This is an old-style convenience wrapper around the new-style interface
3461 Parse an XML document from @var{octets}, and signal SAX events to
3462 @var{handler} while doing so.
3464 All SAX parsing functions share the same keyword arguments. Refer to
3465 @fun{parse} for details on keyword arguments."
3466 (apply #'parse-stream
(make-octet-input-stream octets
) handler args
))
3470 (defun zstream-push (new-xstream zstream
)
3471 (cond ((find-if (lambda (x)
3473 (eql (stream-name-entity-name (xstream-name x
))
3474 (stream-name-entity-name (xstream-name new-xstream
)))
3475 (eql (stream-name-entity-kind (xstream-name x
))
3476 (stream-name-entity-kind (xstream-name new-xstream
)))))
3477 (zstream-input-stack zstream
))
3478 (wf-error zstream
"Infinite recursion.")))
3479 (push new-xstream
(zstream-input-stack zstream
))
3482 (defun recurse-on-entity (zstream name kind continuation
&optional internalp
)
3483 (assert (not (zstream-token-category zstream
)))
3484 (call-with-entity-expansion-as-stream
3486 (lambda (new-xstream)
3487 (push :stop
(zstream-input-stack zstream
))
3488 (zstream-push new-xstream zstream
)
3490 (funcall continuation zstream
)
3491 (assert (eq (peek-token zstream
) :eof
))
3492 (assert (eq (pop (zstream-input-stack zstream
)) new-xstream
))
3493 (close-xstream new-xstream
)
3494 (assert (eq (pop (zstream-input-stack zstream
)) :stop
))
3495 (setf (zstream-token-category zstream
) nil
)
3496 '(consume-token zstream
)) )
3502 (defmacro read-data-until
* ((predicate input res res-start res-end
) &body body
)
3503 ;; fast variant -- for now disabled for no apparent reason
3504 ;; -> res, res-start, res-end
3505 `(let* ((rptr (xstream-read-ptr ,input
))
3507 (fptr (xstream-fill-ptr ,input
))
3508 (buf (xstream-buffer ,input
))
3509 ,res
,res-start
,res-end
)
3510 (declare (type fixnum rptr fptr p0
)
3511 (type (simple-array read-element
(*)) buf
))
3513 (cond ((%
= rptr fptr
)
3514 ;; underflow -- hmm inject the scratch-pad with what we
3515 ;; read and continue, while using read-rune and collecting
3516 ;; d.h. besser waere hier auch while-reading zu benutzen.
3517 (setf (xstream-read-ptr ,input
) rptr
)
3518 (multiple-value-setq (,res
,res-start
,res-end
)
3519 (with-rune-collector/raw
(collect)
3520 (do ((i p0
(%
+ i
1)))
3522 (collect (%rune buf i
)))
3525 (cond ((%
= rptr fptr
)
3526 (setf (xstream-read-ptr ,input
) rptr
)
3527 (setf c
(peek-rune input
))
3530 (setf rptr
(xstream-read-ptr ,input
)
3531 fptr
(xstream-fill-ptr ,input
)
3532 buf
(xstream-buffer ,input
)))
3534 (setf c
(%rune buf rptr
))))
3535 (cond ((,predicate c
)
3537 (setf (xstream-read-ptr ,input
) rptr
)
3542 (setf rptr
(%
+ rptr
1))) )))))
3544 ((,predicate
(%rune buf rptr
))
3546 (setf (xstream-read-ptr ,input
) rptr
)
3547 (setf ,res buf
,res-start p0
,res-end rptr
)
3551 (sf rptr
(%
+ rptr
1))) ))
3555 (defmacro read-data-until
* ((predicate input res res-start res-end
) &body body
)
3556 "Read data from `input' until `predicate' applied to the read char
3557 turns true. Then execute `body' with `res', `res-start', `res-end'
3558 bound to denote a subsequence (of RUNEs) containing the read portion.
3559 The rune upon which `predicate' turned true is neither consumed from
3560 the stream, nor included in `res'.
3562 Keep the predicate short, this it may be included more than once into
3563 the macro's expansion."
3565 (let ((input-var (gensym))
3568 `(LET ((,input-var
,input
))
3569 (MULTIPLE-VALUE-BIND (,res
,res-start
,res-end
)
3570 (WITH-RUNE-COLLECTOR/RAW
(,collect
)
3572 (LET ((,c
(PEEK-RUNE ,input-var
)))
3574 ;; xxx error message
3576 ((FUNCALL ,predicate
,c
)
3580 (CONSUME-RUNE ,input-var
))))))
3584 (defun read-name-token (input)
3585 (read-data-until* ((lambda (rune)
3586 (declare (type rune rune
))
3587 (not (name-rune-p rune
)))
3590 (intern-name r rs re
)))
3592 (defun read-cdata (input)
3593 (read-data-until* ((lambda (rune)
3594 (declare (type rune rune
))
3595 (when (and (%rune
< rune
#/U
+0020)
3596 (not (or (%rune
= rune
#/U
+0009)
3597 (%rune
= rune
#/U
+000a
)
3598 (%rune
= rune
#/U
+000d
))))
3599 (wf-error input
"code point invalid: ~A" rune
))
3600 (or (%rune
= rune
#/<) (%rune
= rune
#/&)))
3604 (declare (type (simple-array rune
(*)) source
)
3605 (type ufixnum start
)
3607 (optimize (speed 3) (safety 0)))
3608 (let ((res (make-array (%- end start
) :element-type
'rune
)))
3609 (declare (type (simple-array rune
(*)) res
))
3610 (let ((i (%- end start
)))
3611 (declare (type ufixnum i
))
3614 (setf (%rune res i
) (%rune source
(the ufixnum
(+ i start
))))
3619 ;; used only by read-att-value-2
3620 (defun internal-entity-expansion (name)
3621 (let ((def (get-entity-definition name
:general
(dtd *ctx
*))))
3623 (wf-error nil
"Entity '~A' is not defined." (rod-string name
)))
3624 (unless (typep def
'internal-entdef
)
3625 (wf-error nil
"Entity '~A' is not an internal entity." name
))
3626 (or (entdef-expansion def
)
3627 (setf (entdef-expansion def
) (find-internal-entity-expansion name
)))))
3629 ;; used only by read-att-value-2
3630 (defun find-internal-entity-expansion (name)
3631 (with-zstream (zinput)
3632 (with-rune-collector-3 (collect)
3633 (labels ((muffle (input)
3636 (setf c
(read-rune input
))
3640 (setf c
(peek-rune input
))
3644 (let ((c (read-character-reference input
)))
3645 (%put-unicode-char c collect
)))
3647 (unless (name-start-rune-p c
)
3648 (wf-error zinput
"Expecting name after &."))
3649 (let ((name (read-name-token input
)))
3650 (setf c
(read-rune input
))
3651 (check-rune input c
#/\
;)
3653 zinput name
:general
3655 (muffle (car (zstream-input-stack zinput
)))))))))
3657 (wf-error zinput
"unexpected #\/<"))
3660 ((not (data-rune-p c
))
3661 (wf-error zinput
"illegal char: ~S." c
))
3664 (declare (dynamic-extent #'muffle
))
3666 zinput name
:general
3668 (muffle (car (zstream-input-stack zinput
)))))))))
3671 (defun resolve-entity (name handler dtd
)
3672 (let ((*validate
* nil
))
3673 (if (get-entity-definition name
:general dtd
)
3674 (let* ((*ctx
* (make-context :handler handler
:dtd dtd
))
3675 (*data-behaviour
* :DOC
))
3676 (with-zstream (input)
3677 (with-scratch-pads ()
3682 (etypecase (checked-get-entdef name
:general
)
3683 (internal-entdef (p/content input
))
3684 (external-entdef (p/ext-parsed-ent input
)))
3685 (unless (eq (peek-token input
) :eof
)
3686 (wf-error input
"Trailing garbage. - ~S"
3687 (peek-token input
)))))))))
3690 (defun read-att-value-2 (input)
3691 (let ((delim (read-rune input
)))
3692 (when (eql delim
:eof
)
3694 (unless (member delim
'(#/\" #/\') :test
#'eql
)
3696 "Bad attribute value delimiter ~S, must be either #\\\" or #\\\'."
3698 (with-rune-collector-4 (collect)
3700 (let ((c (read-rune input
)))
3706 (wf-error input
"'<' not allowed in attribute values"))
3708 (multiple-value-bind (kind sem
) (read-entity-like input
)
3710 (:CHARACTER-REFERENCE
3711 (%put-unicode-char sem collect
))
3713 (let* ((exp (internal-entity-expansion sem
))
3715 (declare (type (simple-array rune
(*)) exp
))
3716 (do ((i 0 (%
+ i
1)))
3718 (collect (%rune exp i
))))))))
3728 ;; We already know that name is part of a valid XML name, so all we
3729 ;; have to check is that the first rune is a name-start-rune and that
3730 ;; there is not colon in it.
3731 (defun nc-name-p (name)
3732 (and (plusp (length name
))
3733 (name-start-rune-p (rune name
0))
3734 (notany #'(lambda (rune) (rune= #/: rune
)) name
)))
3736 (defun split-qname (qname)
3737 (declare (type runes
:simple-rod qname
))
3738 (let ((pos (position #/: qname
)))
3740 (let ((prefix (subseq qname
0 pos
))
3741 (local-name (subseq qname
(1+ pos
))))
3743 (wf-error nil
"empty namespace prefix"))
3744 (if (nc-name-p local-name
)
3745 (values prefix local-name
)
3746 (wf-error nil
"~S is not a valid NcName."
3747 (rod-string local-name
))))
3748 (values () qname
))))
3750 (defun decode-qname (qname)
3751 "decode-qname name => namespace-uri, prefix, local-name"
3752 (declare (type runes
:simple-rod qname
))
3753 (multiple-value-bind (prefix local-name
) (split-qname qname
)
3754 (let ((uri (find-namespace-binding prefix
)))
3756 (values uri prefix local-name
)
3757 (values nil nil qname
)))))
3760 (defun find-namespace-binding (prefix)
3761 (cdr (or (assoc (or prefix
#"") *namespace-bindings
* :test
#'rod
=)
3762 (wf-error nil
"Undeclared namespace prefix: ~A" (rod-string prefix
)))))
3764 ;; FIXME: Should probably be refactored by adding :start and :end to rod=/rod-equal
3765 (defun rod-starts-with (prefix rod
)
3766 (and (<= (length prefix
) (length rod
))
3767 (dotimes (i (length prefix
) t
)
3768 (unless (rune= (rune prefix i
) (rune rod i
))
3771 (defun xmlns-attr-p (attr-name)
3772 (rod-starts-with #.
(string-rod "xmlns") attr-name
))
3774 (defun attrname->prefix
(attrname)
3775 (if (< 5 (length attrname
))
3779 (defun find-namespace-declarations (attributes)
3781 for attribute in attributes
3782 for qname
= (sax:attribute-qname attribute
)
3783 when
(xmlns-attr-p qname
)
3784 collect
(cons (attrname->prefix qname
) (sax:attribute-value attribute
))))
3786 (defun declare-namespaces (attributes)
3787 (let ((ns-decls (find-namespace-declarations attributes
)))
3788 (dolist (ns-decl ns-decls
)
3789 ;; check some namespace validity constraints
3790 (let ((prefix (car ns-decl
))
3791 (uri (cdr ns-decl
)))
3793 ((and (rod= prefix
#"xml")
3794 (not (rod= uri
#"http://www.w3.org/XML/1998/namespace")))
3796 "Attempt to rebind the prefix \"xml\" to ~S." (mu uri
)))
3797 ((and (rod= uri
#"http://www.w3.org/XML/1998/namespace")
3798 (not (rod= prefix
#"xml")))
3801 URI \"http://www.w3.org/XML/1998/namespace\" may not ~
3802 be bound to the prefix ~S, only \"xml\" is legal."
3804 ((and (rod= prefix
#"xmlns")
3805 (rod= uri
#"http://www.w3.org/2000/xmlns/"))
3807 "Attempt to bind the prefix \"xmlns\" to its predefined ~
3808 URI \"http://www.w3.org/2000/xmlns/\", which is ~
3809 forbidden for no good reason."))
3810 ((rod= prefix
#"xmlns")
3812 "Attempt to bind the prefix \"xmlns\" to the URI ~S, ~
3813 but it may not be declared." (mu uri
)))
3814 ((rod= uri
#"http://www.w3.org/2000/xmlns/")
3816 "The namespace URI \"http://www.w3.org/2000/xmlns/\" may ~
3817 not be bound to prefix ~S (or any other)." (mu prefix
)))
3818 ((and (rod= uri
#"") prefix
)
3820 "Only the default namespace (the one without a prefix) ~
3821 may be bound to an empty namespace URI, thus ~
3824 (push (cons prefix
(if (rod= #"" uri
) nil uri
))
3825 *namespace-bindings
*)
3826 (sax:start-prefix-mapping
(handler *ctx
*)
3831 (defun undeclare-namespaces (ns-decls)
3832 (dolist (ns-decl ns-decls
)
3833 (sax:end-prefix-mapping
(handler *ctx
*) (car ns-decl
))))
3835 (defun build-attribute-list (attr-alist)
3836 ;; fixme: if there is a reason this function reverses attribute order,
3837 ;; it should be documented.
3839 (dolist (pair attr-alist
)
3840 (push (sax:make-attribute
:qname
(car pair
)
3846 (defun check-attribute-uniqueness (attributes)
3847 ;; 5.3 Uniqueness of Attributes
3848 ;; In XML documents conforming to [the xmlns] specification, no
3849 ;; tag may contain two attributes which:
3850 ;; 1. have identical names, or
3851 ;; 2. have qualified names with the same local part and with
3852 ;; prefixes which have been bound to namespace names that are
3855 ;; 1. is checked by read-tag-2, so we only deal with 2 here
3856 (loop for
(attr-1 . rest
) on attributes do
3857 (when (and (sax:attribute-namespace-uri attr-1
)
3858 (find-if (lambda (attr-2)
3859 (and (rod= (sax:attribute-namespace-uri attr-1
)
3860 (sax:attribute-namespace-uri attr-2
))
3861 (rod= (sax:attribute-local-name attr-1
)
3862 (sax:attribute-local-name attr-2
))))
3865 "Multiple definitions of attribute ~S in namespace ~S."
3866 (mu (sax:attribute-local-name attr-1
))
3867 (mu (sax:attribute-namespace-uri attr-1
))))))
3869 (defun set-attribute-namespace (attribute)
3870 (let ((qname (sax:attribute-qname attribute
)))
3871 (if (and sax
:*use-xmlns-namespace
* (rod= qname
#"xmlns"))
3872 (setf (sax:attribute-namespace-uri attribute
)
3873 #"http://www.w3.org/2000/xmlns/")
3874 (multiple-value-bind (prefix local-name
) (split-qname qname
)
3875 (when (and prefix
;; default namespace doesn't apply to attributes
3876 (or (not (rod= #"xmlns" prefix
))
3877 sax
:*use-xmlns-namespace
*))
3878 (setf (sax:attribute-namespace-uri attribute
)
3879 (decode-qname qname
)))
3880 (setf (sax:attribute-local-name attribute
) local-name
)))))
3884 ;; System Identifier Protocol
3886 ;; A system identifier is an object obeying to the system identifier
3887 ;; protocol. Often something like an URL or a pathname.
3889 ;; OPEN-SYS-ID sys-id [generic function]
3891 ;; Opens the resource associated with the system identifier `sys-id'
3892 ;; for reading and returns a stream. For now it is expected, that the
3893 ;; stream is an octet stream (one of element type (unsigned-byte 8)).
3895 ;; More precisely: The returned object only has to obey to the xstream
3896 ;; controller protocol. (That is it has to provide implementations for
3897 ;; READ-OCTETS and XSTREAM-CONTROLLER-CLOSE).
3899 ;; MERGE-SYS-ID sys-id base [generic function]
3901 ;; Merges two system identifiers. That is resolve `sys-id' relative to
3902 ;; `base' yielding an absolute system identifier suitable for
3907 ;;; SAX validation handler
3909 (defclass validator
()
3910 ((context :initarg
:context
:accessor context
)
3911 (cdatap :initform nil
:accessor cdatap
)))
3913 (defun make-validator (dtd root
)
3914 "@arg[dtd]{An @class{dtd} instance.}
3915 @arg[root]{Element name, a string.}
3916 @return{A @class{SAX handler}.}
3918 Create a SAX handler which validates against a DTD instance.
3919 The document's root element must be named @code{root}.
3920 Used with @fun{dom:map-document}, this validates a document
3921 object as if by re-reading it with a validating parser, except
3922 that declarations recorded in the document instance are completely
3927 @pre{(let ((d (parse-file \"~/test.xml\" (cxml-dom:make-dom-builder)))
3928 (x (parse-dtd-file \"~/test.dtd\")))
3929 (dom:map-document (cxml:make-validator x #\"foo\") d))}"
3930 (make-instance 'validator
3931 :context
(make-context
3934 :model-stack
(list (make-root-model root
)))))
3936 (macrolet ((with-context ((validator) &body body
)
3937 `(let ((*ctx
* (context ,validator
))
3939 (with-scratch-pads () ;nicht schoen
3941 (defmethod sax:start-element
((handler validator
) uri lname qname attributes
)
3943 (with-context (handler)
3944 (validate-start-element *ctx
* qname
)
3945 (process-attributes *ctx
* qname attributes
)))
3947 (defmethod sax:start-cdata
((handler validator
))
3948 (setf (cdatap handler
) t
))
3950 (defmethod sax:characters
((handler validator
) data
)
3951 (with-context (handler)
3952 (validate-characters *ctx
* (if (cdatap handler
) #"hack" data
))))
3954 (defmethod sax:end-cdata
((handler validator
))
3955 (setf (cdatap handler
) nil
))
3957 (defmethod sax:end-element
((handler validator
) uri lname qname
)
3959 (with-context (handler)
3960 (validate-end-element *ctx
* qname
))))