More encoding tweaks
[xuriella.git] / space.lisp
blob9cebdc5bbefd9d6b4ce810bf61a7a13cf3805644
1 ;;; -*- show-trailing-whitespace: t; indent-tabs-mode: nil -*-
3 ;;; Copyright (c) 2007,2008 David Lichteblau, Ivan Shvedunov.
4 ;;; All rights reserved.
6 ;;; Redistribution and use in source and binary forms, with or without
7 ;;; modification, are permitted provided that the following conditions
8 ;;; are met:
9 ;;;
10 ;;; * Redistributions of source code must retain the above copyright
11 ;;; notice, this list of conditions and the following disclaimer.
12 ;;;
13 ;;; * Redistributions in binary form must reproduce the above
14 ;;; copyright notice, this list of conditions and the following
15 ;;; disclaimer in the documentation and/or other materials
16 ;;; provided with the distribution.
17 ;;;
18 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
19 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
24 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 ;;; This file implements whitespace stripping.
32 ;;;
33 ;;; Although the spec presents a unified algorithm for whitespace stripping
34 ;;; of stylesheets and source documents, we implement them separately.
35 ;;;
36 ;;; For stylesheets, the STP parse tree of the stylesheet is modified
37 ;;; directly according the its xml:space declarations and xsl:text elements.
38 ;;;
39 ;;; For source documents, the strip-space and preserve-space declarations
40 ;;; from the stylesheet are taken into account. To avoid processing
41 ;;; parts of the document that XPath would not otherwise have navigated
42 ;;; to, we do whitespace stripping lazily using a proxy implementation
43 ;;; of the XPath protocol.
45 (in-package :xuriella)
47 #+sbcl
48 (declaim (optimize (debug 2)))
51 ;;;; Helper functions
53 (eval-when (:compile-toplevel :load-toplevel :execute)
54 (defparameter *whitespace*
55 (format nil "~C~C~C~C"
56 (code-char 9)
57 (code-char 32)
58 (code-char 13)
59 (code-char 10))))
61 (defun normalize-whitespace (str)
62 (cl-ppcre:regex-replace-all #.(format nil "[~A]+" *whitespace*)
63 (string-trim *whitespace* str)
64 " "))
66 (defun whitespacep (str)
67 (cl-ppcre:all-matches #.(format nil "^[~A]+$" *whitespace*) str))
69 (defun words (str)
70 (when str
71 (cl-ppcre:split #.(format nil "[~A]+" *whitespace*)
72 (string-trim *whitespace* str))))
75 ;;;; Strip whitespace in stylesheets
77 ;; Also strips comments and PIs.
78 (defun strip-stylesheet (parent &optional preserve)
79 (let ((i 0))
80 (loop while (< i (length (cxml-stp-impl::%children parent))) do
81 (let ((child (stp:nth-child i parent)))
82 (etypecase child
83 (stp:text
84 (if (and (whitespacep (stp:data child))
85 (not preserve))
86 (stp:delete-nth-child i parent)
87 (incf i)))
88 ((or stp:comment stp:processing-instruction)
89 (stp:delete-nth-child i parent))
90 (stp:element
91 (stp:with-attributes ((space "space" *xml*))
92 child
93 (let ((new-preserve
94 (cond
95 ((namep child "text") t)
96 ((not space) preserve)
97 ((equal space "preserve") t)
98 (t nil))))
99 (strip-stylesheet child new-preserve)))
100 (incf i)))))))
103 ;;;; Strip whitespace in source documents
105 (defun make-whitespace-stripper (node tests)
106 (if tests
107 (make-stripping-node nil node tests nil)
108 node))
110 (defstruct (stripping-node (:constructor #:ignore))
111 parent
112 target
113 children)
115 (defstruct (leaf-stripping-node
116 (:constructor make-leaf-stripping-node (parent target))
117 (:include stripping-node)))
119 (defstruct (parent-stripping-node
120 (:constructor make-parent-stripping-node (parent target))
121 (:include stripping-node)))
123 (defmethod print-object ((object stripping-node) stream)
124 (print-unreadable-object (object stream :type t :identity nil)
125 (let ((target (write-to-string (stripping-node-target object))))
126 (if (and (alexandria:starts-with-subseq target "#<")
127 (alexandria:ends-with #\> target))
128 (write-sequence target stream :start 3 :end (1- (length target)))
129 (write-sequence target stream)))))
131 (defun strip-under-qname-p (node tests)
132 (let ((local-name (xpath-protocol:local-name node))
133 (uri (xpath-protocol:namespace-uri node)))
134 (dolist (test tests nil)
135 (let ((result (funcall test local-name uri)))
136 (when result
137 (return (eq result :strip)))))))
139 (defun xpath-protocol/attribute-value (node local-name uri)
140 (do-pipe (a (xpath-protocol:attribute-pipe node))
141 (when (and (equal (xpath-protocol:local-name a) local-name)
142 (equal (xpath-protocol:namespace-uri a) uri))
143 (return (xpath-protocol:node-text a)))))
145 (defun make-stripping-node (parent target tests force-preserve)
146 (let ((result (make-parent-stripping-node parent target))
147 (xml-space (xpath-protocol/attribute-value target "space" *xml*)))
148 (when xml-space
149 (setf force-preserve (equal xml-space "preserve")))
150 (labels ((recurse (child-node)
151 (if (xpath-protocol:node-type-p child-node :element)
152 (make-stripping-node result child-node tests force-preserve)
153 (make-leaf-stripping-node result child-node)))
154 (maybe-recurse (child-node)
155 (if (and (xpath-protocol:node-type-p child-node :text)
156 (whitespacep (xpath-protocol:node-text child-node)))
158 (recurse child-node))))
159 (let ((all-children (xpath-protocol:child-pipe target)))
160 (setf (stripping-node-children result)
161 (if (or force-preserve
162 (not (xpath-protocol:node-type-p target :element))
163 (not (strip-under-qname-p target tests)))
164 (xpath::map-pipe-filtering #'recurse all-children)
165 (xpath::map-pipe-filtering #'maybe-recurse all-children)))))
166 result))
168 (macrolet ((defproxy (name &rest args)
169 `(define-default-method ,name ((node stripping-node) ,@args)
170 (,name (stripping-node-target node) ,@args))))
171 (defproxy xpath-protocol:local-name)
172 (defproxy xpath-protocol:namespace-uri)
173 (defproxy xpath-protocol:namespace-prefix)
174 (defproxy xpath-protocol:qualified-name)
175 (defproxy xpath-protocol:node-type-p type))
177 (define-default-method xpath-protocol:node-equal
178 ((a stripping-node) (b stripping-node))
179 (xpath-protocol:node-equal (stripping-node-target a)
180 (stripping-node-target b)))
182 (define-default-method xpath-protocol:hash-key ((node stripping-node))
183 (xpath-protocol:hash-key (stripping-node-target node)))
185 (define-default-method xpath-protocol:attribute-pipe ((node stripping-node))
186 (xpath::map-pipe (lambda (attribute)
187 (make-leaf-stripping-node node attribute))
188 (xpath-protocol:attribute-pipe
189 (stripping-node-target node))))
191 (define-default-method xpath-protocol:namespace-pipe ((node stripping-node))
192 (xpath::map-pipe (lambda (namespace)
193 (make-leaf-stripping-node node namespace))
194 (xpath-protocol:namespace-pipe
195 (stripping-node-target node))))
197 (define-default-method xpath-protocol:node-p ((node stripping-node))
200 (define-default-method xpath-protocol:child-pipe ((node stripping-node))
201 (stripping-node-children node))
203 (define-default-method xpath-protocol:parent-node ((node stripping-node))
204 (stripping-node-parent node))
206 (define-default-method xpath-protocol:node-text ((node stripping-node))
207 (with-output-to-string (s)
208 (write-string-value node s)))
210 (defmethod write-string-value ((node parent-stripping-node) stream)
211 (do-pipe (child (xpath-protocol:child-pipe node))
212 (unless (or (xpath-protocol:node-type-p child :comment)
213 (xpath-protocol:node-type-p child :processing-instruction))
214 (write-string-value child stream))))
216 (defmethod write-string-value ((node leaf-stripping-node) stream)
217 (write-string-value (stripping-node-target node) stream))
219 (defmethod write-string-value (node stream)
220 (write-string (xpath-protocol:node-text node) stream))
222 (define-default-method xpath-protocol:get-element-by-id
223 ((node stripping-node) id)
224 (let ((target
225 (xpath-protocol:get-element-by-id (stripping-node-target node) id)))
226 (when target
227 (let ((stripping-root
228 (loop
229 for parent = node then next
230 for next = (stripping-node-parent parent)
231 while next
232 finally (return parent)))
233 (target-path nil))
234 (loop
235 for parent = target then next
236 for next = (xpath-protocol:parent-node parent)
237 while next
238 do (push parent target-path))
239 (labels ((find-child (stripping-parent target-child)
240 (xpath::find-in-pipe target-child
241 (xpath-protocol:child-pipe
242 stripping-parent)
243 :key #'stripping-node-target))
244 (resolve-path (stripping-parent target-path)
245 (if target-path
246 (let ((step
247 (find-child stripping-parent (car target-path))))
248 (if step
249 (resolve-path step (cdr target-path))
250 nil))
251 stripping-parent)))
252 (resolve-path stripping-root target-path))))))
254 (define-default-method xpath-protocol:unparsed-entity-uri
255 ((node stripping-node) name)
256 (xpath-protocol:unparsed-entity-uri (stripping-node-target node) name))
259 ;;;; TEXT NORMALIZER, from cxml-rng
261 ;;; FIXME: cxml should do that
263 (defun make-text-normalizer (next)
264 (make-instance 'text-normalizer :chained-handler next))
266 (defclass text-normalizer (cxml:sax-proxy)
267 ((pending-text-node :initform (make-string-output-stream)
268 :accessor pending-text-node)))
270 (defmethod sax:characters ((handler text-normalizer) data)
271 (write-string data (pending-text-node handler)))
273 (defun flush-pending (handler)
274 (let ((str (get-output-stream-string (pending-text-node handler))))
275 (unless (zerop (length str))
276 (sax:characters (cxml:proxy-chained-handler handler) str))))
278 (defmethod sax:start-element :before
279 ((handler text-normalizer) uri lname qname attributes)
280 (declare (ignore uri lname qname attributes))
281 (flush-pending handler))
283 (defmethod sax:end-element :before
284 ((handler text-normalizer) uri lname qname)
285 (declare (ignore uri lname qname))
286 (flush-pending handler))