3 (defun make-multibyte-mapper (list)
4 (let ((list (sort (copy-list list
) #'< :key
#'car
))
5 (hi (loop for x in list maximize
(max (car x
) (cadr x
)))))
6 (make-array (list (length list
) 2)
7 :element-type
(list 'integer
0 hi
)
8 :initial-contents list
)))
10 (defmacro define-multibyte-mapper
(name list
)
12 (make-multibyte-mapper ,list
)))
14 (defun get-multibyte-mapper (table code
)
15 (declare (optimize speed
(safety 0))
16 (type (array * (* 2)) table
)
18 (labels ((recur (start end
)
19 (declare (type fixnum start end
))
20 (let* ((m (ash (+ start end
) -
1))
22 (declare (type fixnum m x
))
25 ((and (< x code
) (< m end
))
27 ((and (> x code
) (> m start
))
28 (recur start
(1- m
)))))))
29 (recur 0 (1- (array-dimension table
0)))))
31 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
32 ;; FIXME: better to change make-od-name() to accept multiple
33 ;; arguments in octets.lisp?
34 (defun make-od-name-list (&rest syms
)
35 (reduce #'make-od-name syms
))
37 (defun define-bytes-per-mb-character-1 (accessor type format
38 mb-len mb-continuation-byte-p
)
39 (let ((name (make-od-name-list 'bytes-per format
'character accessor
))
40 (invalid-mb-starter-byte
41 (make-od-name-list 'invalid format
'starter-byte
))
42 (invalid-mb-continuation-byte
43 (make-od-name-list 'invalid format
'continuation-byte
)))
45 ;;(declaim (inline ,name))
46 (defun ,name
(array pos end
)
47 (declare (optimize speed
(safety 0))
49 (type array-range pos end
))
50 ;; returns the number of bytes consumed and nil if it's a
51 ;; valid character or the number of bytes consumed and a
52 ;; replacement string if it's not.
53 (let ((initial-byte (,accessor array pos
))
56 (remaining-bytes (- end pos
)))
57 (declare (type array-range reject-position remaining-bytes
))
58 (labels ((valid-starter-byte-p (b)
59 (declare (type (unsigned-byte 8) b
))
60 (let ((ok (,mb-len b
)))
62 (setf reject-reason
',invalid-mb-starter-byte
))
64 (enough-bytes-left-p (x)
65 (let ((ok (> end
(+ pos
(1- x
)))))
67 (setf reject-reason
'end-of-input-in-character
))
69 (valid-secondary-p (x)
70 (let* ((idx (the array-range
(+ pos x
)))
71 (b (,accessor array idx
))
72 (ok (,mb-continuation-byte-p b
)))
74 (setf reject-reason
',invalid-mb-continuation-byte
)
75 (setf reject-position idx
))
77 (preliminary-ok-for-length (maybe-len len
)
78 (and (eql maybe-len len
)
79 ;; Has to be done in this order so that
80 ;; certain broken sequences (e.g., the
81 ;; two-byte sequence `"initial (length 3)"
82 ;; "non-continuation"' -- `#xef #x32')
83 ;; signal only part of that sequence as
85 (loop for i from
1 below
(min len remaining-bytes
)
86 always
(valid-secondary-p i
))
87 (enough-bytes-left-p len
))))
88 (declare (inline valid-starter-byte-p
91 preliminary-ok-for-length
))
92 (let ((maybe-len (valid-starter-byte-p initial-byte
)))
93 (cond ((eql maybe-len
1)
95 ((preliminary-ok-for-length maybe-len
2)
97 ((preliminary-ok-for-length maybe-len
3)
100 (let* ((bad-end (ecase reject-reason
101 (,invalid-mb-starter-byte
103 (end-of-input-in-character
105 (,invalid-mb-continuation-byte
107 (bad-len (- bad-end pos
)))
108 (declare (type array-range bad-end bad-len
))
109 (let ((replacement (decoding-error array pos bad-end
,format reject-reason reject-position
)))
110 (values bad-len replacement
))))))))))))
112 (defun define-simple-get-mb-char-1 (accessor type format mb-to-ucs
)
113 (let ((name (make-od-name-list 'simple-get format
'char accessor
))
114 (malformed (make-od-name 'malformed format
)))
116 (declaim (inline ,name
))
117 (defun ,name
(array pos bytes
)
118 (declare (optimize speed
(safety 0))
120 (type array-range pos
)
121 (type (integer 1 3) bytes
))
123 (,accessor array
(the array-range
(+ pos x
)))))
124 (declare (inline cref
))
125 (let ((code (,mb-to-ucs
(ecase bytes
127 (2 (logior (ash (cref 0) 8) (cref 1)))
128 (3 (logior (ash (cref 0) 16)
133 (decoding-error array pos
(+ pos bytes
) ,format
134 ',malformed pos
))))))))
136 (defun define-mb->string-1
(accessor type format
)
138 (make-od-name-list format
'>string accessor
))
139 (bytes-per-mb-character
140 (make-od-name-list 'bytes-per format
'character accessor
))
142 (make-od-name-list 'simple-get format
'char accessor
)))
144 (defun ,name
(array astart aend
)
145 (declare (optimize speed
(safety 0))
147 (type array-range astart aend
))
148 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
149 (loop with pos
= astart
151 do
(multiple-value-bind (bytes invalid
)
152 (,bytes-per-mb-character array pos aend
)
153 (declare (type (or null string
) invalid
))
156 (vector-push-extend (,simple-get-mb-char array pos bytes
) string
))
158 (dotimes (i (length invalid
))
159 (vector-push-extend (char invalid i
) string
))))
161 (coerce string
'simple-string
))))))
163 (declaim (inline mb-char-len
))
164 (defun mb-char-len (code)
165 (declare (optimize speed
(safety 0))
167 (cond ((< code
0) (bug "can't happen"))
170 ((< code
#x1000000
) 3)
171 (t (bug "can't happen"))))
174 (defmacro define-multibyte-encoding
(format aliases
176 mb-len mb-continuation-byte-p
)
177 (let ((char->mb
(make-od-name 'char-
> format
))
178 (string->mb
(make-od-name 'string-
> format
))
179 (define-bytes-per-mb-character
180 (make-od-name-list 'define-bytes-per format
'character
))
181 (define-simple-get-mb-char
182 (make-od-name-list 'define-simple-get format
'char
))
184 (make-od-name-list 'define format
'>string
)))
186 ;; for fd-stream.lisp
187 (define-external-format/variable-width
,aliases t
188 (mb-char-len (or (,ucs-to-mb
(char-code byte
)) -
1))
189 (let ((mb (,ucs-to-mb bits
)))
191 (external-format-encoding-error stream byte
)
193 (1 (setf (sap-ref-8 sap tail
) mb
))
194 (2 (setf (sap-ref-8 sap tail
) (ldb (byte 8 8) mb
)
195 (sap-ref-8 sap
(1+ tail
)) (ldb (byte 8 0) mb
)))
196 (3 (setf (sap-ref-8 sap tail
) (ldb (byte 8 16) mb
)
197 (sap-ref-8 sap
(1+ tail
)) (ldb (byte 8 8) mb
)
198 (sap-ref-8 sap
(+ 2 tail
)) (ldb (byte 8 0) mb
))))))
200 (let* ((mb (ecase size
202 (2 (let ((byte2 (sap-ref-8 sap
(1+ head
))))
203 (unless (,mb-continuation-byte-p byte2
)
204 (return-from decode-break-reason
2))
205 (dpb byte
(byte 8 8) byte2
)))
206 (3 (let ((byte2 (sap-ref-8 sap
(1+ head
)))
207 (byte3 (sap-ref-8 sap
(+ 2 head
))))
208 (unless (,mb-continuation-byte-p byte2
)
209 (return-from decode-break-reason
2))
210 (unless (,mb-continuation-byte-p byte3
)
211 (return-from decode-break-reason
3))
212 (dpb byte
(byte 8 16) (dpb byte2
(byte 8 8) byte3
))))))
213 (ucs (,mb-to-ucs mb
)))
215 (return-from decode-break-reason
1)
219 (define-condition ,(make-od-name 'malformed format
)
220 (octet-decoding-error) ())
221 (define-condition ,(make-od-name-list 'invalid format
'starter-byte
)
222 (octet-decoding-error) ())
223 (define-condition ,(make-od-name-list 'invalid format
'continuation-byte
)
224 (octet-decoding-error) ())
226 (declaim (inline ,char-
>mb
))
227 (defun ,char-
>mb
(char dest string pos
)
228 (declare (optimize speed
(safety 0))
229 (type (array (unsigned-byte 8) (*)) dest
))
230 (let ((code (,ucs-to-mb
(char-code char
))))
233 (declare (type (unsigned-byte 8) b
))
234 (vector-push-extend b dest
)))
235 (declare (inline add-byte
))
236 (setf code
(the fixnum code
))
237 (ecase (mb-char-len code
)
241 (add-byte (ldb (byte 8 8) code
))
242 (add-byte (ldb (byte 8 0) code
)))
244 (add-byte (ldb (byte 8 16) code
))
245 (add-byte (ldb (byte 8 8) code
))
246 (add-byte (ldb (byte 8 0) code
)))))
247 (encoding-error ,format string pos
))))
249 (defun ,string-
>mb
(string sstart send additional-space
)
250 (declare (optimize speed
(safety 0))
251 (type simple-string string
)
252 (type array-range sstart send additional-space
))
253 (let ((array (make-array (+ additional-space
(- send sstart
))
254 :element-type
'(unsigned-byte 8)
257 (loop for i from sstart below send
258 do
(,char-
>mb
(char string i
) array string i
))
259 (dotimes (i additional-space
)
260 (vector-push-extend 0 array
))
261 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
263 (defmacro ,define-bytes-per-mb-character
(accessor type
)
264 (define-bytes-per-mb-character-1 accessor type
',format
265 ',mb-len
',mb-continuation-byte-p
))
267 (instantiate-octets-definition ,define-bytes-per-mb-character
)
269 (defmacro ,define-simple-get-mb-char
(accessor type
)
270 (define-simple-get-mb-char-1 accessor type
',format
',mb-to-ucs
))
272 (instantiate-octets-definition ,define-simple-get-mb-char
)
274 (defmacro ,define-mb-
>string
(accessor type
)
275 (define-mb->string-1 accessor type
',format
))
277 (instantiate-octets-definition ,define-mb-
>string
)
279 (add-external-format-funs ',aliases
280 '(,(make-od-name format
'>string-aref
)