1 (in-package :runes-encoding
)
3 (define-condition encoding-error
(simple-error) ())
5 (defun xerror (fmt &rest args
)
6 (error 'encoding-error
:format-control fmt
:format-arguments args
))
8 ;;;; ---------------------------------------------------------------------------
12 (defvar *names
* (make-hash-table :test
#'eq
))
14 (defun canon-name (string)
15 (with-output-to-string (bag)
17 (cond ((char= ch
#\_
) (write-char #\- bag
))
18 (t (write-char (char-upcase ch
) bag
))))
21 (defun canon-name-2 (string)
22 (with-output-to-string (bag)
24 (cond ((char= ch
#\_
))
26 (t (write-char (char-upcase ch
) bag
))))
29 (defmethod encoding-names ((encoding symbol
))
30 (gethash encoding
*names
*))
32 (defmethod (setf encoding-names
) (new-value (encoding symbol
))
33 (setf (gethash encoding
*names
*) new-value
))
35 (defun add-name (encoding name
)
36 (pushnew (canon-name name
) (encoding-names encoding
) :test
#'string
=))
38 (defun resolve-name (string)
39 (cond ((symbolp string
)
42 (setq string
(canon-name string
))
45 (maphash (lambda (x y
)
46 (when (member string y
:test
#'string
=)
51 (maphash (lambda (x y
)
52 (when (member string y
54 (string= (canon-name-2 x
)
60 ;;;; ---------------------------------------------------------------------------
64 (defvar *encodings
* (make-hash-table :test
#'eq
))
66 (defmacro define-encoding
(name init-form
)
68 (setf (gethash ',name
*encodings
*)
69 (list nil
(lambda () ,init-form
)))
72 (defun find-encoding (name)
73 (let ((x (gethash (resolve-name name
) *encodings
*)))
76 (setf (first x
) (funcall (second x
)))))))
78 (defclass encoding
() ())
80 (defclass simple-8-bit-encoding
(encoding)
81 ((table :initarg
:table
)))
83 (defun make-simple-8-bit-encoding (&key charset
)
84 (make-instance 'simple-8-bit-encoding
85 :table
(coerce (to-unicode-table charset
) '(simple-array (unsigned-byte 16) (256)))))
89 (defmacro fx-op
(op &rest xs
)
90 `(the fixnum
(,op
,@(mapcar (lambda (x) `(the fixnum
,x
)) xs
))))
91 (defmacro fx-pred
(op &rest xs
)
92 `(,op
,@(mapcar (lambda (x) `(the fixnum
,x
)) xs
)))
94 (defmacro %
+ (&rest xs
) `(fx-op + ,@xs
))
95 (defmacro %-
(&rest xs
) `(fx-op -
,@xs
))
96 (defmacro %
* (&rest xs
) `(fx-op * ,@xs
))
97 (defmacro %
/ (&rest xs
) `(fx-op floor
,@xs
))
98 (defmacro %and
(&rest xs
) `(fx-op logand
,@xs
))
99 (defmacro %ior
(&rest xs
) `(fx-op logior
,@xs
))
100 (defmacro %xor
(&rest xs
) `(fx-op logxor
,@xs
))
101 (defmacro %ash
(&rest xs
) `(fx-op ash
,@xs
))
102 (defmacro %mod
(&rest xs
) `(fx-op mod
,@xs
))
104 (defmacro %
= (&rest xs
) `(fx-pred = ,@xs
))
105 (defmacro %
<= (&rest xs
) `(fx-pred <= ,@xs
))
106 (defmacro %
>= (&rest xs
) `(fx-pred >= ,@xs
))
107 (defmacro %
< (&rest xs
) `(fx-pred < ,@xs
))
108 (defmacro %
> (&rest xs
) `(fx-pred > ,@xs
))
112 ;; The decoders share a common signature:
114 ;; DECODE input input-start input-end
115 ;; output output-start output-end
117 ;; -> first-not-written ; first-not-read
119 ;; These decode functions should decode as much characters off `input'
120 ;; into the `output' as possible and return the indexes to the first
121 ;; not read and first not written element of `input' and `output'
122 ;; respectively. If there are not enough bytes in `input' to decode a
123 ;; full character, decoding shold be abandomed; the caller has to
124 ;; ensure that the remaining bytes of `input' are passed to the
125 ;; decoder again with more bytes appended.
127 ;; `eof-p' now in turn indicates, if the given input sequence, is all
128 ;; the producer does have and might be used to produce error messages
129 ;; in case of incomplete codes or decided what to do.
131 ;; Decoders are expected to handle the various CR/NL conventions and
132 ;; canonicalize each end of line into a single NL rune (#xA) in good
133 ;; old Lisp tradition.
136 ;; TODO: change this to an encoding class, which then might carry
137 ;; additional state. Stateless encodings could been represented by
140 ;; defmethod DECODE-SEQUENCE ((encoding (eql :utf-8)) ...)
143 (defmethod decode-sequence ((encoding (eql :utf-16-big-endian
))
144 in in-start in-end out out-start out-end eof?
)
145 ;; -> new wptr, new rptr
146 (let ((wptr out-start
)
149 (when (%
= wptr out-end
)
151 (when (>= (%
+ rptr
1) in-end
)
153 (let ((hi (aref in rptr
))
154 (lo (aref in
(%
+ 1 rptr
))))
155 (setf rptr
(%
+ 2 rptr
))
156 ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
157 ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
159 (let ((x (logior (ash hi
8) lo
)))
160 (when (or (eql x
#xFFFE
) (eql x
#xFFFF
))
161 (xerror "not a valid code point: #x~X" x
))
162 (setf (aref out wptr
) x
))
163 (setf wptr
(%
+ 1 wptr
))))
166 (defmethod decode-sequence ((encoding (eql :utf-16-little-endian
))
167 in in-start in-end out out-start out-end eof?
)
168 ;; -> new wptr, new rptr
169 (let ((wptr out-start
)
172 (when (%
= wptr out-end
)
174 (when (>= (%
+ rptr
1) in-end
)
176 (let ((lo (aref in
(%
+ 0 rptr
)))
177 (hi (aref in
(%
+ 1 rptr
))))
178 (setf rptr
(%
+ 2 rptr
))
179 ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
180 ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
182 (let ((x (logior (ash hi
8) lo
)))
183 (when (or (eql x
#xFFFE
) (eql x
#xFFFF
))
184 (xerror "not a valid code point: #x~X" x
))
185 (setf (aref out wptr
) x
))
186 (setf wptr
(%
+ 1 wptr
))))
189 (defmethod decode-sequence ((encoding (eql :utf-8
))
190 in in-start in-end out out-start out-end eof?
)
191 (declare (optimize (speed 3) (safety 0))
192 (type (simple-array (unsigned-byte 8) (*)) in
)
193 (type (simple-array (unsigned-byte 16) (*)) out
)
194 (type fixnum in-start in-end out-start out-end
))
195 (let ((wptr out-start
)
200 (when (or (<= #xD800 x
#xDBFF
)
201 (<= #xDC00 x
#xDFFF
))
202 (xerror "surrogate encoded in UTF-8: #x~X." x
))
203 (cond ((or (%
> x
#x10FFFF
)
206 (xerror "not a valid code point: #x~X" x
))
208 (setf (aref out
(%
+ 0 wptr
)) (%
+ #xD7C0
(ash x -
10))
209 (aref out
(%
+ 1 wptr
)) (%ior
#xDC00
(%and x
#x3FF
)))
210 (setf wptr
(%
+ wptr
2)))
212 (setf (aref out wptr
) x
)
213 (setf wptr
(%
+ wptr
1)))))
217 (setf (aref out wptr
) ,x
)
218 (setf wptr
(%
+ wptr
1)))))
220 (when (%
= (+ wptr
1) out-end
) (return))
221 (when (%
>= rptr in-end
) (return))
222 (setq byte0
(aref in rptr
))
223 (cond ((= byte0
#x0D
)
225 ;; we need to know the following character
226 (cond ((>= (%
+ rptr
1) in-end
)
227 ;; no characters in buffer
229 ;; at EOF, pass it as NL
231 (setf rptr
(%
+ rptr
1)))
233 ;; demand more characters
235 ((= (aref in
(%
+ rptr
1)) #x0A
)
236 ;; we see CR NL, so forget this CR and the next NL will be
237 ;; inserted literally
238 (setf rptr
(%
+ rptr
1)))
240 ;; singleton CR, pass it as NL
242 (setf rptr
(%
+ rptr
1)))))
244 ((%
<= #|
#b00000000|
# byte0
#b01111111
)
246 (setf rptr
(%
+ rptr
1)))
248 ((%
<= #|
#b10000000|
# byte0
#b10111111
)
249 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0
)
250 (setf rptr
(%
+ rptr
1)))
252 ((%
<= #|
#b11000000|
# byte0
#b11011111
)
253 (cond ((< (%
+ rptr
2) in-end
)
255 (dpb (ldb (byte 5 0) byte0
) (byte 5 6)
256 (dpb (ldb (byte 6 0) (aref in
(%
+ rptr
1))) (byte 6 0)
258 (setf rptr
(%
+ rptr
2)))
262 ((%
<= #|
#b11100000|
# byte0
#b11101111
)
263 (cond ((< (%
+ rptr
3) in-end
)
265 (dpb (ldb (byte 4 0) byte0
) (byte 4 12)
266 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 6)
267 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 0)
269 (setf rptr
(%
+ rptr
3)))
273 ((%
<= #|
#b11110000|
# byte0
#b11110111
)
274 (cond ((< (%
+ rptr
4) in-end
)
276 (dpb (ldb (byte 3 0) byte0
) (byte 3 18)
277 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 12)
278 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 6)
279 (dpb (ldb (byte 6 0) (aref in
(%
+ 3 rptr
))) (byte 6 0)
281 (setf rptr
(%
+ rptr
4)))
285 ((%
<= #|
#b11111000|
# byte0
#b11111011
)
286 (cond ((< (%
+ rptr
5) in-end
)
288 (dpb (ldb (byte 2 0) byte0
) (byte 2 24)
289 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 18)
290 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 12)
291 (dpb (ldb (byte 6 0) (aref in
(%
+ 3 rptr
))) (byte 6 6)
292 (dpb (ldb (byte 6 0) (aref in
(%
+ 4 rptr
))) (byte 6 0)
294 (setf rptr
(%
+ rptr
5)))
298 ((%
<= #|
#b11111100|
# byte0
#b11111101
)
299 (cond ((< (%
+ rptr
6) in-end
)
301 (dpb (ldb (byte 1 0) byte0
) (byte 1 30)
302 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 24)
303 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 18)
304 (dpb (ldb (byte 6 0) (aref in
(%
+ 3 rptr
))) (byte 6 12)
305 (dpb (ldb (byte 6 0) (aref in
(%
+ 4 rptr
))) (byte 6 6)
306 (dpb (ldb (byte 6 0) (aref in
(%
+ 5 rptr
))) (byte 6 0)
308 (setf rptr
(%
+ rptr
6)))
313 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0
)) ) ))
314 (values wptr rptr
)) )
316 (defmethod encoding-p ((object (eql :utf-16-little-endian
))) t
)
317 (defmethod encoding-p ((object (eql :utf-16-big-endian
))) t
)
318 (defmethod encoding-p ((object (eql :utf-8
))) t
)
320 (defmethod encoding-p ((object encoding
)) t
)
322 (defmethod decode-sequence ((encoding simple-8-bit-encoding
)
324 out out-start out-end
326 (declare (optimize (speed 3) (safety 0))
327 (type (simple-array (unsigned-byte 8) (*)) in
)
328 (type (simple-array (unsigned-byte 16) (*)) out
)
329 (type fixnum in-start in-end out-start out-end
))
330 (let ((wptr out-start
)
333 (table (slot-value encoding
'table
)))
334 (declare (type fixnum wptr rptr
)
335 (type (unsigned-byte 8) byte
)
336 (type (simple-array (unsigned-byte 16) (*)) table
))
338 (when (%
= wptr out-end
) (return))
339 (when (%
>= rptr in-end
) (return))
340 (setq byte
(aref in rptr
))
343 ;; we need to know the following character
344 (cond ((>= (%
+ rptr
1) in-end
)
345 ;; no characters in buffer
347 ;; at EOF, pass it as NL
348 (setf (aref out wptr
) #x0A
)
349 (setf wptr
(%
+ wptr
1))
350 (setf rptr
(%
+ rptr
1)))
352 ;; demand more characters
354 ((= (aref in
(%
+ rptr
1)) #x0A
)
355 ;; we see CR NL, so forget this CR and the next NL will be
356 ;; inserted literally
357 (setf rptr
(%
+ rptr
1)))
359 ;; singleton CR, pass it as NL
360 (setf (aref out wptr
) #x0A
)
361 (setf wptr
(%
+ wptr
1))
362 (setf rptr
(%
+ rptr
1)))))
365 (setf (aref out wptr
) (aref table byte
))
366 (setf wptr
(%
+ wptr
1))
367 (setf rptr
(%
+ rptr
1))) ))
370 ;;;; ---------------------------------------------------------------------------
374 (defvar *charsets
* (make-hash-table :test
#'eq
))
376 (defclass 8-bit-charset
()
377 ((name :initarg
:name
)
379 :initarg
:to-unicode-table
380 :reader to-unicode-table
)))
382 (defmacro define-8-bit-charset
(name &rest codes
)
383 (assert (= 256 (length codes
)))
385 (setf (gethash ',name
*charsets
*)
386 (make-instance '8-bit-charset
390 :element-type
'(unsigned-byte 16)
391 :initial-contents codes
)))
394 (defun find-charset (name)
395 (or (gethash name
*charsets
*)
396 (xerror "There is no character set named ~S." name
)))