3 (defmacro while
(test &body body
)
8 (defmacro code-point-1-p
(a)
9 `(= (ldb (byte 1 7) ,a
) #b0
))
11 (defmacro code-point-2-p
(a b
)
12 `(and (= (ldb (byte 3 5) ,a
) #b110
)
13 (= (ldb (byte 2 6) ,b
) #b10
)))
15 (defmacro code-point-3-p
(a b c
)
16 `(and (= (ldb (byte 4 4) ,a
) #b1110
)
17 (= (ldb (byte 2 6) ,b
) #b10
)
18 (= (ldb (byte 2 6) ,c
) #b10
)))
20 (defmacro code-point-4-p
(a b c d
)
21 `(and (= (ldb (byte 5 3) ,a
) #b11110
)
22 (= (ldb (byte 2 6) ,b
) #b10
)
23 (= (ldb (byte 2 6) ,c
) #b10
)
24 (= (ldb (byte 2 6) ,d
) #b10
)))
26 (defmacro make-code-point-1
(a)
29 (defmacro make-code-point-2
(a b
)
30 `(logior (ash (ldb (byte 5 0) ,a
) 6)
33 (defmacro make-code-point-3
(a b c
)
34 `(logior (ash (ldb (byte 4 0) ,a
) (+ 6 6))
35 (ash (ldb (byte 6 0) ,b
) 6)
38 (defmacro make-code-point-4
(a b c d
)
39 `(logior (ash (ldb (byte 3 0) ,a
) (+ 6 6 6))
40 (ash (ldb (byte 6 0) ,b
) (+ 6 6))
41 (ash (ldb (byte 6 0) ,c
) 6)
44 (define-condition decode-error
(error)
45 ((seq :reader decode-error-seq
:initarg
:seq
:initform
(error ":seq required") :type sequence
)
46 (pos :reader decode-error-pos
:initarg
:pos
:initform
(error ":pos required") :type
'unsigned-byte
))
47 (:report
(lambda (condition stream
)
48 (format stream
"UTF-8 decode error at position ~D" (decode-error-pos condition
)))))
50 (defgeneric decode-sequence
(seq &key start end
))
52 (defmethod decode-sequence ((seq vector
) &key
(start 0) end
)
54 (setf end
(length seq
)))
55 (assert (<= 0 start end
(length seq
)))
56 (loop for pos from start to
(1- end
)
57 do
(assert (<= #x00
(aref seq pos
) #xFF
)))
59 (let ((code-point-list ())
61 (while (< (+ pos
3) end
)
62 (multiple-value-bind (code-point inc
)
63 (decode-code-point-4 (aref seq pos
) (aref seq
(1+ pos
)) (aref seq
(+ pos
2)) (aref seq
(+ pos
3)))
66 (error 'decode-error
:seq seq
:pos pos
)
68 :report
"Substitute with U+FFFD and continue"
69 (setf code-point
#xFFFD
)
70 (setf inc
(decode-error-width seq pos end
)))))
71 (push code-point code-point-list
)
74 (while (< (+ pos
2) end
)
75 (multiple-value-bind (code-point inc
)
76 (decode-code-point-3 (aref seq pos
) (aref seq
(1+ pos
)) (aref seq
(+ pos
2)))
79 (error 'decode-error
:seq seq
:pos pos
)
81 :report
"Substitute with U+FFFD and continue"
82 (setf code-point
#xFFFD
)
83 (setf inc
(decode-error-width seq pos end
)))))
84 (push code-point code-point-list
)
87 (while (< (1+ pos
) end
)
88 (multiple-value-bind (code-point inc
)
89 (decode-code-point-2 (aref seq pos
) (aref seq
(1+ pos
)))
92 (error 'decode-error
:seq seq
:pos pos
)
94 :report
"Substitute with U+FFFD and continue"
95 (setf code-point
#xFFFD
)
96 (setf inc
(decode-error-width seq pos end
)))))
97 (push code-point code-point-list
)
101 (multiple-value-bind (code-point inc
)
102 (decode-code-point-1 (aref seq pos
))
105 (error 'decode-error
:seq seq
:pos pos
)
107 :report
"Substitute with U+FFFD and continue"
108 (setf code-point
#xFFFD
)
109 (setf inc
(decode-error-width seq pos end
)))))
110 (push code-point code-point-list
)
113 (coerce (nreverse code-point-list
) 'vector
)))
115 (defun decode-code-point-1 (a)
116 (declare (type (unsigned-byte 8) a
))
117 (cond ((code-point-1-p a
)
118 (values (make-code-point-1 a
) 1))))
120 (defun decode-code-point-2 (a b
)
121 (declare (type (unsigned-byte 8) a
))
122 (declare (type (unsigned-byte 8) b
))
123 (cond ((code-point-1-p a
)
124 (values (make-code-point-1 a
) 1))
126 ((code-point-2-p a b
)
127 (let ((code-point (make-code-point-2 a b
)))
128 (if (<= #x0080 code-point
)
129 (values code-point
2)
132 (defun decode-code-point-3 (a b c
)
133 (declare (type (unsigned-byte 8) a
))
134 (declare (type (unsigned-byte 8) b
))
135 (declare (type (unsigned-byte 8) c
))
136 (cond ((code-point-1-p a
)
137 (values (make-code-point-1 a
) 1))
139 ((code-point-2-p a b
)
140 (let ((code-point (make-code-point-2 a b
)))
141 (if (<= #x0080 code-point
)
142 (values code-point
2)
145 ((code-point-3-p a b c
)
146 (let ((code-point (make-code-point-3 a b c
)))
147 (if (or (<= #x0800 code-point
#xD7FF
)
148 (<= #xE000 code-point
))
149 (values code-point
3)
152 (defun decode-code-point-4 (a b c d
)
153 (declare (type (unsigned-byte 8) a
))
154 (declare (type (unsigned-byte 8) b
))
155 (declare (type (unsigned-byte 8) c
))
156 (declare (type (unsigned-byte 8) d
))
157 (cond ((code-point-1-p a
)
158 (values (make-code-point-1 a
) 1))
160 ((code-point-2-p a b
)
161 (let ((code-point (make-code-point-2 a b
)))
162 (if (<= #x0080 code-point
)
163 (values code-point
2)
166 ((code-point-3-p a b c
)
167 (let ((code-point (make-code-point-3 a b c
)))
168 (if (or (<= #x0800 code-point
#xD7FF
)
169 (<= #xE000 code-point
))
170 (values code-point
3)
173 ((code-point-4-p a b c d
)
174 (let ((code-point (make-code-point-4 a b c d
)))
175 (if (<= #x10000 code-point
#x10FFFF
)
176 (values code-point
4)
179 (defun decode-error-width (seq pos end
)
180 (let ((a (aref seq pos
))
181 (b (if (< (1+ pos
) end
)
184 (c (if (< (+ pos
2) end
)
187 (cond ((and (= a
#xE0
) b
(<= #xA0 b
#xBF
)) 2)
188 ((and (<= #xE1 a
#xEC
) b
(<= #x80 b
#xBF
)) 2)
189 ((and (= a
#xED
) b
(<= #x80 b
#x9F
)) 2)
190 ((and (<= #xEE a
#xEF
) b
(<= #x80 b
#xBF
)) 2)
191 ((and (= a
#xF0
) b
(<= #x90 b
#xBF
) c
(<= #x80 c
#xBF
)) 3)
192 ((and (= a
#xF0
) b
(<= #x90 b
#xBF
)) 2)
193 ((and (<= #xF1 a
#xF3
) b
(<= #x80 b
#xBF
) c
(<= #x80 c
#xBF
)) 3)
194 ((and (<= #xF1 a
#xF3
) b
(<= #x80 b
#xBF
)) 2)
195 ((and (= a
#xF4
) b
(<= #x80 b
#x8F
) c
(<= #x80 c
#xBF
)) 3)
196 ((and (= a
#xF4
) b
(<= #x80 b
#x8F
)) 2)