3 # The basic unit for rendering Unicode is the code point.
4 # https://en.wikipedia.org/wiki/Code_point
5 # The glyph a non-cursive font displays may represent multiple code points.
7 # In addition to raw code points (just integers assigned special meaning), Mu
8 # provides a common encoding as a convenience: code-point-utf8.
# Round-trip check: each sampled integer i is treated as a code-point, encoded
# with to-utf8, and decoded again with to-code-point. The report drawn on
# screen 0 shows i, the encoded word and the decoded value in hex, separated
# by "/".
# NOTE(review): the loop and branch lines are not visible in this excerpt, so
# the exact conditions under which the report is drawn cannot be confirmed here.
10 fn test-unicode-serialization-and-deserialization {
11 var i/ebx: int <- copy 0
12 var init?/esi: boolean <- copy 1/true
# presumably the loop's upper bound -- confirm against the branch that follows
14 compare i, 0x10000 # 32 bits of utf-8 are sufficient for https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
# encode, stash the encoded word in ecx (eax is reused by the decode below), then decode
17 var c/eax: code-point <- copy i
18 var _g/eax: code-point-utf8 <- to-utf8 c
19 var g/ecx: code-point-utf8 <- copy _g
20 var c2/eax: code-point <- to-code-point g
# presumably init? gates a one-time header -- TODO confirm, branch not visible
25 compare init?, 0/false
27 draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "F - test-unicode-serialization-and-deserialization: ", 3/fg 0/bg
# report fields: original integer / encoded word / decoded code point
30 draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, i, 3/fg 0/bg
31 draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
33 var x/eax: int <- copy g
34 draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x, 3/fg 0/bg
36 draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
38 var x2/eax: int <- copy c2
39 draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x2, 3/fg 0/bg
41 draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 3/fg 0/bg
43 i <- add 0xf # to speed things up; ensure increment is not a power of 2
48 # transliterated from tb_utf8_char_to_unicode in https://github.com/nsf/termbox
# Decode a code-point-utf8 back into the code-point it encodes.
#   in: the code-point-utf8 to decode
#   returns: the decoded code-point, in eax
# NOTE(review): the branches, bit masks and loop control of this function are
# not visible in this excerpt; the comments below cover only the lines shown.
49 fn to-code-point in: code-point-utf8 -> _/eax: code-point {
50 var g/ebx: int <- copy in
51 # if single byte, just return it
55 var result/eax: code-point <- copy g
# multi-byte case: ask utf8-length about the encoding
59 var len/edx: int <- utf8-length in
60 # extract bits from first byte
61 var b/eax: byte <- copy-byte g
62 var result/edi: code-point <- copy b
78 # extract bits from remaining bytes
# i starts at 1 -- presumably because the first byte was handled above; confirm
80 var i/ecx: int <- copy 1
84 var b/eax: byte <- copy-byte g
# shift the accumulated result left by 6 bits -- presumably to make room for
# the payload bits of the next byte; the combining step is not visible here
86 result <- shift-left 6
95 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
96 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
# Encode a code-point as a code-point-utf8.
#   in: the code-point to encode
#   returns: the encoded value, in eax
# Aborts with "unsupported code point"; per the comment at the abort site this
# is for inputs that would need more than 4 bytes.
# The tests in this file expect the first UTF-8 byte in the least significant
# byte of the result (e.g. 0x80 encodes to 0x80c2).
# NOTE(review): the comparisons selecting each length and the bit-masking
# steps are not visible in this excerpt.
97 fn to-utf8 in: code-point -> _/eax: code-point-utf8 {
98 var c/eax: int <- copy in
99 var num-trailers/ecx: int <- copy 0
100 var first/edx: int <- copy 0
# this block sets num-trailers to 1, 2 or 3, or returns/aborts
101 $to-utf8:compute-length: {
102 # single byte: just return it
106 var g/eax: code-point-utf8 <- copy c
113 num-trailers <- copy 1
115 break $to-utf8:compute-length
121 num-trailers <- copy 2
123 break $to-utf8:compute-length
129 num-trailers <- copy 3
131 break $to-utf8:compute-length
133 # more than 4 bytes: unsupported
137 abort "unsupported code point"
141 # emit trailer bytes, 6 bits from 'in', first two bits '10'
142 var result/edi: code-point-utf8 <- copy 0
# num-trailers is decremented below and compared with 0 here -- presumably the
# loop exit test; confirm against the branch that follows
144 compare num-trailers, 0
146 var tmp/esi: int <- copy c
# shift result left by one byte
149 result <- shift-left 8
153 num-trailers <- decrement
# one more byte shift -- presumably to make room for the first byte; confirm
157 result <- shift-left 8
164 # single-byte code points have identical code-point-utf8s
# Checks that to-utf8 returns a value equal, as an int, to its input.
# NOTE(review): in-int starts at 0; the lines that advance it and bound the
# loop are not visible in this excerpt.
165 fn test-to-utf8-single-byte {
166 var in-int/ecx: int <- copy 0
170 var in/eax: code-point <- copy in-int
171 var out/eax: code-point-utf8 <- to-utf8 in
172 var out-int/eax: int <- copy out
173 check-ints-equal out-int, in-int, "F - test-to-utf8-single-byte"
179 # byte | byte | byte | byte
180 # smallest 2-byte utf-8
# Checks that to-utf8 encodes code point 0x80 as 0x80c2
# (first byte 0xc2 sits in the low-order byte of the expected int).
181 fn test-to-utf8-two-bytes-min {
182 var in/eax: code-point <- copy 0x80 # 10 00-0000
183 var out/eax: code-point-utf8 <- to-utf8 in
184 var out-int/eax: int <- copy out
185 check-ints-equal out-int, 0x80c2, "F - to-utf8/2a" # 110 0-0010 10 00-0000
188 # largest 2-byte utf-8
# Checks that to-utf8 encodes code point 0x7ff as 0xbfdf
# (first byte 0xdf sits in the low-order byte of the expected int).
189 fn test-to-utf8-two-bytes-max {
190 var in/eax: code-point <- copy 0x7ff # 1-1111 11-1111
191 var out/eax: code-point-utf8 <- to-utf8 in
192 var out-int/eax: int <- copy out
193 check-ints-equal out-int, 0xbfdf, "F - to-utf8/2b" # 110 1-1111 10 11-1111
196 # smallest 3-byte utf-8
# Checks that to-utf8 encodes code point 0x800 as 0x80a0e0
# (first byte 0xe0 sits in the low-order byte of the expected int).
197 fn test-to-utf8-three-bytes-min {
198 var in/eax: code-point <- copy 0x800 # 10-0000 00-0000
199 var out/eax: code-point-utf8 <- to-utf8 in
200 var out-int/eax: int <- copy out
201 check-ints-equal out-int, 0x80a0e0, "F - to-utf8/3a" # 1110 0000 10 10-0000 10 00-0000
204 # largest 3-byte utf-8
# Checks that to-utf8 encodes code point 0xffff as 0xbfbfef
# (first byte 0xef sits in the low-order byte of the expected int).
205 fn test-to-utf8-three-bytes-max {
206 var in/eax: code-point <- copy 0xffff # 1111 11-1111 11-1111
207 var out/eax: code-point-utf8 <- to-utf8 in
208 var out-int/eax: int <- copy out
209 check-ints-equal out-int, 0xbfbfef, "F - to-utf8/3b" # 1110 1111 10 11-1111 10 11-1111
212 # smallest 4-byte utf-8
# Checks that to-utf8 encodes code point 0x10000 as 0x808090f0
# (first byte 0xf0 sits in the low-order byte of the expected int).
213 fn test-to-utf8-four-bytes-min {
214 var in/eax: code-point <- copy 0x10000 # 1-0000 00-0000 00-0000
215 var out/eax: code-point-utf8 <- to-utf8 in
216 var out-int/eax: int <- copy out
217 check-ints-equal out-int, 0x808090f0, "F - to-utf8/4a" # 1111-0 000 10 01-0000 10 00-0000 10 00-0000
220 # largest 4-byte utf-8
# Checks that to-utf8 encodes 0x1fffff (21 one-bits, the most a 4-byte
# encoding can carry per the bit layout below) as 0xbfbfbff7
# (first byte 0xf7 sits in the low-order byte of the expected int).
221 fn test-to-utf8-four-bytes-max {
222 var in/eax: code-point <- copy 0x1fffff # 111 11-1111 11-1111 11-1111
223 var out/eax: code-point-utf8 <- to-utf8 in
224 var out-int/eax: int <- copy out
225 check-ints-equal out-int, 0xbfbfbff7, "F - to-utf8/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111
228 # read the next code-point-utf8 from a stream of bytes
# Read one code-point-utf8 from the byte stream `in`.
#   in: address of the byte stream to read from
#   returns: the code-point-utf8 read, in eax; per the comment below, EOF is
#            returned when the stream is empty
# Aborts with "utf-8 encodings larger than 4 bytes are not yet supported".
# The test in this file expects the first byte read to land in the low-order
# byte of the result (bytes ce 92 are expected to read back as 0x92ce).
# NOTE(review): the comparisons that classify the first byte, and the step
# that merges each trailer into result, are not visible in this excerpt.
229 fn read-code-point-utf8 in: (addr stream byte) -> _/eax: code-point-utf8 {
230 # if at eof, return EOF
232 var eof?/eax: boolean <- stream-empty? in
233 compare eof?, 0/false
# c holds the first byte of the encoding
237 var c/eax: byte <- read-byte in
238 var num-trailers/ecx: int <- copy 0
# this block sets num-trailers to 1, 2 or 3, or returns/aborts
239 $read-code-point-utf8:compute-length: {
240 # single byte: just return it
244 var g/eax: code-point-utf8 <- copy c
250 var g/eax: code-point-utf8 <- copy c
257 num-trailers <- copy 1
258 break $read-code-point-utf8:compute-length
264 num-trailers <- copy 2
265 break $read-code-point-utf8:compute-length
271 num-trailers <- copy 3
272 break $read-code-point-utf8:compute-length
274 abort "utf-8 encodings larger than 4 bytes are not yet supported"
277 # prepend trailer bytes
278 var result/edi: code-point-utf8 <- copy c
# the first trailer is shifted by 1 byte, and the shift grows by one per trailer read
279 var num-byte-shifts/edx: int <- copy 1
281 compare num-trailers, 0
283 var tmp/eax: byte <- read-byte in
284 var tmp2/eax: int <- copy tmp
285 tmp2 <- shift-left-bytes tmp2, num-byte-shifts
288 num-byte-shifts <- increment
289 num-trailers <- decrement
# Reads seven code-point-utf8s in sequence from one stream and checks each
# against an expected int. The expected values alternate between single-byte
# values (0x61, 0x63, 0x64, 0x65) and multi-byte ones (0x92ce, 0x96b8e4, 0x8c95e7).
# NOTE(review): the line that fills the stream is not visible in this excerpt.
295 fn test-read-code-point-utf8 {
296 var s: (stream byte 0x30)
297 var s2/ecx: (addr stream byte) <- address s
299 var c/eax: code-point-utf8 <- read-code-point-utf8 s2
300 var n/eax: int <- copy c
301 check-ints-equal n, 0x61, "F - test code-point-utf8/0"
302 var c/eax: code-point-utf8 <- read-code-point-utf8 s2
303 var n/eax: int <- copy c
304 check-ints-equal n, 0x92ce/greek-capital-letter-beta, "F - test code-point-utf8/1"
305 var c/eax: code-point-utf8 <- read-code-point-utf8 s2
306 var n/eax: int <- copy c
307 check-ints-equal n, 0x63, "F - test code-point-utf8/2"
308 var c/eax: code-point-utf8 <- read-code-point-utf8 s2
309 var n/eax: int <- copy c
310 check-ints-equal n, 0x96b8e4, "F - test code-point-utf8/3"
311 var c/eax: code-point-utf8 <- read-code-point-utf8 s2
312 var n/eax: int <- copy c
313 check-ints-equal n, 0x64, "F - test code-point-utf8/4"
314 var c/eax: code-point-utf8 <- read-code-point-utf8 s2
315 var n/eax: int <- copy c
316 check-ints-equal n, 0x8c95e7, "F - test code-point-utf8/5"
317 var c/eax: code-point-utf8 <- read-code-point-utf8 s2
318 var n/eax: int <- copy c
319 check-ints-equal n, 0x65, "F - test code-point-utf8/6"
# Takes a code-point-utf8 and returns an int in edx.
# NOTE(review): the body is not visible in this excerpt. to-code-point stores
# the result in a variable named `len`, so this is presumably the number of
# bytes in g's encoding -- confirm against the body.
322 fn utf8-length g: code-point-utf8 -> _/edx: int {
341 # needed because available primitives only shift by a literal/constant number of bits
# Shift n left by k whole bytes, returning the result in eax.
#   n: value to shift
#   k: number of bytes to shift by
# The tests in this file expect 1 shifted by 0..3 bytes to give 1, 0x100,
# 0x10000 and 0x1000000, and 1 shifted by 4 or 5 bytes to give 0.
# NOTE(review): the loop control lines are not visible in this excerpt.
342 fn shift-left-bytes n: int, k: int -> _/eax: int {
343 var i/ecx: int <- copy 0
344 var result/eax: int <- copy n
348 compare i, 4 # only 4 bytes in 32 bits
# one iteration shifts by one byte (8 bits)
350 result <- shift-left 8
# Shifting 1 left by 0 bytes must leave it as 1.
357 fn test-shift-left-bytes-0 {
358 var result/eax: int <- shift-left-bytes 1, 0
359 check-ints-equal result, 1, "F - shift-left-bytes 0"
# Shifting 1 left by 1 byte must give 0x100.
362 fn test-shift-left-bytes-1 {
363 var result/eax: int <- shift-left-bytes 1, 1
364 check-ints-equal result, 0x100, "F - shift-left-bytes 1"
# Shifting 1 left by 2 bytes must give 0x10000.
367 fn test-shift-left-bytes-2 {
368 var result/eax: int <- shift-left-bytes 1, 2
369 check-ints-equal result, 0x10000, "F - shift-left-bytes 2"
# Shifting 1 left by 3 bytes must give 0x1000000.
372 fn test-shift-left-bytes-3 {
373 var result/eax: int <- shift-left-bytes 1, 3
374 check-ints-equal result, 0x1000000, "F - shift-left-bytes 3"
# Shifting 1 left by 4 bytes is expected to give 0.
377 fn test-shift-left-bytes-4 {
378 var result/eax: int <- shift-left-bytes 1, 4
379 check-ints-equal result, 0, "F - shift-left-bytes 4"
# Shifting 1 left by 5 bytes (a count above 4) is also expected to give 0.
382 fn test-shift-left-bytes-5 {
383 var result/eax: int <- shift-left-bytes 1, 5
384 check-ints-equal result, 0, "F - shift-left-bytes >4"
387 # write a code-point-utf8 to a stream of bytes
388 # this is like write-to-stream, except we skip leading 0 bytes
389 fn write-code-point-utf8 out: (addr stream byte), g: code-point-utf8 {
390 $write-code-point-utf8:body: {
391 var c/eax: int <- copy g
392 append-byte out, c # first byte is always written
395 break-if-= $write-code-point-utf8:body
399 break-if-= $write-code-point-utf8:body
403 break-if-= $write-code-point-utf8:body