3 # Mu has no characters, only code points and graphemes.
4 # Code points are the indivisible atoms of text streams.
5 # https://en.wikipedia.org/wiki/Code_point
6 # A grapheme is the smallest self-contained unit of text.
7 # Graphemes may consist of multiple code points.
9 # Mu graphemes are always represented in utf-8, and they are required to fit
12 # Mu doesn't currently support combining code points, or graphemes made of
13 # multiple code points. One day we will.
14 # On Linux, we also don't currently support code points that translate into
15 # multiple or wide graphemes. (In particular, Tab will never be supported.)
17 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
18 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
# Encode code point 'in' as a utf-8 grapheme, returned in eax.
# The tests in this file expect the lead byte in the least significant byte
# of the result (e.g. 0x80 -> 0x80c2, i.e. bytes c2 80).
# NOTE(review): the embedded line numbers skip in places (e.g. 24 -> 28), so
# the comparisons that select each length case, and the statements that merge
# bytes into the result, are not visible in this listing.
19 fn to-grapheme in: code-point -> _/eax: grapheme {
20 var c/eax: int <- copy in
# number of trailer bytes that follow the lead byte
21 var num-trailers/ecx: int <- copy 0
# presumably the marker bits for the lead byte; its assignments are not visible here -- TODO confirm
22 var first/edx: int <- copy 0
23 $to-grapheme:compute-length: {
24 # single byte: just return it
28 var g/eax: grapheme <- copy c
# one trailer: lead byte + 1 trailer = 2-byte encoding
35 num-trailers <- copy 1
37 break $to-grapheme:compute-length
# two trailers: 3-byte encoding
43 num-trailers <- copy 2
45 break $to-grapheme:compute-length
# three trailers: 4-byte encoding
51 num-trailers <- copy 3
53 break $to-grapheme:compute-length
55 # more than 4 bytes: unsupported
56 # TODO: print to stderr
# report the offending code point in hex
60 print-string-to-real-screen "unsupported code point "
61 print-int32-hex-to-real-screen c
62 print-string-to-real-screen "\n"
# presumably the status for an exit call on a line not shown -- TODO confirm
63 var exit-status/ebx: int <- copy 1
67 # emit trailer bytes, 6 bits from 'in', first two bits '10'
68 var result/edi: grapheme <- copy 0
# num-trailers counts down to 0 (see the decrement below); presumably the loop exit test -- TODO confirm
70 compare num-trailers, 0
# work on a copy of the code point in esi; c itself stays in eax
72 var tmp/esi: int <- copy c
# move what has been emitted so far up by one byte
75 result <- shift-left 8
79 num-trailers <- decrement
# one more byte shift after the trailers; presumably makes room for the lead byte -- TODO confirm
83 result <- shift-left 8
90 # single-byte code points have identical graphemes
# Checks that to-grapheme returns its input unchanged, starting at code point 0.
# NOTE(review): lines 93-95 and 100+ are not visible here; presumably they
# loop in-int over every single-byte value -- TODO confirm.
91 fn test-to-grapheme-single-byte {
# kept in ecx because eax is reused for in/out/out-int below
92 var in-int/ecx: int <- copy 0
96 var in/eax: code-point <- copy in-int
97 var out/eax: grapheme <- to-grapheme in
# copy into an int variable for the comparison
98 var out-int/eax: int <- copy out
99 check-ints-equal out-int, in-int, "F - test-to-grapheme-single-byte"
105 # byte | byte | byte | byte
106 # smallest 2-byte utf-8
# Checks that code point 0x80 encodes to bytes c2 80, packed as 0x80c2
# (lead byte c2 in the least significant byte).
107 fn test-to-grapheme-two-bytes-min {
108 var in/eax: code-point <- copy 0x80 # 10 00-0000
109 var out/eax: grapheme <- to-grapheme in
# copy into an int variable for the comparison
110 var out-int/eax: int <- copy out
111 check-ints-equal out-int, 0x80c2, "F - to-grapheme/2a" # 110 0-0010 10 00-0000
114 # largest 2-byte utf-8
# Checks that code point 0x7ff (11 one-bits) encodes to bytes df bf,
# packed as 0xbfdf (lead byte df in the least significant byte).
115 fn test-to-grapheme-two-bytes-max {
116 var in/eax: code-point <- copy 0x7ff # 1-1111 11-1111
117 var out/eax: grapheme <- to-grapheme in
# copy into an int variable for the comparison
118 var out-int/eax: int <- copy out
119 check-ints-equal out-int, 0xbfdf, "F - to-grapheme/2b" # 110 1-1111 10 11-1111
122 # smallest 3-byte utf-8
# Checks that code point 0x800 encodes to bytes e0 a0 80, packed as 0x80a0e0
# (lead byte e0 in the least significant byte).
123 fn test-to-grapheme-three-bytes-min {
124 var in/eax: code-point <- copy 0x800 # 10-0000 00-0000
125 var out/eax: grapheme <- to-grapheme in
# copy into an int variable for the comparison
126 var out-int/eax: int <- copy out
127 check-ints-equal out-int, 0x80a0e0, "F - to-grapheme/3a" # 1110 0000 10 10-0000 10 00-0000
130 # largest 3-byte utf-8
# Checks that code point 0xffff (16 one-bits) encodes to bytes ef bf bf,
# packed as 0xbfbfef (lead byte ef in the least significant byte).
131 fn test-to-grapheme-three-bytes-max {
132 var in/eax: code-point <- copy 0xffff # 1111 11-1111 11-1111
133 var out/eax: grapheme <- to-grapheme in
# copy into an int variable for the comparison
134 var out-int/eax: int <- copy out
135 check-ints-equal out-int, 0xbfbfef, "F - to-grapheme/3b" # 1110 1111 10 11-1111 10 11-1111
138 # smallest 4-byte utf-8
# Checks that code point 0x10000 encodes to bytes f0 90 80 80, packed as
# 0x808090f0 (lead byte f0 in the least significant byte).
139 fn test-to-grapheme-four-bytes-min {
140 var in/eax: code-point <- copy 0x10000 # 1-0000 00-0000 00-0000
141 var out/eax: grapheme <- to-grapheme in
# copy into an int variable for the comparison
142 var out-int/eax: int <- copy out
143 check-ints-equal out-int, 0x808090f0, "F - to-grapheme/4a" # 1111-0 000 10 01-0000 10 00-0000 10 00-0000
146 # largest 4-byte utf-8
# Checks that code point 0x1fffff (21 one-bits: 3 + 6 + 6 + 6) encodes to
# bytes f7 bf bf bf, packed as 0xbfbfbff7 (lead byte f7 in the least
# significant byte).
147 fn test-to-grapheme-four-bytes-max {
148 var in/eax: code-point <- copy 0x1fffff # 111 11-1111 11-1111 11-1111
149 var out/eax: grapheme <- to-grapheme in
# copy into an int variable for the comparison
150 var out-int/eax: int <- copy out
151 check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111
154 # read the next grapheme from a stream of bytes
# Read one utf-8 encoded grapheme from stream 'in' and return it in eax.
# The first byte read starts out as the low byte of the result; each trailer
# byte is shifted left by 1, 2, then 3 bytes before use.
# NOTE(review): the embedded line numbers skip in places (e.g. 159 -> 163,
# 218 -> 221), so the guards, returns and the statement that merges each
# trailer into the result are not visible in this listing.
155 fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
156 # if at eof, return EOF
158 var eof?/eax: boolean <- stream-empty? in
159 compare eof?, 0/false
# first byte of the encoding
163 var c/eax: byte <- read-byte in
# number of trailer bytes still to read after the first byte
164 var num-trailers/ecx: int <- copy 0
165 $read-grapheme:compute-length: {
166 # single byte: just return it
170 var g/eax: grapheme <- copy c
# NOTE(review): a second case that copies c unchanged; its guard is not visible here
176 var g/eax: grapheme <- copy c
# one trailer: 2-byte encoding
183 num-trailers <- copy 1
184 break $read-grapheme:compute-length
# two trailers: 3-byte encoding
190 num-trailers <- copy 2
191 break $read-grapheme:compute-length
# three trailers: 4-byte encoding
197 num-trailers <- copy 3
198 break $read-grapheme:compute-length
200 $read-grapheme:abort: {
201 # TODO: print to stderr
202 print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
# print the first byte in hex
203 var n/eax: int <- copy c
204 print-int32-hex-to-real-screen n
205 print-string-to-real-screen "\n"
# presumably the status for an exit call on a line not shown -- TODO confirm
206 var exit-status/ebx: int <- copy 1
210 # prepend trailer bytes
# keep the first byte in edi, since eax is reused for each trailer read below
211 var result/edi: grapheme <- copy c
# byte position of the next trailer: 1 for the first, then 2, 3
212 var num-byte-shifts/edx: int <- copy 1
# num-trailers counts down to 0 (see the decrement below); presumably the loop exit test -- TODO confirm
214 compare num-trailers, 0
216 var tmp/eax: byte <- read-byte in
217 var tmp2/eax: int <- copy tmp
# move the trailer to its byte position; presumably merged into result on a line not shown -- TODO confirm
218 tmp2 <- shift-left-bytes tmp2, num-byte-shifts
221 num-byte-shifts <- increment
222 num-trailers <- decrement
# Reads seven graphemes in a row from one stream and checks each value:
# single-byte graphemes alternate with multi-byte ones.
# NOTE(review): line 231, which presumably writes the test text into the
# stream, is not visible here (numbering jumps 230 -> 232) -- TODO confirm.
228 fn test-read-grapheme {
229 var s: (stream byte 0x30)
230 var s2/ecx: (addr stream byte) <- address s
232 var c/eax: grapheme <- read-grapheme s2
233 var n/eax: int <- copy c
234 check-ints-equal n, 0x61, "F - test grapheme/0"
235 var c/eax: grapheme <- read-grapheme s2
236 var n/eax: int <- copy c
# 2-byte grapheme: bytes ce 92, first byte ce in the low byte
237 check-ints-equal n, 0x92ce/greek-capital-letter-beta, "F - test grapheme/1"
238 var c/eax: grapheme <- read-grapheme s2
239 var n/eax: int <- copy c
240 check-ints-equal n, 0x63, "F - test grapheme/2"
241 var c/eax: grapheme <- read-grapheme s2
242 var n/eax: int <- copy c
# 3-byte grapheme: bytes e4 b8 96
243 check-ints-equal n, 0x96b8e4, "F - test grapheme/3"
244 var c/eax: grapheme <- read-grapheme s2
245 var n/eax: int <- copy c
246 check-ints-equal n, 0x64, "F - test grapheme/4"
247 var c/eax: grapheme <- read-grapheme s2
248 var n/eax: int <- copy c
# 3-byte grapheme: bytes e7 95 8c
249 check-ints-equal n, 0x8c95e7, "F - test grapheme/5"
250 var c/eax: grapheme <- read-grapheme s2
251 var n/eax: int <- copy c
252 check-ints-equal n, 0x65, "F - test grapheme/6"
# Read one utf-8 encoded grapheme from buffered file 'in' and return it in eax.
# The visible lines mirror read-grapheme, with read-byte-buffered in place of
# read-byte; no eof check is visible before the first read.
# NOTE(review): the embedded line numbers skip in places (e.g. 258 -> 259 -> 263,
# 311 -> 314), so the guards, returns and the statement that merges each
# trailer into the result are not visible in this listing.
255 fn read-grapheme-buffered in: (addr buffered-file) -> _/eax: grapheme {
# first byte of the encoding
256 var c/eax: byte <- read-byte-buffered in
# number of trailer bytes still to read after the first byte
257 var num-trailers/ecx: int <- copy 0
258 $read-grapheme-buffered:compute-length: {
259 # single byte: just return it
263 var g/eax: grapheme <- copy c
# NOTE(review): a second case that copies c unchanged; its guard is not visible here
269 var g/eax: grapheme <- copy c
# one trailer: 2-byte encoding
276 num-trailers <- copy 1
277 break $read-grapheme-buffered:compute-length
# two trailers: 3-byte encoding
283 num-trailers <- copy 2
284 break $read-grapheme-buffered:compute-length
# three trailers: 4-byte encoding
290 num-trailers <- copy 3
291 break $read-grapheme-buffered:compute-length
293 $read-grapheme-buffered:abort: {
294 # TODO: print to stderr
295 print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
# print the first byte in hex
296 var n/eax: int <- copy c
297 print-int32-hex-to-real-screen n
298 print-string-to-real-screen "\n"
# presumably the status for an exit call on a line not shown -- TODO confirm
299 var exit-status/ebx: int <- copy 1
303 # prepend trailer bytes
# keep the first byte in edi, since eax is reused for each trailer read below
304 var result/edi: grapheme <- copy c
# byte position of the next trailer: 1 for the first, then 2, 3
305 var num-byte-shifts/edx: int <- copy 1
# num-trailers counts down to 0 (see the decrement below); presumably the loop exit test -- TODO confirm
307 compare num-trailers, 0
309 var tmp/eax: byte <- read-byte-buffered in
310 var tmp2/eax: int <- copy tmp
# move the trailer to its byte position; presumably merged into result on a line not shown -- TODO confirm
311 tmp2 <- shift-left-bytes tmp2, num-byte-shifts
314 num-byte-shifts <- increment
315 num-trailers <- decrement
321 # needed because available primitives only shift by a literal/constant number of bits
# Return n shifted left by k bytes (8 bits per byte), in eax.
# Per the tests in this file, shifting 1 by 0..3 bytes gives 1, 0x100,
# 0x10000, 0x1000000, and shifting by 4 or 5 bytes gives 0.
# NOTE(review): the loop structure and any use of k are not visible here
# (embedded numbering jumps 324 -> 328 -> 330).
322 fn shift-left-bytes n: int, k: int -> _/eax: int {
# presumably counts the byte shifts done so far -- TODO confirm
323 var i/ecx: int <- copy 0
324 var result/eax: int <- copy n
328 compare i, 4 # only 4 bytes in 32 bits
# one byte = 8 bits
330 result <- shift-left 8
# Shifting by 0 bytes leaves the value unchanged.
337 fn test-shift-left-bytes-0 {
338 var result/eax: int <- shift-left-bytes 1, 0
339 check-ints-equal result, 1, "F - shift-left-bytes 0"
# Shifting 1 by one byte (8 bits) gives 0x100.
342 fn test-shift-left-bytes-1 {
343 var result/eax: int <- shift-left-bytes 1, 1
344 check-ints-equal result, 0x100, "F - shift-left-bytes 1"
# Shifting 1 by two bytes (16 bits) gives 0x10000.
347 fn test-shift-left-bytes-2 {
348 var result/eax: int <- shift-left-bytes 1, 2
349 check-ints-equal result, 0x10000, "F - shift-left-bytes 2"
# Shifting 1 by three bytes (24 bits) gives 0x1000000.
352 fn test-shift-left-bytes-3 {
353 var result/eax: int <- shift-left-bytes 1, 3
354 check-ints-equal result, 0x1000000, "F - shift-left-bytes 3"
# Shifting 1 by four bytes moves it past the top of a 32-bit value; the expected result is 0.
357 fn test-shift-left-bytes-4 {
358 var result/eax: int <- shift-left-bytes 1, 4
359 check-ints-equal result, 0, "F - shift-left-bytes 4"
# Shift counts above 4 bytes are also expected to give 0.
362 fn test-shift-left-bytes-5 {
363 var result/eax: int <- shift-left-bytes 1, 5
364 check-ints-equal result, 0, "F - shift-left-bytes >4"
367 # write a grapheme to a stream of bytes
368 # this is like write-to-stream, except we skip leading 0 bytes
369 fn write-grapheme out: (addr stream byte), g: grapheme {
370 $write-grapheme:body: {
371 var c/eax: int <- copy g
372 append-byte out, c # first byte is always written
375 break-if-= $write-grapheme:body
379 break-if-= $write-grapheme:body
383 break-if-= $write-grapheme:body