3 # Mu has no characters, only code points and graphemes.
4 # Code points are the indivisible atoms of text streams.
5 # https://en.wikipedia.org/wiki/Code_point
6 # Graphemes are the smallest self-contained unit of text.
7 # Graphemes may consist of multiple code points.
9 # Mu graphemes are always represented in utf-8, and they are required to fit in 4 bytes.
12 # Mu doesn't currently support combining code points, or graphemes made of
13 # multiple code points. One day we will.
14 # We also don't currently support code points that translate into multiple
15 # or wide graphemes. (In particular, Tab will never be supported.)
17 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
18 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
20 # The day we want to support combining characters, this function will need to
21 # take multiple code points. Or something.
# Convert code point 'in' to a grapheme, returned in eax.
# Per the comments in the body: a single-byte result is the code point itself;
# otherwise 1-3 trailer bytes are emitted, each carrying 6 bits of 'in'.
# NOTE(review): the comparisons that select each case and the tail of the
# function are not visible in this listing -- confirm against the full source.
22 fn to-grapheme in: code-point -> _/eax: grapheme {
23 var c/eax: int <- copy in  # the code point as a plain int
24 var num-trailers/ecx: int <- copy 0  # trailer bytes to emit; set to 1, 2 or 3 below
25 var first/edx: int <- copy 0  # NOTE(review): presumably bits for the first byte -- its use is not visible here
26 $to-grapheme:compute-length: {
27 # single byte: just return it
31 var g/eax: grapheme <- copy c
38 num-trailers <- copy 1  # one trailer byte
40 break $to-grapheme:compute-length
46 num-trailers <- copy 2  # two trailer bytes
48 break $to-grapheme:compute-length
54 num-trailers <- copy 3  # three trailer bytes
56 break $to-grapheme:compute-length
58 # more than 4 bytes: unsupported
59 # TODO: print error message to stderr
66 # emit trailer bytes, 6 bits from 'in', first two bits '10'
67 var result/edi: grapheme <- copy 0  # the grapheme is accumulated here
69 compare num-trailers, 0  # NOTE(review): presumably the loop exit test -- the branch is not visible here
71 var tmp/esi: int <- copy c  # scratch copy of c
74 result <- shift-left 8  # shift result left by one byte
78 num-trailers <- decrement  # one fewer trailer byte left
82 result <- shift-left 8  # shift result left by one byte
89 # TODO: bring in tests once we have check-ints-equal
91 # read the next grapheme from a stream of bytes
# Returns the grapheme in eax.
# 'result' starts out as the first byte read; each trailer byte is then read
# and shifted left by 1, 2, ... bytes (num-byte-shifts) using shift-left-bytes.
# NOTE(review): the conditions choosing each case and the step that merges a
# shifted trailer into 'result' are not visible in this listing -- confirm
# against the full source.
92 fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
93 # if at eof, return EOF
95 var eof?/eax: boolean <- stream-empty? in  # ask the stream whether it is empty
100 var c/eax: byte <- read-byte in  # first byte of the grapheme
101 var num-trailers/ecx: int <- copy 0  # trailer bytes still to read; set to 1, 2 or 3 below
102 $read-grapheme:compute-length: {
103 # single byte: just return it
107 var g/eax: grapheme <- copy c
113 var g/eax: grapheme <- copy c  # NOTE(review): second copy of c into a grapheme; its guarding condition is not visible here
120 num-trailers <- copy 1  # one trailer byte
121 break $read-grapheme:compute-length
127 num-trailers <- copy 2  # two trailer bytes
128 break $read-grapheme:compute-length
134 num-trailers <- copy 3  # three trailer bytes
135 break $read-grapheme:compute-length
137 # TODO: print error message
140 # prepend trailer bytes
141 var result/edi: grapheme <- copy c  # start from the first byte
142 var num-byte-shifts/edx: int <- copy 1  # how many bytes to shift the next trailer by
144 compare num-trailers, 0  # NOTE(review): presumably the loop exit test -- the branch is not visible here
146 var tmp/eax: byte <- read-byte in  # next trailer byte
147 var tmp2/eax: int <- copy tmp  # copy into an int, the type shift-left-bytes takes
148 tmp2 <- shift-left-bytes tmp2, num-byte-shifts  # shift the byte left by num-byte-shifts bytes
151 num-byte-shifts <- increment  # the following trailer is shifted one byte further
152 num-trailers <- decrement  # one fewer trailer byte left
158 # needed because available primitives only shift by a literal/constant number of bits
# Returns (in eax) n after repeated 8-bit left shifts.
# NOTE(review): presumably k is the number of bytes to shift by -- the
# comparison against k is not visible in this listing; confirm against the
# full source.
159 fn shift-left-bytes n: int, k: int -> _/eax: int {
160 var i/ecx: int <- copy 0  # counter; compared against 4 below
161 var result/eax: int <- copy n  # start from n
165 compare i, 4 # only 4 bytes in 32 bits
167 result <- shift-left 8  # shift result left by one byte
174 # write a grapheme to a stream of bytes
175 # this is like write-to-stream, except we skip leading 0 bytes
176 fn write-grapheme out: (addr stream byte), g: grapheme {
177 $write-grapheme:body: {
178 var c/eax: int <- copy g
179 append-byte out, c # first byte is always written
182 break-if-= $write-grapheme:body
186 break-if-= $write-grapheme:body
190 break-if-= $write-grapheme:body