403unicode.mu

   1 # Helpers for Unicode.
   2 #
   3 # The basic unit for rendering Unicode is the code point.
   4 #   https://en.wikipedia.org/wiki/Code_point
   5 # The glyph a non-cursive font displays may represent multiple code points.
   6 #
   7 # In addition to raw code points (just integers assigned special meaning), Mu
   8 # provides a common encoding as a convenience: code-point-utf8.
   9
  10 fn test-unicode-serialization-and-deserialization {
       # Round-trip code points through to-utf8 and back through to-code-point.
       # Every mismatch is drawn as code-point/encoding/decoded-value, in hex.
  11   var cp-int/ebx: int <- copy 0
  12   var first-failure?/esi: boolean <- copy 1/true
  13   {
  14     compare cp-int, 0x10000  # stop at the end of https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
  15                              # so emoji are not exercised
  16     break-if->=
  17     var cp/eax: code-point <- copy cp-int
  18     var _encoded/eax: code-point-utf8 <- to-utf8 cp
  19     var encoded/ecx: code-point-utf8 <- copy _encoded  # keep a copy outside eax, which receives the decoded value next
  20     var decoded/eax: code-point <- to-code-point encoded
  21     compare cp-int, decoded
  22     {
  23       break-if-=  # round trip matched: nothing to report
  24       {
  25         compare first-failure?, 0/false
  26         break-if-=  # heading was already drawn by an earlier mismatch
  27         draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "F - test-unicode-serialization-and-deserialization: ", 3/fg 0/bg
  28       }
  29       first-failure? <- copy 0/false
  30       draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, cp-int, 3/fg 0/bg
  31       draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
  32       {
  33         var encoded-int/eax: int <- copy encoded
  34         draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, encoded-int, 3/fg 0/bg
  35       }
  36       draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
  37       {
  38         var decoded-int/eax: int <- copy decoded
  39         draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, decoded-int, 3/fg 0/bg
  40       }
  41       draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 3/fg 0/bg
  42     }
  43     cp-int <- add 0xf  # stride of 15 keeps the run short; deliberately not a power of 2
  44     loop
  45   }
  46 }
  47
  48 # transliterated from tb_utf8_char_to_unicode in https://github.com/nsf/termbox
  49 fn to-code-point in: code-point-utf8 -> _/eax: code-point {
       # Decode 'in' (utf-8 bytes packed into 32 bits, first byte lowest) into
       # the code point it encodes.
       # The multi-byte path trusts utf8-length for the number of bytes.
  50   var g/ebx: int <- copy in
  51   # if single byte, just return it
  52   {
  53     compare g, 0xff
  54     break-if-addr>  # unsigned: every 4-byte encoding has its top bit set and would look negative to a signed test
  55     var result/eax: code-point <- copy g
  56     return result
  57   }
  58   #
  59   var len/edx: int <- utf8-length in
  60   # extract bits from first byte
  61   var b/eax: byte <- copy-byte g
  62   var result/edi: code-point <- copy b
  63   {
  64     compare len, 2
  65     break-if-!=
  66     result <- and 0x1f  # 2-byte lead is 110x-xxxx: keep the 5 payload bits
  67   }
  68   {
  69     compare len, 3
  70     break-if-!=
  71     result <- and 0x0f  # 3-byte lead is 1110-xxxx: keep the 4 payload bits
  72   }
  73   {
  74     compare len, 4
  75     break-if-!=
  76     result <- and 0x07  # 4-byte lead is 1111-0xxx: keep the 3 payload bits
  77   }
  78   # extract bits from remaining bytes
  79   g <- shift-right 8
  80   var i/ecx: int <- copy 1
  81   {
  82     compare i, len
  83     break-if->=
  84     var b/eax: byte <- copy-byte g
  85     b <- and 0x3f  # trailer is 10xx-xxxx: keep the 6 payload bits
  86     result <- shift-left 6  # make room for them below the bits gathered so far
  87     result <- or b
  88     g <- shift-right 8
  89     i <- increment
  90     loop
  91   }
  92   return result
  93 }
  94
  95 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
  96 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
  97 fn to-utf8 in: code-point -> _/eax: code-point-utf8 {
       # Encode 'in' as utf-8, packed into 32 bits with the first byte lowest.
       # Code points up to 0x7f are returned unchanged; anything above
       # 0x1fffff aborts.
  98   var c/eax: int <- copy in
  99   var num-trailers/ecx: int <- copy 0
 100   var first/edx: int <- copy 0  # marker bits of the lead byte (0xc0/0xe0/0xf0)
 101   $to-utf8:compute-length: {
 102     # single byte: just return it
 103     compare c, 0x7f
 104     {
 105       break-if->
 106       var g/eax: code-point-utf8 <- copy c
 107       return g
 108     }
 109     # 2 bytes
 110     compare c, 0x7ff
 111     {
 112       break-if->
 113       num-trailers <- copy 1
 114       first <- copy 0xc0
 115       break $to-utf8:compute-length
 116     }
 117     # 3 bytes
 118     compare c, 0xffff
 119     {
 120       break-if->
 121       num-trailers <- copy 2
 122       first <- copy 0xe0
 123       break $to-utf8:compute-length
 124     }
 125     # 4 bytes
 126     compare c, 0x1fffff
 127     {
 128       break-if->
 129       num-trailers <- copy 3
 130       first <- copy 0xf0
 131       break $to-utf8:compute-length
 132     }
 133     # more than 4 bytes: unsupported
 134     # Each supported range above has already broken out of
 135     # $to-utf8:compute-length, so c > 0x1fffff whenever control gets here.
 136     # Abort unconditionally: guarding the abort with another
 137     # 'compare c, 0x1fffff / break-if->' would always skip it.
 138     abort "unsupported code point"
 139     return 0
 140   }
 141   # emit trailer bytes, 6 bits from 'in', first two bits '10'
 142   var result/edi: code-point-utf8 <- copy 0
 143   {
 144     compare num-trailers, 0
 145     break-if-<=
 146     var tmp/esi: int <- copy c
 147     tmp <- and 0x3f
 148     tmp <- or 0x80
 149     result <- shift-left 8  # trailers emitted on earlier iterations move up a byte
 150     result <- or tmp
 151     # update loop state
 152     c <- shift-right 6
 153     num-trailers <- decrement
 154     loop
 155   }
 156   # emit engine (the lead byte): leftover high bits of 'in' plus the length marker
 157   result <- shift-left 8
 158   result <- or c
 159   result <- or first
 160   #
 161   return result
 162 }
 163
 164 # every single-byte code point (0 through 0x7f) is its own code-point-utf8
 165 fn test-to-utf8-single-byte {
 166   var expected/ecx: int <- copy 0
 167   {
 168     compare expected, 0x7f
 169     break-if->  # past the last single-byte code point
 170     var cp/eax: code-point <- copy expected
 171     var encoded/eax: code-point-utf8 <- to-utf8 cp
 172     var actual/eax: int <- copy encoded
 173     check-ints-equal actual, expected, "F - test-to-utf8-single-byte"
 174     expected <- increment
 175     loop
 176   }
 177 }
 178
 179                                                               # byte       | byte      | byte      | byte
 180 # lowest code point that needs a 2-byte encoding
 181 fn test-to-utf8-two-bytes-min {
 182   var cp/eax: code-point <- copy 0x80                         #                                 10     00-0000
 183   var enc/eax: code-point-utf8 <- to-utf8 cp
 184   var enc-int/eax: int <- copy enc  # compare as a plain int
 185   check-ints-equal enc-int, 0x80c2, "F - to-utf8/2a"      #                         110 0-0010  10 00-0000
 186 }
 187
 188 # highest code point that still fits a 2-byte encoding
 189 fn test-to-utf8-two-bytes-max {
 190   var cp/eax: code-point <- copy 0x7ff                        #                             1-1111     11-1111
 191   var enc/eax: code-point-utf8 <- to-utf8 cp
 192   var enc-int/eax: int <- copy enc  # compare as a plain int
 193   check-ints-equal enc-int, 0xbfdf, "F - to-utf8/2b"      #                         110 1-1111  10 11-1111
 194 }
 195
 196 # lowest code point that needs a 3-byte encoding
 197 fn test-to-utf8-three-bytes-min {
 198   var cp/eax: code-point <- copy 0x800                        #                            10-0000     00-0000
 199   var enc/eax: code-point-utf8 <- to-utf8 cp
 200   var enc-int/eax: int <- copy enc  # compare as a plain int
 201   check-ints-equal enc-int, 0x80a0e0, "F - to-utf8/3a"    #              1110 0000  10 10-0000  10 00-0000
 202 }
 203
 204 # highest code point that still fits a 3-byte encoding
 205 fn test-to-utf8-three-bytes-max {
 206   var cp/eax: code-point <- copy 0xffff                       #                   1111     11-1111     11-1111
 207   var enc/eax: code-point-utf8 <- to-utf8 cp
 208   var enc-int/eax: int <- copy enc  # compare as a plain int
 209   check-ints-equal enc-int, 0xbfbfef, "F - to-utf8/3b"    #              1110 1111  10 11-1111  10 11-1111
 210 }
 211
 212 # lowest code point that needs a 4-byte encoding
 213 fn test-to-utf8-four-bytes-min {
 214   var cp/eax: code-point <- copy 0x10000                      #                 1-0000     00-0000     00-0000
 215   var enc/eax: code-point-utf8 <- to-utf8 cp
 216   var enc-int/eax: int <- copy enc  # compare as a plain int
 217   check-ints-equal enc-int, 0x808090f0, "F - to-utf8/4a"  # 1111-0 000  10 01-0000  10 00-0000  10 00-0000
 218 }
 219
 220 # highest code point that still fits a 4-byte encoding
 221 fn test-to-utf8-four-bytes-max {
 222   var cp/eax: code-point <- copy 0x1fffff                     #        111     11-1111     11-1111     11-1111
 223   var enc/eax: code-point-utf8 <- to-utf8 cp
 224   var enc-int/eax: int <- copy enc  # compare as a plain int
 225   check-ints-equal enc-int, 0xbfbfbff7, "F - to-utf8/4b"  # 1111-0 111  10 11-1111  10 11-1111  10 11-1111
 226 }
 227
 228 # consume one utf-8 encoded character from 'in', returning its bytes packed into a code-point-utf8
 229 fn read-code-point-utf8 in: (addr stream byte) -> _/eax: code-point-utf8 {
 230   # an exhausted stream yields the EOF marker 0xffffffff
 231   {
 232     var done?/eax: boolean <- stream-empty? in
 233     compare done?, 0/false
 234     break-if-=
 235     return 0xffffffff
 236   }
 237   var lead/eax: byte <- read-byte in
 238   var trailers-left/ecx: int <- copy 0
 239   $read-code-point-utf8:compute-length: {
 240     # any byte below 0xc0 is returned on its own
 241     compare lead, 0xc0
 242     {
 243       break-if->=
 244       var single/eax: code-point-utf8 <- copy lead
 245       return single
 246     }
 247     compare lead, 0xfe  # 0xfe and 0xff are likewise returned on their own
 248     {
 249       break-if-<
 250       var single/eax: code-point-utf8 <- copy lead
 251       return single
 252     }
 253     # lead below 0xe0: one trailer follows
 254     compare lead, 0xe0
 255     {
 256       break-if->=
 257       trailers-left <- copy 1
 258       break $read-code-point-utf8:compute-length
 259     }
 260     # lead below 0xf0: two trailers follow
 261     compare lead, 0xf0
 262     {
 263       break-if->=
 264       trailers-left <- copy 2
 265       break $read-code-point-utf8:compute-length
 266     }
 267     # lead below 0xf8: three trailers follow
 268     compare lead, 0xf8
 269     {
 270       break-if->=
 271       trailers-left <- copy 3
 272       break $read-code-point-utf8:compute-length
 273     }
 274     abort "utf-8 encodings larger than 4 bytes are not yet supported"
 275     return 0
 276   }
 277   # place each trailer one byte higher than the previous byte read
 278   var packed/edi: code-point-utf8 <- copy lead
 279   var byte-pos/edx: int <- copy 1
 280   {
 281     compare trailers-left, 0
 282     break-if-<=
 283     var trailer/eax: byte <- read-byte in
 284     var shifted/eax: int <- copy trailer
 285     shifted <- shift-left-bytes shifted, byte-pos
 286     packed <- or shifted
 287     # advance to the next byte position
 288     byte-pos <- increment
 289     trailers-left <- decrement
 290     loop
 291   }
 292   return packed
 293 }
 294
 295 fn test-read-code-point-utf8 {
       # Read a mix of 1-, 2- and 3-byte characters back one at a time.
 296   var buf: (stream byte 0x30)
 297   var in/ecx: (addr stream byte) <- address buf
 298   write in, "aΒc世d界e"
 299   var g/eax: code-point-utf8 <- read-code-point-utf8 in
 300   var actual/eax: int <- copy g
 301   check-ints-equal actual, 0x61, "F - test code-point-utf8/0"
 302   var g/eax: code-point-utf8 <- read-code-point-utf8 in
 303   var actual/eax: int <- copy g
 304   check-ints-equal actual, 0x92ce/greek-capital-letter-beta, "F - test code-point-utf8/1"
 305   var g/eax: code-point-utf8 <- read-code-point-utf8 in
 306   var actual/eax: int <- copy g
 307   check-ints-equal actual, 0x63, "F - test code-point-utf8/2"
 308   var g/eax: code-point-utf8 <- read-code-point-utf8 in
 309   var actual/eax: int <- copy g
 310   check-ints-equal actual, 0x96b8e4, "F - test code-point-utf8/3"
 311   var g/eax: code-point-utf8 <- read-code-point-utf8 in
 312   var actual/eax: int <- copy g
 313   check-ints-equal actual, 0x64, "F - test code-point-utf8/4"
 314   var g/eax: code-point-utf8 <- read-code-point-utf8 in
 315   var actual/eax: int <- copy g
 316   check-ints-equal actual, 0x8c95e7, "F - test code-point-utf8/5"
 317   var g/eax: code-point-utf8 <- read-code-point-utf8 in
 318   var actual/eax: int <- copy g
 319   check-ints-equal actual, 0x65, "F - test code-point-utf8/6"
 320 }
 321
 322 fn utf8-length g: code-point-utf8 -> _/edx: int {
       # Number of bytes (1 to 4) occupied by the packed encoding 'g', found by
       # comparing it against the largest 1-, 2- and 3-byte values.
       # The comparisons are unsigned ('addr' variants): the last byte of a
       # 4-byte encoding is a 10xx-xxxx trailer sitting in bits 24-31, so such
       # values are negative when read as signed ints and a signed test would
       # report them as 1 byte long.
 323   {
 324     compare g, 0xff
 325     break-if-addr>
 326     return 1
 327   }
 328   {
 329     compare g, 0xffff
 330     break-if-addr>
 331     return 2
 332   }
 333   {
 334     compare g, 0xffffff
 335     break-if-addr>
 336     return 3
 337   }
 338   return 4
 339 }
 340
 341 # shift 'n' left by 'k' bytes, one byte per step: the shift primitives only take a literal bit count
 342 fn shift-left-bytes n: int, k: int -> _/eax: int {
 343   var steps/ecx: int <- copy 0
 344   var acc/eax: int <- copy n
 345   {
 346     compare steps, k
 347     break-if->=  # shifted the requested number of bytes
 348     compare steps, 4  # a 32-bit value has nothing left after 4 byte shifts
 349     break-if->=
 350     acc <- shift-left 8
 351     steps <- increment
 352     loop
 353   }
 354   return acc
 355 }
 356
 357 fn test-shift-left-bytes-0 {
 358   var shifted/eax: int <- shift-left-bytes 1, 0  # zero bytes: value comes back untouched
 359   check-ints-equal shifted, 1, "F - shift-left-bytes 0"
 360 }
 361
 362 fn test-shift-left-bytes-1 {
 363   var shifted/eax: int <- shift-left-bytes 1, 1  # one byte = 8 bits
 364   check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
 365 }
 366
 367 fn test-shift-left-bytes-2 {
 368   var shifted/eax: int <- shift-left-bytes 1, 2  # two bytes = 16 bits
 369   check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
 370 }
 371
 372 fn test-shift-left-bytes-3 {
 373   var shifted/eax: int <- shift-left-bytes 1, 3  # three bytes = 24 bits
 374   check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
 375 }
 376
 377 fn test-shift-left-bytes-4 {
 378   var shifted/eax: int <- shift-left-bytes 1, 4  # four bytes: the bit falls off the top of 32 bits
 379   check-ints-equal shifted, 0, "F - shift-left-bytes 4"
 380 }
 381
 382 fn test-shift-left-bytes-5 {
 383   var shifted/eax: int <- shift-left-bytes 1, 5  # more than four bytes: still 0
 384   check-ints-equal shifted, 0, "F - shift-left-bytes >4"
 385 }
 386
 387 # append the bytes of a code-point-utf8 to a stream of bytes, lowest byte first
 388 # like write-to-stream, except that it stops once only 0 bytes remain above
 389 fn write-code-point-utf8 out: (addr stream byte), g: code-point-utf8 {
 390 $write-code-point-utf8:body: {
 391   var rest/eax: int <- copy g
 392   append-byte out, rest  # the first byte goes out unconditionally
 393   rest <- shift-right 8
 394   compare rest, 0
 395   break-if-= $write-code-point-utf8:body  # no second byte
 396   append-byte out, rest
 397   rest <- shift-right 8
 398   compare rest, 0
 399   break-if-= $write-code-point-utf8:body  # no third byte
 400   append-byte out, rest
 401   rest <- shift-right 8
 402   compare rest, 0
 403   break-if-= $write-code-point-utf8:body  # no fourth byte
 404   append-byte out, rest
 405 }
 406 }