403unicode.mu

   1 # Helpers for Unicode.
   2 #
   3 # Mu has no characters, only code points and graphemes.
   4 # Code points are the indivisible atoms of text streams.
   5 #   https://en.wikipedia.org/wiki/Code_point
   6 # Graphemes are the smallest self-contained unit of text.
   7 # Graphemes may consist of multiple code points.
   8 #
   9 # Mu graphemes are always represented in utf-8, and they are required to fit
  10 # in 4 bytes.
  11 #
  12 # Mu doesn't currently support combining code points, or graphemes made of
  13 # multiple code points. One day we will.
  14 # We also don't currently support code points that translate into multiple
  15 # or wide graphemes. (In particular, Tab will never be supported.)
  16
  17 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
  18 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
  19 #
  20 # The day we want to support combining characters, this function will need to
  21 # take multiple code points. Or something.
  22 fn to-grapheme in: code-point -> _/eax: grapheme {
  23   var c/eax: int <- copy in
  24   var num-trailers/ecx: int <- copy 0
  25   var first/edx: int <- copy 0
  26   $to-grapheme:compute-length: {
  27     # single byte: just return it
  28     compare c, 0x7f
  29     {
  30       break-if->
  31       var g/eax: grapheme <- copy c
  32       return g
  33     }
  34     # 2 bytes
  35     compare c, 0x7ff
  36     {
  37       break-if->
  38       num-trailers <- copy 1
  39       first <- copy 0xc0
  40       break $to-grapheme:compute-length
  41     }
  42     # 3 bytes
  43     compare c, 0xffff
  44     {
  45       break-if->
  46       num-trailers <- copy 2
  47       first <- copy 0xe0
  48       break $to-grapheme:compute-length
  49     }
  50     # 4 bytes
  51     compare c, 0x1fffff
  52     {
  53       break-if->
  54       num-trailers <- copy 3
  55       first <- copy 0xf0
  56       break $to-grapheme:compute-length
  57     }
  58     # more than 4 bytes: unsupported
  59     # TODO: print error message to stderr
  60     compare c, 0x1fffff
  61     {
  62       break-if->
  63       return 0
  64     }
  65   }
  66   # emit trailer bytes, 6 bits from 'in', first two bits '10'
  67   var result/edi: grapheme <- copy 0
  68   {
  69     compare num-trailers, 0
  70     break-if-<=
  71     var tmp/esi: int <- copy c
  72     tmp <- and 0x3f
  73     tmp <- or 0x80
  74     result <- shift-left 8
  75     result <- or tmp
  76     # update loop state
  77     c <- shift-right 6
  78     num-trailers <- decrement
  79     loop
  80   }
  81   # emit engine
  82   result <- shift-left 8
  83   result <- or c
  84   result <- or first
  85   #
  86   return result
  87 }
  88
  89 # TODO: bring in tests once we have check-ints-equal
  90
  91 # read the next grapheme from a stream of bytes
  92 fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
  93   # if at eof, return EOF
  94   {
  95     var eof?/eax: boolean <- stream-empty? in
  96     compare eof?, 0/false
  97     break-if-=
  98     return 0xffffffff
  99   }
 100   var c/eax: byte <- read-byte in
 101   var num-trailers/ecx: int <- copy 0
 102   $read-grapheme:compute-length: {
 103     # single byte: just return it
 104     compare c, 0xc0
 105     {
 106       break-if->=
 107       var g/eax: grapheme <- copy c
 108       return g
 109     }
 110     compare c, 0xfe
 111     {
 112       break-if-<
 113       var g/eax: grapheme <- copy c
 114       return g
 115     }
 116     # 2 bytes
 117     compare c, 0xe0
 118     {
 119       break-if->=
 120       num-trailers <- copy 1
 121       break $read-grapheme:compute-length
 122     }
 123     # 3 bytes
 124     compare c, 0xf0
 125     {
 126       break-if->=
 127       num-trailers <- copy 2
 128       break $read-grapheme:compute-length
 129     }
 130     # 4 bytes
 131     compare c, 0xf8
 132     {
 133       break-if->=
 134       num-trailers <- copy 3
 135       break $read-grapheme:compute-length
 136     }
 137     # TODO: print error message
 138     return 0
 139   }
 140   # prepend trailer bytes
 141   var result/edi: grapheme <- copy c
 142   var num-byte-shifts/edx: int <- copy 1
 143   {
 144     compare num-trailers, 0
 145     break-if-<=
 146     var tmp/eax: byte <- read-byte in
 147     var tmp2/eax: int <- copy tmp
 148     tmp2 <- shift-left-bytes tmp2, num-byte-shifts
 149     result <- or tmp2
 150     # update loop state
 151     num-byte-shifts <- increment
 152     num-trailers <- decrement
 153     loop
 154   }
 155   return result
 156 }
 157
 158 # needed because available primitives only shift by a literal/constant number of bits
 159 fn shift-left-bytes n: int, k: int -> _/eax: int {
 160   var i/ecx: int <- copy 0
 161   var result/eax: int <- copy n
 162   {
 163     compare i, k
 164     break-if->=
 165     compare i, 4  # only 4 bytes in 32 bits
 166     break-if->=
 167     result <- shift-left 8
 168     i <- increment
 169     loop
 170   }
 171   return result
 172 }
 173
 174 # write a grapheme to a stream of bytes
 175 # this is like write-to-stream, except we skip leading 0 bytes
 176 fn write-grapheme out: (addr stream byte), g: grapheme {
 177 $write-grapheme:body: {
 178   var c/eax: int <- copy g
 179   append-byte out, c  # first byte is always written
 180   c <- shift-right 8
 181   compare c, 0
 182   break-if-= $write-grapheme:body
 183   append-byte out, c
 184   c <- shift-right 8
 185   compare c, 0
 186   break-if-= $write-grapheme:body
 187   append-byte out, c
 188   c <- shift-right 8
 189   compare c, 0
 190   break-if-= $write-grapheme:body
 191   append-byte out, c
 192 }
 193 }