src/UTF8.lhs

   1 Copyright (c) 2002, members of the Haskell Internationalisation Working
   2 Group All rights reserved.
   3
   4 Redistribution and use in source and binary forms, with or without
   5 modification, are permitted provided that the following conditions are met:
   6
   7 * Redistributions of source code must retain the above copyright notice,
   8    this list of conditions and the following disclaimer.
   9 * Redistributions in binary form must reproduce the above copyright notice,
  10    this list of conditions and the following disclaimer in the
  11    documentation and/or other materials provided with the distribution.
  12 * Neither the name of the Haskell Internationalisation Working Group nor
  13    the names of its contributors may be used to endorse or promote products
  14    derived from this software without specific prior written permission.
  15
  16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26 POSSIBILITY OF SUCH DAMAGE.
  27
  28 This module provides lazy stream encoding/decoding facilities for UTF-8,
  29 the Unicode Transformation Format with 8-bit words.
  30
  31 2002-09-02  Sven Moritz Hallberg <pesco@gmx.de>
  32
  33
  34 > module UTF8
  35 >   ( encode, decode,
  36 >     encodeOne, decodeOne,
  37 >   ) where
  38
  39 > import Data.Char (ord, chr)
  40 > import Data.Word (Word8, Word16, Word32)
  41 > import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.))
  42
  43
  44
  45 ///- UTF-8 in General -///
  46
  47 Adapted from the Unicode standard, version 3.2,
  48 Table 3.1 "UTF-8 Bit Distribution" (excluded are UTF-16 encodings):
  49
  50   Scalar                    1st Byte  2nd Byte  3rd Byte  4th Byte
  51           000000000xxxxxxx  0xxxxxxx
  52           00000yyyyyxxxxxx  110yyyyy  10xxxxxx
  53           zzzzyyyyyyxxxxxx  1110zzzz  10yyyyyy  10xxxxxx
  54   000uuuzzzzzzyyyyyyxxxxxx  11110uuu  10zzzzzz  10yyyyyy  10xxxxxx
  55
  56 Also from the Unicode standard, version 3.2,
  57 Table 3.1B "Legal UTF-8 Byte Sequences":
  58
  59   Code Points         1st Byte  2nd Byte  3rd Byte  4th Byte
  60     U+0000..U+007F    00..7F
  61     U+0080..U+07FF    C2..DF    80..BF
  62     U+0800..U+0FFF    E0        A0..BF    80..BF
  63     U+1000..U+CFFF    E1..EC    80..BF    80..BF
  64     U+D000..U+D7FF    ED        80..9F    80..BF
  65     U+D800..U+DFFF    ill-formed
  66     U+E000..U+FFFF    EE..EF    80..BF    80..BF
  67    U+10000..U+3FFFF   F0        90..BF    80..BF    80..BF
  68    U+40000..U+FFFFF   F1..F3    80..BF    80..BF    80..BF
  69   U+100000..U+10FFFF  F4        80..8F    80..BF    80..BF
  70
  71
  72
  73 ///- Encoding Functions -///
  74
  75 Must the encoder ensure that no illegal byte sequences are output or
  76 can we trust the Haskell system to supply only legal values?
  77 For now I include error cases for the surrogate values U+D800..U+DFFF and
  78 out-of-range scalars.
  79
  80 The function is pretty much a transcript of table 3.1B with error checks.
  81 It dispatches the actual encoding to functions specific to the number of
  82 required bytes.
  83
  84 > encodeOne :: Char -> [Word8]
  85 > encodeOne c
  86 >-- Encode one character as its UTF-8 byte sequence (table 3.1B, top down).
  87 >-- n < 0 needs no guard: the report (6.1.2) guarantees ord is non-negative.
  88 >     | n < 0x0080    = encodeOne_onebyte n8       -- U+0000..U+007F
  89 >     | n < 0x0800    = encodeOne_twobyte n16      -- U+0080..U+07FF
  90 >     | n < 0xD800    = encodeOne_threebyte n16    -- U+0800..U+D7FF
  91 >     | n < 0xE000    = error "encodeUTF8: ord returned a surrogate value"
  92 >     | n < 0x10000   = encodeOne_threebyte n16    -- U+E000..U+FFFF
  93 >-- The upper bound is inclusive: U+10FFFF itself is a legal scalar value.
  94 >     | n <= 0x10FFFF = encodeOne_fourbyte n32     -- U+10000..U+10FFFF
  95 >     | otherwise  = error "encodeUTF8: ord returned a value above 0x10FFFF"
  96 >     where
  97 >     n = ord c            :: Int
  98 >     n8 = fromIntegral n  :: Word8
  99 >     n16 = fromIntegral n :: Word16
 100 >     n32 = fromIntegral n :: Word32
 101
 102
 103 With the above, a stream encoder is trivial:
 104
 105 > encode :: [Char] -> [Word8]
 106 > encode cs = [b | c <- cs, b <- encodeOne c]   -- bytes of each char, in order
 107
 108
 109 Now follow the individual encoders for certain numbers of bytes...
 110           _
 111          / |  __  ___  __ __
 112         / ^| //  /__/ // //
 113        /.==| \\ //_  // //
 114 It's  //  || // \_/_//_//_  and it's here to stay!
 115
 116 > encodeOne_onebyte :: Word8 -> [Word8]
 117 > encodeOne_onebyte = (: [])       -- 0xxxxxxx: the value is its own encoding
 118
 119
 120 00000yyyyyxxxxxx -> 110yyyyy 10xxxxxx
 121
 122 > encodeOne_twobyte :: Word16 -> [Word8]
 123 > encodeOne_twobyte cp = [lead, trail]
 124 >     where
 125 >     lead, trail :: Word8
 126 >     lead  = 0xC0 .|. fromIntegral (cp `shiftR` 6)          -- 110yyyyy
 127 >     trail = 0x80 .|. (fromIntegral cp .&. 0x3F)            -- 10xxxxxx
 128
 129
 130 zzzzyyyyyyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx
 131
 132 > encodeOne_threebyte :: Word16 -> [Word8]
 133 > encodeOne_threebyte cp = [0xE0 .|. top, 0x80 .|. mid, 0x80 .|. low]
 134 >     where
 135 >     top, mid, low :: Word8
 136 >     top = fromIntegral (cp `shiftR` 12)                    -- zzzz
 137 >     mid = fromIntegral (cp `shiftR` 6) .&. 0x3F            -- yyyyyy
 138 >     low = fromIntegral cp .&. 0x3F                         -- xxxxxx
 139
 140
 141 000uuuzzzzzzyyyyyyxxxxxx -> 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
 142
 143 > encodeOne_fourbyte :: Word32 -> [Word8]
 144 > encodeOne_fourbyte cp = (0xF0 .|. lead) : map continuation [12, 6, 0]
 145 >     where
 146 >     lead :: Word8
 147 >     lead = fromIntegral (cp `shiftR` 18)                   -- uuu
 148 >     continuation :: Int -> Word8                           -- 10 + six bits
 149 >     continuation k = 0x80 .|. (fromIntegral (cp `shiftR` k) .&. 0x3F)
 150 >     -- shifts 12, 6, 0 select zzzzzz, yyyyyy, xxxxxx in output order
 151
 152
 153
 154 ///- Decoding -///
 155
 156 The decoding is a bit more involved. The byte sequence could contain all
 157 sorts of corruptions. The user must be able to either notice or ignore these
 158 errors.
 159
 160 I will first look at the decoding of a single character. The process
 161 consumes a certain number of bytes from the input. It returns the
 162 remaining input and either an error and the index of its occurrence in the
 163 byte sequence or the decoded character.
 164
 165 > data Error
 166
 167 The first byte in a sequence starts with either zero, two, three, or four
 168 ones and one zero to indicate the length of the sequence. If it doesn't,
 169 it is invalid. It is dropped and the next byte interpreted as the start
 170 of a new sequence.
 171
 172 >     = InvalidFirstByte
 173
 174 All bytes in the sequence except the first match the bit pattern 10xxxxxx.
 175 If one doesn't, it is invalid. The sequence up to that point is dropped
 176 and the "invalid" byte interpreted as the start of a new sequence. The error
 177 carries the index of the offending byte relative to the start of the sequence.
 178
 179 >     | InvalidLaterByte Int      -- the byte at relative index n was invalid
 180
 181 If a sequence ends prematurely, it has been truncated. It is dropped and
 182 decoding stops. The error reports the actual and expected lengths of the
 183 sequence.
 184
 185 >     | Truncated Int Int         -- only n of m expected bytes were present
 186
 187 Some sequences would represent code points which would be encoded as a
 188 shorter sequence by a conformant encoder. Such non-shortest sequences are
 189 considered erroneous and dropped. The error reports the actual and
 190 expected number of bytes used.
 191
 192 >     | NonShortest Int Int       -- n instead of m bytes were used
 193
 194 Unicode code points are in the range of [0..0x10FFFF]. Any values outside
 195 of those bounds are simply invalid.
 196
 197 >     | ValueOutOfBounds
 198
 199 There is no such thing as "surrogate pairs" any more in UTF-8. The
 200 corresponding code points now form illegal byte sequences.
 201
 202 >     | Surrogate
 203 >       deriving (Show, Eq)
 204
 205
 206 Second, third, and fourth bytes share the common requirement to start
 207 with the bit sequence 10. So, here's the function to check that property.
 208
 209 > first_bits_not_10 :: Word8 -> Bool
 210 > first_bits_not_10 b
 211 >     = topTwo /= 0x80             -- continuation bytes look like 10xxxxxx
 212 >     where topTwo = b .&. 0xC0    -- isolate the two most significant bits
 213
 214
 215 Erm, OK, the single-character decoding function's return type is a bit
 216 longish. It is a triple:
 217
 218  - The first component contains the decoded character or an error
 219    if the byte sequence was erroneous.
 220  - The second component contains the number of bytes that were consumed
 221    from the input.
 222  - The third component contains the remaining bytes of input.
 223
 224 > decodeOne :: [Word8] -> (Either Error Char, Int, [Word8])
 225 > decodeOne bs@(b1:rest)
 226 >     | b1 < 0x80   = decodeOne_onebyte bs                -- 0xxxxxxx
 227 >     | b1 < 0xC0   = (Left InvalidFirstByte, 1, rest)    -- stray 10xxxxxx
 228 >     | b1 < 0xE0   = decodeOne_twobyte bs                -- 110yyyyy
 229 >     | b1 < 0xF0   = decodeOne_threebyte bs              -- 1110zzzz, E0..EF
 230 >     | b1 < 0xF5   = decodeOne_fourbyte bs               -- 11110uuu, F0..F4
 231 >     | otherwise   = (Left ValueOutOfBounds, 1, rest)    -- F5..FF
 232 > decodeOne [] = error "UTF8.decodeOne: No input"
 233
 234
 235 0xxxxxxx -> 000000000xxxxxxx
 236
 237 > decodeOne_onebyte :: [Word8] -> (Either Error Char, Int, [Word8])
 238 > decodeOne_onebyte [] = error "UTF8.decodeOne_onebyte: No input (can't happen)"
 239 > decodeOne_onebyte (ascii:more) = (Right (cpToChar ascii), 1, more) -- 0xxxxxxx
 240
 241 > cpToChar :: Integral a => a -> Char
 242 > cpToChar cp = chr (fromIntegral cp)    -- chr wants an Int, so convert first
 243
 244
 245 110yyyyy 10xxxxxx -> 00000yyyyyxxxxxx
 246
 247 > decodeOne_twobyte :: [Word8] -> (Either Error Char, Int, [Word8])
 248 > decodeOne_twobyte (_:[])                -- lead byte is the last byte of input
 249 >     = (Left (Truncated 1 2), 1, [])
 250 > decodeOne_twobyte (b1:b2:bs)
 251 >     | first_bits_not_10 b2 = (Left (InvalidLaterByte 1), 1, (b2:bs))
 252 >     | b1 < 0xC2            = (Left (NonShortest 2 1), 2, bs)
 253 >     | otherwise            = (Right (cpToChar result), 2, bs)
 254 >     where    -- b2 is vetted first: a non-continuation byte is never consumed
 255 >     xs, ys, result :: Word32
 256 >     xs = fromIntegral (b2.&.0x3F)           -- six bits from 10xxxxxx
 257 >     ys = fromIntegral (b1.&.0x1F)           -- five bits from 110yyyyy
 258 >     result = shiftL ys 6 .|. xs
 259 > decodeOne_twobyte[] = error "UTF8.decodeOne_twobyte: No input (can't happen)"
 260
 261
 262 1110zzzz 10yyyyyy 10xxxxxx -> zzzzyyyyyyxxxxxx
 263
 264 > decodeOne_threebyte :: [Word8] -> (Either Error Char, Int, [Word8])
 265 > decodeOne_threebyte [_]    = threebyte_truncated 1
 266 > decodeOne_threebyte [_, _] = threebyte_truncated 2
 267 > decodeOne_threebyte (lead:c1:c2:more)
 268 >     | first_bits_not_10 c1                   -- second byte is not 10xxxxxx
 269 >         = (Left (InvalidLaterByte 1), 1, c1:c2:more)
 270 >     | first_bits_not_10 c2                   -- third byte is not 10xxxxxx
 271 >         = (Left (InvalidLaterByte 2), 2, c2:more)
 272 >     | cp < 0x0080                            -- would have fit in one byte
 273 >         = (Left (NonShortest 3 1), 3, more)
 274 >     | cp < 0x0800                            -- would have fit in two bytes
 275 >         = (Left (NonShortest 3 2), 3, more)
 276 >     | 0xD800 <= cp && cp <= 0xDFFF           -- U+D800..U+DFFF
 277 >         = (Left Surrogate, 3, more)
 278 >     | otherwise
 279 >         = (Right (cpToChar cp), 3, more)
 280 >     where
 281 >     payload :: Word8 -> Word8 -> Word32
 282 >     payload mask byte = fromIntegral (byte .&. mask)
 283 >     cp :: Word32
 284 >     cp = (payload 0x0F lead `shiftL` 12) .|. (payload 0x3F c1 `shiftL` 6)
 285 >              .|. payload 0x3F c2
 286 > decodeOne_threebyte []
 287 >  = error "UTF8.decodeOne_threebyte: No input (can't happen)"
 288
 289 > threebyte_truncated :: Int -> (Either Error Char, Int, [Word8])
 290 > threebyte_truncated seen = (Left (Truncated seen 3), seen, [])  -- no input left
 291
 292
 293 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx -> 000uuuzzzzzzyyyyyyxxxxxx
 294
 295 > decodeOne_fourbyte :: [Word8] -> (Either Error Char, Int, [Word8])
 296 > decodeOne_fourbyte [_]       = fourbyte_truncated 1
 297 > decodeOne_fourbyte [_, _]    = fourbyte_truncated 2
 298 > decodeOne_fourbyte [_, _, _] = fourbyte_truncated 3
 299 > decodeOne_fourbyte (lead:c1:c2:c3:more)
 300 >     | first_bits_not_10 c1                   -- second byte is not 10xxxxxx
 301 >         = (Left (InvalidLaterByte 1), 1, c1:c2:c3:more)
 302 >     | first_bits_not_10 c2                   -- third byte is not 10xxxxxx
 303 >         = (Left (InvalidLaterByte 2), 2, c2:c3:more)
 304 >     | first_bits_not_10 c3                   -- fourth byte is not 10xxxxxx
 305 >         = (Left (InvalidLaterByte 3), 3, c3:more)
 306 >     | cp < 0x0080                            -- would have fit in one byte
 307 >         = (Left (NonShortest 4 1), 4, more)
 308 >     | cp < 0x0800                            -- would have fit in two bytes
 309 >         = (Left (NonShortest 4 2), 4, more)
 310 >     | cp < 0x10000                           -- would have fit in three bytes
 311 >         = (Left (NonShortest 4 3), 4, more)
 312 >     | cp > 0x10FFFF                          -- beyond the last code point
 313 >         = (Left ValueOutOfBounds, 4, more)
 314 >     | otherwise
 315 >         = (Right (cpToChar cp), 4, more)
 316 >     where
 317 >     payload :: Word8 -> Word8 -> Word32
 318 >     payload mask byte = fromIntegral (byte .&. mask)
 319 >     cp :: Word32
 320 >     cp = (payload 0x07 lead `shiftL` 18) .|. (payload 0x3F c1 `shiftL` 12)
 321 >              .|. (payload 0x3F c2 `shiftL` 6) .|. payload 0x3F c3
 322 >     -- uuu from the lead byte, then six payload bits per continuation byte
 323 > decodeOne_fourbyte []
 324 >  = error "UTF8.decodeOne_fourbyte: No input (can't happen)"
 325
 326 > fourbyte_truncated :: Int -> (Either Error Char, Int, [Word8])
 327 > fourbyte_truncated seen = (Left (Truncated seen 4), seen, [])   -- no input left
 328
 329
 330 The decoder examines all input, recording decoded characters as well as
 331 error-index pairs along the way.
 332
 333 > decode :: [Word8] -> ([Char], [(Error,Int)])
 334 > decode = go 0 [] []
 335 >     where
 336 >     -- offset of the next byte, then characters and errors, newest first
 337 >     go :: Int -> [Char] -> [(Error,Int)] -> [Word8] -> ([Char], [(Error,Int)])
 338 >     go _ chars errs [] = (reverse chars, reverse errs)
 339 >     go pos chars errs input = let (outcome, used, left) = decodeOne input in
 340 >         case outcome of
 341 >           Left err -> go (pos+used) chars ((err,pos):errs) left
 342 >           Right ch -> go (pos+used) (ch:chars) errs        left
 343