src/UTF8.lhs

   1 Copyright (c) 2002, members of the Haskell Internationalisation Working
   2 Group All rights reserved.
   3
   4 Redistribution and use in source and binary forms, with or without
   5 modification, are permitted provided that the following conditions are met:
   6
   7 * Redistributions of source code must retain the above copyright notice,
   8    this list of conditions and the following disclaimer.
   9 * Redistributions in binary form must reproduce the above copyright notice,
  10    this list of conditions and the following disclaimer in the
  11    documentation and/or other materials provided with the distribution.
  12 * Neither the name of the Haskell Internationalisation Working Group nor
  13    the names of its contributors may be used to endorse or promote products
  14    derived from this software without specific prior written permission.
  15
  16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26 POSSIBILITY OF SUCH DAMAGE.
  27
  28 This module provides lazy stream encoding/decoding facilities for UTF-8,
  29 the Unicode Transformation Format with 8-bit words.
  30
  31 2002-09-02  Sven Moritz Hallberg <pesco@gmx.de>
  32
  33
  34 > module UTF8
  35 >   ( encode ) where
  36
  37 > import Data.Char (ord)
  38 > import Data.Word (Word8, Word16, Word32)
  39 > import Data.Bits (Bits, shiftR, (.&.), (.|.))
  40
  41
  42
  43 ///- UTF-8 in General -///
  44
  45 Adapted from the Unicode standard, version 3.2,
  46 Table 3.1 "UTF-8 Bit Distribution" (excluded are UTF-16 encodings):
  47
  48   Scalar                    1st Byte  2nd Byte  3rd Byte  4th Byte
  49           000000000xxxxxxx  0xxxxxxx
  50           00000yyyyyxxxxxx  110yyyyy  10xxxxxx
  51           zzzzyyyyyyxxxxxx  1110zzzz  10yyyyyy  10xxxxxx
  52   000uuuzzzzzzyyyyyyxxxxxx  11110uuu  10zzzzzz  10yyyyyy  10xxxxxx
  53
  54 Also from the Unicode standard, version 3.2,
  55 Table 3.1B "Legal UTF-8 Byte Sequences":
  56
  57   Code Points         1st Byte  2nd Byte  3rd Byte  4th Byte
  58     U+0000..U+007F    00..7F
  59     U+0080..U+07FF    C2..DF    80..BF
  60     U+0800..U+0FFF    E0        A0..BF    80..BF
  61     U+1000..U+CFFF    E1..EC    80..BF    80..BF
  62     U+D000..U+D7FF    ED        80..9F    80..BF
  63     U+D800..U+DFFF    ill-formed
  64     U+E000..U+FFFF    EE..EF    80..BF    80..BF
  65    U+10000..U+3FFFF   F0        90..BF    80..BF    80..BF
  66    U+40000..U+FFFFF   F1..F3    80..BF    80..BF    80..BF
  67   U+100000..U+10FFFF  F4        80..8F    80..BF    80..BF
  68
  69
  70
  71 ///- Encoding Functions -///
  72
  73 Must the encoder ensure that no illegal byte sequences are output or
  74 can we trust the Haskell system to supply only legal values?
  75 For now I include error cases for the surrogate values U+D800..U+DFFF and
  76 out-of-range scalars.
  77
  78 The function is pretty much a transcript of table 3.1B with error checks.
  79 It dispatches the actual encoding to functions specific to the number of
  80 required bytes.
  81
  82 >-- Encode one character as 1 to 4 UTF-8 bytes; surrogates are an error.
  83 > encodeOne :: Char -> [Word8]
  84 > encodeOne c
  85 >-- No guard for n < 0: the report guarantees in (6.1.2) it won't happen.
  86 >     | n < 0x0080    = encodeOne_onebyte n8
  87 >     | n < 0x0800    = encodeOne_twobyte n16
  88 >     | n < 0xD800    = encodeOne_threebyte n16
  89 >     | n < 0xE000    = error "encodeUTF8: ord returned a surrogate value"
  90 >     | n < 0x10000   = encodeOne_threebyte n16
  91 >-- Haskell 98 only talks about 16 bit characters, but ghc handles 20.1.
  92 >-- U+10FFFF is itself a legal code point, so the bound is inclusive.
  93 >     | n <= 0x10FFFF = encodeOne_fourbyte n32
  94 >     | otherwise     = error "encodeUTF8: ord returned a value above 0x10FFFF"
  95 >     where
  96 >     n = ord c            :: Int
  97 >     n8 = fromIntegral n  :: Word8
  98 >     n16 = fromIntegral n :: Word16
  99 >     n32 = fromIntegral n :: Word32
 100
 101 With the above, a stream encoder is trivial:
 102
 103 >-- UTF-8 encode a character stream, joining the bytes of each character.
 104 > encode :: [Char] -> [Word8]
 105 > encode cs = [byte | c <- cs, byte <- encodeOne c]
 106
 107 Now follow the individual encoders for certain numbers of bytes...
 108           _
 109          / |  __  ___  __ __
 110         / ^| //  /__/ // //
 111        /.==| \\ //_  // //
 112 It's  //  || // \_/_//_//_  and it's here to stay!
 113
 114 >-- One-byte form: the value is emitted unchanged as the only byte.
 115 > encodeOne_onebyte :: Word8 -> [Word8]
 116 > encodeOne_onebyte = (: [])
 117
 118 00000yyyyyxxxxxx -> 110yyyyy 10xxxxxx
 119
 120 >-- Two-byte form 110yyyyy 10xxxxxx; the input is not range-checked.
 121 > encodeOne_twobyte :: Word16 -> [Word8]
 122 > encodeOne_twobyte cp = [lead, trail]
 123 >     where
 124 >     lead, trail :: Word8
 125 >     lead  = 0xC0 .|. fromIntegral (cp `shiftR` 6)
 126 >     trail = 0x80 .|. (fromIntegral cp .&. 0x3F)
 127
 128 zzzzyyyyyyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx
 129
 130 >-- Three-byte form 1110zzzz 10yyyyyy 10xxxxxx; the input is not range-checked.
 131 > encodeOne_threebyte :: Word16 -> [Word8]
 132 > encodeOne_threebyte cp = [0xE0 .|. top, 0x80 .|. low6 6, 0x80 .|. low6 0]
 133 >     where
 134 >     top :: Word8
 135 >     top = fromIntegral (cp `shiftR` 12)
 136 >     low6 :: Int -> Word8
 137 >     low6 k = fromIntegral (cp `shiftR` k) .&. 0x3F
 138
 139 000uuuzzzzzzyyyyyyxxxxxx -> 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
 140
 141 >-- Four-byte form 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx; no range check here.
 142 > encodeOne_fourbyte :: Word32 -> [Word8]
 143 > encodeOne_fourbyte cp = (0xF0 .|. top) : map cont [12, 6, 0]
 144 >     where
 145 >     top :: Word8
 146 >     top = fromIntegral (cp `shiftR` 18)
 147 >     cont :: Int -> Word8
 148 >     cont k = 0x80 .|. (fromIntegral (cp `shiftR` k) .&. 0x3F)