Cabal-syntax/src/Distribution/Utils/String.hs

   1 module Distribution.Utils.String
   2   ( -- * Encode to/from UTF8
   3     decodeStringUtf8
   4   , encodeStringUtf8
   5   , trim
   6   ) where
   7
   8 import Data.Bits
   9 import Data.Char (chr, ord)
  10 import Data.List (dropWhileEnd)
  11 import Data.Word
  12 import GHC.Unicode (isSpace)
  13
  14 -- | Decode 'String' from UTF8-encoded octets.
  15 --
  16 -- Invalid data in the UTF8 stream (this includes code-points @U+D800@
  17 -- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
  18 --
  19 -- See also 'encodeStringUtf8'
  20 decodeStringUtf8 :: [Word8] -> String
  21 decodeStringUtf8 = go
  22   where
  23     go :: [Word8] -> String
  24     go [] = []
  25     go (c : cs)
  26       | c <= 0x7F = chr (fromIntegral c) : go cs
  27       | c <= 0xBF = replacementChar : go cs
  28       | c <= 0xDF = twoBytes c cs
  29       | c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF)
  30       | c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7)
  31       | c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3)
  32       | c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1)
  33       | otherwise = replacementChar : go cs
  34
  35     twoBytes :: Word8 -> [Word8] -> String
  36     twoBytes c0 (c1 : cs')
  37       | c1 .&. 0xC0 == 0x80 =
  38           let d =
  39                 (fromIntegral (c0 .&. 0x1F) `shiftL` 6)
  40                   .|. fromIntegral (c1 .&. 0x3F)
  41            in if d >= 0x80
  42                 then chr d : go cs'
  43                 else replacementChar : go cs'
  44     twoBytes _ cs' = replacementChar : go cs'
  45
  46     moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
  47     moreBytes 1 overlong cs' acc
  48       | overlong <= acc
  49       , acc <= 0x10FFFF
  50       , acc < 0xD800 || 0xDFFF < acc =
  51           chr acc : go cs'
  52       | otherwise =
  53           replacementChar : go cs'
  54     moreBytes byteCount overlong (cn : cs') acc
  55       | cn .&. 0xC0 == 0x80 =
  56           moreBytes
  57             (byteCount - 1)
  58             overlong
  59             cs'
  60             ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F)
  61     moreBytes _ _ cs' _ =
  62       replacementChar : go cs'
  63
  64     replacementChar = '\xfffd'
  65
  66 -- | Encode 'String' to a list of UTF8-encoded octets
  67 --
  68 -- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
  69 -- as the replacement character (i.e. @U+FFFD@).
  70 --
  71 -- See also 'decodeUtf8'
  72 encodeStringUtf8 :: String -> [Word8]
  73 encodeStringUtf8 [] = []
  74 encodeStringUtf8 (c : cs)
  75   | c <= '\x07F' =
  76       w8
  77         : encodeStringUtf8 cs
  78   | c <= '\x7FF' =
  79       (0xC0 .|. w8ShiftR 6)
  80         : (0x80 .|. (w8 .&. 0x3F))
  81         : encodeStringUtf8 cs
  82   | c <= '\xD7FF' =
  83       (0xE0 .|. w8ShiftR 12)
  84         : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
  85         : (0x80 .|. (w8 .&. 0x3F))
  86         : encodeStringUtf8 cs
  87   | c <= '\xDFFF' =
  88       0xEF
  89         : 0xBF
  90         : 0xBD -- U+FFFD
  91         : encodeStringUtf8 cs
  92   | c <= '\xFFFF' =
  93       (0xE0 .|. w8ShiftR 12)
  94         : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
  95         : (0x80 .|. (w8 .&. 0x3F))
  96         : encodeStringUtf8 cs
  97   | otherwise =
  98       (0xf0 .|. w8ShiftR 18)
  99         : (0x80 .|. (w8ShiftR 12 .&. 0x3F))
 100         : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
 101         : (0x80 .|. (w8 .&. 0x3F))
 102         : encodeStringUtf8 cs
 103   where
 104     w8 = fromIntegral (ord c) :: Word8
 105     w8ShiftR :: Int -> Word8
 106     w8ShiftR = fromIntegral . shiftR (ord c)
 107
 108 -- @since 3.8.0.0
 109 trim :: String -> String
 110 trim = dropWhile isSpace . dropWhileEnd isSpace