make LTS branch pre-releases
[cabal.git] / Cabal-syntax / src / Distribution / Utils / String.hs
blob13b022f812cf7ee9694298e04f88e5d13516a048
1 module Distribution.Utils.String
2 ( -- * Encode to/from UTF8
3 decodeStringUtf8
4 , encodeStringUtf8
5 , trim
6 ) where
8 import Data.Bits
9 import Data.Char (chr, ord)
10 import Data.List (dropWhileEnd)
11 import Data.Word
12 import GHC.Unicode (isSpace)
14 -- | Decode 'String' from UTF8-encoded octets.
16 -- Invalid data in the UTF8 stream (this includes code-points @U+D800@
17 -- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
19 -- See also 'encodeStringUtf8'
20 decodeStringUtf8 :: [Word8] -> String
21 decodeStringUtf8 = go
22 where
23 go :: [Word8] -> String
24 go [] = []
25 go (c : cs)
26 | c <= 0x7F = chr (fromIntegral c) : go cs
27 | c <= 0xBF = replacementChar : go cs
28 | c <= 0xDF = twoBytes c cs
29 | c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF)
30 | c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7)
31 | c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3)
32 | c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1)
33 | otherwise = replacementChar : go cs
35 twoBytes :: Word8 -> [Word8] -> String
36 twoBytes c0 (c1 : cs')
37 | c1 .&. 0xC0 == 0x80 =
38 let d =
39 (fromIntegral (c0 .&. 0x1F) `shiftL` 6)
40 .|. fromIntegral (c1 .&. 0x3F)
41 in if d >= 0x80
42 then chr d : go cs'
43 else replacementChar : go cs'
44 twoBytes _ cs' = replacementChar : go cs'
46 moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
47 moreBytes 1 overlong cs' acc
48 | overlong <= acc
49 , acc <= 0x10FFFF
50 , acc < 0xD800 || 0xDFFF < acc =
51 chr acc : go cs'
52 | otherwise =
53 replacementChar : go cs'
54 moreBytes byteCount overlong (cn : cs') acc
55 | cn .&. 0xC0 == 0x80 =
56 moreBytes
57 (byteCount - 1)
58 overlong
59 cs'
60 ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F)
61 moreBytes _ _ cs' _ =
62 replacementChar : go cs'
64 replacementChar = '\xfffd'
66 -- | Encode 'String' to a list of UTF8-encoded octets
68 -- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
69 -- as the replacement character (i.e. @U+FFFD@).
71 -- See also 'decodeUtf8'
72 encodeStringUtf8 :: String -> [Word8]
73 encodeStringUtf8 [] = []
74 encodeStringUtf8 (c : cs)
75 | c <= '\x07F' =
77 : encodeStringUtf8 cs
78 | c <= '\x7FF' =
79 (0xC0 .|. w8ShiftR 6)
80 : (0x80 .|. (w8 .&. 0x3F))
81 : encodeStringUtf8 cs
82 | c <= '\xD7FF' =
83 (0xE0 .|. w8ShiftR 12)
84 : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
85 : (0x80 .|. (w8 .&. 0x3F))
86 : encodeStringUtf8 cs
87 | c <= '\xDFFF' =
88 0xEF
89 : 0xBF
90 : 0xBD -- U+FFFD
91 : encodeStringUtf8 cs
92 | c <= '\xFFFF' =
93 (0xE0 .|. w8ShiftR 12)
94 : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
95 : (0x80 .|. (w8 .&. 0x3F))
96 : encodeStringUtf8 cs
97 | otherwise =
98 (0xf0 .|. w8ShiftR 18)
99 : (0x80 .|. (w8ShiftR 12 .&. 0x3F))
100 : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
101 : (0x80 .|. (w8 .&. 0x3F))
102 : encodeStringUtf8 cs
103 where
104 w8 = fromIntegral (ord c) :: Word8
105 w8ShiftR :: Int -> Word8
106 w8ShiftR = fromIntegral . shiftR (ord c)
108 -- @since 3.8.0.0
109 trim :: String -> String
110 trim = dropWhile isSpace . dropWhileEnd isSpace