1 module Distribution
.Utils
.String
2 ( -- * Encode to/from UTF8
9 import Data
.Char (chr, ord)
10 import Data
.List
(dropWhileEnd
)
12 import GHC
.Unicode
(isSpace)
14 -- | Decode 'String' from UTF8-encoded octets.
16 -- Invalid data in the UTF8 stream (this includes code-points @U+D800@
17 -- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
19 -- See also 'encodeStringUtf8'
20 decodeStringUtf8
:: [Word8
] -> String
23 go
:: [Word8
] -> String
26 | c
<= 0x7F = chr (fromIntegral c
) : go cs
27 | c
<= 0xBF = replacementChar
: go cs
28 | c
<= 0xDF = twoBytes c cs
29 | c
<= 0xEF = moreBytes
3 0x800 cs
(fromIntegral $ c
.&. 0xF)
30 | c
<= 0xF7 = moreBytes
4 0x10000 cs
(fromIntegral $ c
.&. 0x7)
31 | c
<= 0xFB = moreBytes
5 0x200000 cs
(fromIntegral $ c
.&. 0x3)
32 | c
<= 0xFD = moreBytes
6 0x4000000 cs
(fromIntegral $ c
.&. 0x1)
33 |
otherwise = replacementChar
: go cs
35 twoBytes
:: Word8
-> [Word8
] -> String
36 twoBytes c0
(c1
: cs
')
37 | c1
.&. 0xC0 == 0x80 =
39 (fromIntegral (c0
.&. 0x1F) `shiftL`
6)
40 .|
. fromIntegral (c1
.&. 0x3F)
43 else replacementChar
: go cs
'
44 twoBytes _ cs
' = replacementChar
: go cs
'
46 moreBytes
:: Int -> Int -> [Word8
] -> Int -> [Char]
47 moreBytes
1 overlong cs
' acc
50 , acc
< 0xD800 ||
0xDFFF < acc
=
53 replacementChar
: go cs
'
54 moreBytes byteCount overlong
(cn
: cs
') acc
55 | cn
.&. 0xC0 == 0x80 =
60 ((acc `shiftL`
6) .|
. fromIntegral cn
.&. 0x3F)
62 replacementChar
: go cs
'
64 replacementChar
= '\xfffd
'
66 -- | Encode 'String' to a list of UTF8-encoded octets
68 -- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
69 -- as the replacement character (i.e. @U+FFFD@).
71 -- See also 'decodeUtf8'
72 encodeStringUtf8
:: String -> [Word8
]
73 encodeStringUtf8
[] = []
74 encodeStringUtf8
(c
: cs
)
80 : (0x80 .|
. (w8
.&. 0x3F))
83 (0xE0 .|
. w8ShiftR
12)
84 : (0x80 .|
. (w8ShiftR
6 .&. 0x3F))
85 : (0x80 .|
. (w8
.&. 0x3F))
93 (0xE0 .|
. w8ShiftR
12)
94 : (0x80 .|
. (w8ShiftR
6 .&. 0x3F))
95 : (0x80 .|
. (w8
.&. 0x3F))
98 (0xf0 .|
. w8ShiftR
18)
99 : (0x80 .|
. (w8ShiftR
12 .&. 0x3F))
100 : (0x80 .|
. (w8ShiftR
6 .&. 0x3F))
101 : (0x80 .|
. (w8
.&. 0x3F))
102 : encodeStringUtf8 cs
104 w8
= fromIntegral (ord c
) :: Word8
105 w8ShiftR
:: Int -> Word8
106 w8ShiftR
= fromIntegral . shiftR
(ord c
)
109 trim
:: String -> String
110 trim
= dropWhile isSpace . dropWhileEnd
isSpace