1 {-# LANGUAGE BangPatterns #-}
3 -- | Sets of characters.
5 -- Using this is more efficient than 'RE.Type.Alt':ng individual characters.
6 module Distribution
.Utils
.CharSet
(
33 import Data
.Char (chr, isAlpha, isAlphaNum, isUpper, ord)
34 import Data
.List
(foldl', sortBy)
35 import Data
.Monoid
(Monoid
(..))
36 import Data
.String (IsString
(..))
37 import Distribution
.Compat
.Semigroup
(Semigroup
(..))
39 (Bool (..), Bounded
(..), Char, Enum
(..), Eq
(..), Int, Maybe (..), Num
(..), Ord
(..), Show (..), String, concatMap, flip, fst, otherwise, showParen,
40 showString, uncurry, ($), (.))
42 #if MIN_VERSION_containers
(0,5,0)
43 import qualified Data
.IntMap
.Strict
as IM
45 import qualified Data
.IntMap
as IM
48 -- | A set of 'Char's.
50 -- We use range set, which works great with 'Char'.
51 newtype CharSet
= CS
{ unCS
:: IM
.IntMap
Int }
54 instance IsString CharSet
where
57 instance Show CharSet
where
60 = showsPrec d
(toList cs
)
64 . showsPrec 11 (unCS cs
)
66 instance Semigroup CharSet
where
69 instance Monoid CharSet
where
73 -- | Empty character set.
79 universe
= CS
$ IM
.singleton
0 0x10ffff
81 -- | Check whether 'CharSet' is 'empty'.
82 null :: CharSet
-> Bool
83 null (CS cs
) = IM
.null cs
85 -- | Size of 'CharSet'
87 -- >>> size $ fromIntervalList [('a','f'), ('0','9')]
90 -- >>> length $ toList $ fromIntervalList [('a','f'), ('0','9')]
93 size
:: CharSet
-> Int
94 size
(CS m
) = foldl' (\ !acc
(lo
, hi
) -> acc
+ (hi
- lo
) + 1) 0 (IM
.toList m
)
96 -- | Singleton character set.
97 singleton
:: Char -> CharSet
98 singleton c
= CS
(IM
.singleton
(ord c
) (ord c
))
100 -- | Test whether character is in the set.
101 member
:: Char -> CharSet
-> Bool
102 #if MIN_VERSION_containers
(0,5,0)
103 member c
(CS m
) = case IM
.lookupLE i m
of
105 Just
(_
, hi
) -> i
<= hi
108 member c
(CS m
) = go
(IM
.toList m
)
111 go
((x
,y
):zs
) = (x
<= i
&& i
<= y
) || go zs
115 -- | Insert 'Char' into 'CharSet'.
116 insert :: Char -> CharSet
-> CharSet
117 insert c
(CS m
) = normalise
(IM
.insert (ord c
) (ord c
) m
)
119 -- | Union of two 'CharSet's.
120 union :: CharSet
-> CharSet
-> CharSet
121 union (CS xs
) (CS ys
) = normalise
(IM
.unionWith
max xs ys
)
123 -- | Intersection of two 'CharSet's
124 intersection
:: CharSet
-> CharSet
-> CharSet
125 intersection
(CS xs
) (CS ys
) = CS
$
126 IM
.fromList
(intersectRangeList
(IM
.toList xs
) (IM
.toList ys
))
128 -- | Compute the intersection.
129 intersectRangeList
:: Ord a
=> [(a
, a
)] -> [(a
, a
)] -> [(a
, a
)]
130 intersectRangeList aset
@((x
,y
):as) bset
@((u
,v
):bs
)
131 | y
< u
= intersectRangeList
as bset
132 | v
< x
= intersectRangeList aset bs
133 | y
< v
= (max x u
, y
) : intersectRangeList
as bset
134 |
otherwise = (max x u
, v
) : intersectRangeList aset bs
135 intersectRangeList _
[] = []
136 intersectRangeList
[] _
= []
138 -- | Complement of a CharSet
139 complement
:: CharSet
-> CharSet
140 complement
(CS xs
) = CS
$ IM
.fromList
$ complementRangeList
(IM
.toList xs
)
142 -- | Compute the complement intersected with @[x,)@ assuming @x<u@.
143 complementRangeList
' :: Int -> [(Int, Int)] -> [(Int, Int)]
144 complementRangeList
' x
((u
,v
):s
) = (x
,pred u
) : complementRangeList
'' v s
145 complementRangeList
' x
[] = [(x
,0x10ffff)]
147 -- | Compute the complement intersected with @(x,)@.
148 complementRangeList
'' :: Int -> [(Int, Int)] -> [(Int, Int)]
149 complementRangeList
'' x s
151 |
otherwise = complementRangeList
' (succ x
) s
153 -- | Compute the complement.
155 -- Note: we treat Ints as codepoints, i.e minBound is 0, and maxBound is 0x10ffff
156 complementRangeList
:: [(Int, Int)] -> [(Int, Int)]
157 complementRangeList s
@((x
,y
):s
')
158 | x
== 0 = complementRangeList
'' y s
'
159 |
otherwise = complementRangeList
' 0 s
160 complementRangeList
[] = [(0, 0x10ffff)]
162 -- | Difference of two 'CharSet's.
163 difference
:: CharSet
-> CharSet
-> CharSet
164 difference xs ys
= intersection xs
(complement ys
)
166 -- | Make 'CharSet' from a list of characters, i.e. 'String'.
167 fromList
:: String -> CharSet
168 fromList
= normalise
. foldl' (\ acc c
-> IM
.insert (ord c
) (ord c
) acc
) IM
.empty
170 -- | Convert 'CharSet' to a list of characters i.e. 'String'.
171 toList
:: CharSet
-> String
172 toList
= concatMap (uncurry enumFromTo) . toIntervalList
174 -- | Convert to interval list
176 -- >>> toIntervalList $ union "01234" "56789"
179 toIntervalList
:: CharSet
-> [(Char, Char)]
180 toIntervalList
(CS m
) = [ (chr lo
, chr hi
) |
(lo
, hi
) <- IM
.toList m
]
182 -- | Convert from interval pairs.
184 -- >>> fromIntervalList []
187 -- >>> fromIntervalList [('a','f'), ('0','9')]
188 -- "0123456789abcdef"
190 -- >>> fromIntervalList [('Z','A')]
193 fromIntervalList
:: [(Char,Char)] -> CharSet
194 fromIntervalList xs
= normalise
' $ sortBy (\a b
-> compare (fst a
) (fst b
))
200 -------------------------------------------------------------------------------
202 -------------------------------------------------------------------------------
204 normalise
:: IM
.IntMap
Int -> CharSet
205 normalise
= normalise
'. IM
.toList
207 normalise
' :: [(Int,Int)] -> CharSet
208 normalise
' = CS
. IM
.fromList
. go
where
209 go
:: [(Int,Int)] -> [(Int,Int)]
211 go
((x
,y
):zs
) = go
' x y zs
213 go
' :: Int -> Int -> [(Int, Int)] -> [(Int, Int)]
214 go
' lo hi
[] = [(lo
, hi
)]
215 go
' lo hi ws0
@((u
,v
):ws
)
216 | u
<= succ hi
= go
' lo
(max v hi
) ws
217 |
otherwise = (lo
,hi
) : go ws0
219 -------------------------------------------------------------------------------
220 -- Alpha Numeric character list
221 -------------------------------------------------------------------------------
223 -- Computing this takes some time,
224 -- but they are not used in-non testing in Cabal's normal operation.
226 -- | Note: this set varies depending on @base@ version.
229 alpha
= foldl' (flip insert) empty [ c | c
<- [ minBound .. maxBound ], isAlpha c
]
230 {-# NOINLINE alpha #-}
232 -- | Note: this set varies depending on @base@ version.
235 alphanum
= foldl' (flip insert) empty [ c | c
<- [ minBound .. maxBound ], isAlphaNum c
]
236 {-# NOINLINE alphanum #-}
238 -- | Note: this set varies depending on @base@ version.
241 upper
= foldl' (flip insert) empty [ c | c
<- [ minBound .. maxBound ], isUpper c
]
242 {-# NOINLINE upper #-}