strip non-esc control chars
[diohsc.git] / URI.hs
blob03d4a386c6f68e28457d48965032d67eccd34960
1 -- This file is part of Diohsc
2 -- Copyright (C) 2020 Martin Bays <mbays@sdf.org>
3 --
4 -- This program is free software: you can redistribute it and/or modify
5 -- it under the terms of version 3 of the GNU General Public License as
6 -- published by the Free Software Foundation, or any later version.
7 --
8 -- You should have received a copy of the GNU General Public License
9 -- along with this program. If not, see http://www.gnu.org/licenses/.
11 {-# LANGUAGE OverloadedStrings #-}
12 {-# LANGUAGE Safe #-}
14 module URI
15 ( URI
16 , URIRef
17 , escapeIRI
18 , escapePathString
19 , escapeQuery
20 , escapeQueryPart
21 , nullUri
22 , parseAbsoluteUri
23 , parseUriAsAbsolute
24 , parseUriReference
25 , pathSegments
26 , relativeFrom
27 , relativeTo
28 , setQuery
29 , setSchemeDefault
30 , stripUri
31 , stripUriForGemini
32 , unescapeUriString
33 , uriFragment
34 , uriPath
35 , uriPort
36 , uriRegName
37 , uriScheme
38 , uriQuery
39 ) where
41 import Control.Monad (mplus, (<=<))
42 import Data.Char (toLower)
43 import Data.List (dropWhileEnd)
44 import Data.Maybe (isNothing)
45 import Safe (readMay)
47 import qualified Network.URI as NU
49 defaultScheme :: String
50 defaultScheme = "gemini"
52 defaultPort :: Int
53 defaultPort = 1965
55 -- | Represents a normalised absolute URI with scheme and port defaults as above.
56 -- We use "Uri" rather than "URI" in camelcase,
57 -- because I prefer to think of it as a word rather than an acronym.
58 -- Still use "URI" if it's the first/only word of the identifier.
59 newtype URI = URI {uriUri :: NU.URI}
60 deriving (Eq,Ord)
61 instance Show URI where
62 show (URI uri) = show uri
64 uriPath, uriQuery, uriFragment :: URI -> String
65 uriPath = NU.uriPath . uriUri
66 uriQuery = NU.uriQuery . uriUri
67 uriFragment = NU.uriFragment . uriUri
69 -- | strips trailing ':'
70 uriScheme :: URI -> String
71 uriScheme = init . NU.uriScheme . uriUri
73 setSchemeDefault :: URI -> URI
74 setSchemeDefault = URI . (\nuri -> nuri { NU.uriScheme = defaultScheme <> ":" }) . uriUri
76 pathSegments :: URI -> [String]
77 pathSegments (URI uri) = NU.pathSegments uri
79 nullUri :: URI
80 nullUri = URI NU.nullURI
82 -- | URI reference. May be absolute. Not normalised.
83 newtype URIRef = URIRef NU.URI
84 deriving (Eq,Ord)
85 instance Show URIRef where
86 show (URIRef uri) = show uri
88 normaliseUri :: NU.URI -> URI
89 normaliseUri uri = URI $ uri
90 { NU.uriPath = (\p -> if null p then "/" else p) .
91 NU.normalizePathSegments . NU.normalizeEscape $ NU.uriPath uri
92 , NU.uriScheme = if null $ NU.uriScheme uri
93 then defaultScheme else toLower <$> NU.uriScheme uri
94 , NU.uriAuthority = (\auth -> auth
95 { NU.uriPort =
96 if NU.uriPort auth == ':' : show defaultPort
97 then "" else NU.uriPort auth
98 , NU.uriRegName = toLower <$> NU.uriRegName auth
100 <$> NU.uriAuthority uri
101 , NU.uriQuery = NU.normalizeEscape $ NU.uriQuery uri
104 stripUriForGemini :: URI -> URI
105 stripUriForGemini (URI uri) = URI $ uri
106 { NU.uriAuthority = (\auth -> auth {NU.uriUserInfo = ""}) <$> NU.uriAuthority uri
107 , NU.uriFragment = ""
110 parseAbsoluteUri :: String -> Maybe URI
111 parseAbsoluteUri = (normaliseUri <$>) . NU.parseURI
113 parseUriAsAbsolute :: String -> Maybe URI
114 parseUriAsAbsolute s = parseAbsoluteUri s `mplus` parseAbsoluteUri (defaultScheme ++ "://" ++ s)
116 parseUriReference :: String -> Maybe URIRef
117 parseUriReference = (URIRef <$>) . NU.parseURIReference
120 setQuery :: String -> URI -> URI
121 setQuery q (URI uri) = URI $ uri { NU.uriQuery = q }
123 stripUri :: URI -> URI
124 stripUri (URI uri) = URI $ uri { NU.uriPath = dropWhileEnd (== '/') $ NU.uriPath uri, NU.uriQuery = "" }
126 relativeTo :: URIRef -> URI -> URI
127 relativeTo (URIRef ref) (URI uri) = normaliseUri $ NU.relativeTo ref uri
129 -- | lift NU.relativeFrom, but set scheme when the result is absolute,
130 -- and avoid initial slash where possible, and prefer "." to "" and ".." to "../"
131 relativeFrom :: URI -> URI -> URIRef
132 relativeFrom (URI uri1) (URI uri2) =
133 URIRef . fixDots . stripSlash . setScheme $ NU.relativeFrom uri1 uri2 where
134 setScheme ref | isNothing (NU.uriAuthority ref) = ref
135 | otherwise = ref { NU.uriScheme = NU.uriScheme uri1 }
136 stripSlash ref | '/':path' <- NU.uriPath ref
137 , not $ null path'
138 , ref' <- ref { NU.uriPath = path' }
139 , NU.relativeTo ref' uri2 == uri1 = ref'
140 | otherwise = ref
141 fixDots ref = case NU.uriPath ref of
142 "" | ref' <- ref { NU.uriPath = "." }
143 , NU.relativeTo ref' uri2 == uri1 -> ref'
144 "../" -> ref { NU.uriPath = ".." }
145 _ -> ref
147 uriRegName :: URI -> Maybe String
148 uriRegName = (NU.uriRegName <$>) . NU.uriAuthority . uriUri
150 uriPort :: URI -> Maybe Int
151 uriPort = (readPort . NU.uriPort) <=< (NU.uriAuthority . uriUri)
152 where
153 readPort (':':n) = readMay n
154 readPort _ = Nothing
156 escapePathString :: String -> String
157 escapePathString = NU.escapeURIString (\c -> NU.isUnreserved c || c == '/')
159 unescapeUriString :: String -> String
160 unescapeUriString = NU.unEscapeString
162 -- | unreserved / sub-delims / ":" / "@" / "/" / "?"
163 isUnescapedInQuery :: Char -> Bool
164 isUnescapedInQuery c = NU.isUnescapedInURI c && c `notElem` ("#[]"::String)
166 escapeQuery :: String -> String
167 escapeQuery = NU.escapeURIString isUnescapedInQuery . withEscapes
168 where
169 withEscapes "" = ""
170 withEscapes ('\\':'x':h1:h2:s) | Just c <- readMay $ "'\\x" <> [h1,h2,'\''] = c:withEscapes s
171 withEscapes ('\\':'e':s) = '\ESC':withEscapes s
172 withEscapes ('\\':'r':s) = '\r':withEscapes s
173 withEscapes ('\\':'n':s) = '\n':withEscapes s
174 withEscapes ('\\':'t':s) = '\t':withEscapes s
175 withEscapes ('\\':c:s) = c:withEscapes s
176 withEscapes (c:s) = c:withEscapes s
178 -- |escape the query part of an unparsed uri string
179 escapeQueryPart :: String -> String
180 escapeQueryPart s
181 | (s','?':q) <- break (== '?') s = s' ++ '?' : escapeQuery q
182 | otherwise = s
184 -- |conversion of IRI to URI according to Step 2 in Section 3.1 in RFC3987
185 -- (for now at least, we apply this also to the regname rather than
186 -- punycoding)
187 escapeIRI :: String -> String
188 escapeIRI = NU.escapeURIString (not . escape)
189 where
190 -- |ucschar or iprivate in RFC3987
191 escape :: Char -> Bool
192 escape c = let i = fromEnum c in
193 i >= 0xA0 && i <= 0xD7FF ||
194 i >= 0xE000 && i <= 0xF8FF ||
195 i >= 0xF900 && i <= 0xFDCF ||
196 i >= 0xFDF0 && i <= 0xFFEF ||
197 i >= 0x10000 && i <= 0x1FFFD ||
198 i >= 0x20000 && i <= 0x2FFFD ||
199 i >= 0x30000 && i <= 0x3FFFD ||
200 i >= 0x40000 && i <= 0x4FFFD ||
201 i >= 0x50000 && i <= 0x5FFFD ||
202 i >= 0x60000 && i <= 0x6FFFD ||
203 i >= 0x70000 && i <= 0x7FFFD ||
204 i >= 0x80000 && i <= 0x8FFFD ||
205 i >= 0x90000 && i <= 0x9FFFD ||
206 i >= 0xA0000 && i <= 0xAFFFD ||
207 i >= 0xB0000 && i <= 0xBFFFD ||
208 i >= 0xC0000 && i <= 0xCFFFD ||
209 i >= 0xD0000 && i <= 0xDFFFD ||
210 i >= 0xE1000 && i <= 0xEFFFD ||
211 i >= 0xF0000 && i <= 0xFFFFD ||
212 i >= 0x100000 && i <= 0x10FFFD