2 * @brief URL decoding as described by RFC3986.
4 /* Copyright (C) 2011,2012,2015,2022 Olly Betts
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #ifndef OMEGA_INCLUDED_URLDECODE_H
26 #define OMEGA_INCLUDED_URLDECODE_H
32 #include "stringutils.h"
// Callback type accepted by url_decode().  Its const call operator takes two
// strings by const reference; url_decode() invokes it as
// handle_parameter(var, val).
34 struct CGIParameterHandler
{
35 void operator()(const std::string
&, const std::string
&) const;
// Decode URL-encoded input read one byte at a time from the iterator pair
// begin/end, handing results to handle_parameter(var, val).
//
// handle_parameter: callback invoked with a (var, val) pair of strings.
// begin, end:       iterators of type I; the loop runs while begin != end and
//                   reads each byte with *begin (CStringItor and StdinItor in
//                   this file provide those operations).
40 url_decode(const CGIParameterHandler
& handle_parameter
, I begin
, I end
)
// Tracks whether an '=' has been seen (starts out false).
42 bool seen_equals
= false;
44 while (begin
!= end
) {
45 unsigned char ch
= *begin
;
52 handle_parameter(var
, val
);
// hex1 and hex2 are two input bytes which must both satisfy C_isxdigit()
// before being combined by hex_decode() - presumably the two characters
// following a '%' escape; confirm against the full source.
63 unsigned char hex1
= *begin
;
65 if (begin
== end
|| !C_isxdigit(hex1
)) {
72 unsigned char hex2
= *begin
;
74 if (!C_isxdigit(hex2
)) {
// Both bytes are hex digits: replace ch with the byte value they encode.
82 ch
= hex_decode(hex1
, hex2
);
100 handle_parameter(var
, val
);
// Members of CStringItor: an iterator-like wrapper around a const char*
// which yields each pointed-to byte as an unsigned char.

// Current position; null unless set by the constructor below.
104 const char* p
= NULL
;
// Post-increment is declared returning void - presumably only to stop it
// being used by accident; confirm there is no definition elsewhere.
106 void operator++(int);
// Start iterating at p_.
111 explicit CStringItor(const char * p_
) : p(p_
) {
// Dereference: the byte at the current position, as unsigned char.
115 unsigned char operator*() const { return *p
; }
117 CStringItor
& operator++() {
// The comparison operators are friends so they can access the private state.
122 friend bool operator==(const CStringItor
& a
, const CStringItor
& b
);
123 friend bool operator!=(const CStringItor
& a
, const CStringItor
& b
);
// Equality comparison for two CStringItor objects (declared as a friend of
// CStringItor).
127 operator==(const CStringItor
& a
, const CStringItor
& b
)
// Inequality comparison for two CStringItor objects (declared as a friend of
// CStringItor).
133 operator!=(const CStringItor
& a
, const CStringItor
& b
)
// Members of StdinItor: an iterator-like wrapper which obtains its bytes
// from stdin via std::getchar().

// Most recent std::getchar() result.  Defaults to EOF.  It is mutable because
// the const operator*() below assigns to it.
141 mutable int current
= EOF
;
// Post-increment is declared returning void - presumably only to stop it
// being used by accident; confirm there is no definition elsewhere.
143 void operator++(int);
// count_ is stored in the `count` member.  `current` starts at 256, which is
// neither EOF nor a value getchar() returns for an 8-bit byte - presumably a
// "nothing read yet" marker; confirm.
148 explicit StdinItor(size_t count_
) : count(count_
), current(256) { }
// Dereference; this may read a byte from stdin into `current`.
150 unsigned char operator*() const {
152 current
= std::getchar();
// Pre-increment; this may read a byte from stdin into `current`.
156 StdinItor
& operator++() {
158 current
= std::getchar();
// The comparison operators are friends so they can access `current`.
164 friend bool operator==(const StdinItor
& a
, const StdinItor
& b
);
165 friend bool operator!=(const StdinItor
& a
, const StdinItor
& b
);
// Two StdinItor objects compare equal when their `current` values match.
169 operator==(const StdinItor
& a
, const StdinItor
& b
)
171 return a
.current
== b
.current
;
// Inequality comparison for two StdinItor objects (declared as a friend of
// StdinItor).
175 operator!=(const StdinItor
& a
, const StdinItor
& b
)
180 // First group is RFC3986 reserved "gen-delims", except []@: (which are safe
181 // to decode if they occur after the "authority").
183 // Second group is RFC3986 reserved "sub-delims", except !$'()*,; (which are
184 // actually safe to decode in practice) and &+= (which are OK to decode if they
185 // aren't in the "query" part).
187 // We also need to leave an encoded "%" alone. We should probably leave an
188 // encoded "/" alone too (though we shouldn't encounter one in a database
189 // created by omindex, unless it was in the base URL specified by the user).
191 // This prettifying is aimed at URLs produced by omindex, so we don't currently
192 // try to decode the query or fragment parts of the URL at all. We can probably
193 // safely decode the query in a similar way, but also leaving &+= alone.
200 // Always safe (and 8, 9, a, b, A or B).
204 // Start of a 2 byte UTF-8 sequence.
206 // Start of a 3 byte UTF-8 sequence.
208 // Start of a 4 byte UTF-8 sequence.
// Classification of every possible byte value (0x00-0xff), eight entries per
// row.  url_prettify() indexes this table with a decoded byte and switches on
// the result; encoded_ucont() indexes it with a hex digit character and
// compares the result against OK89AB.
212 static const char url_chars
[256] = {
// 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07
214 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
// 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f
216 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
// 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17
218 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
// 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f
220 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
// SPACE ! " # $ % & '
222 OK
, OK
, OK
, UNSAFE
, OK
, UNSAFE
, OK
, OK
,
// ( ) * + , - . /
224 OK
, OK
, OK
, OK
, OK
, OK
, OK
, UNSAFE
,
// 0 1 2 3 4 5 6 7
226 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
// 8 9 : ; < = > ?
228 OK89AB
, OK89AB
, INPATH
, OK
, OK
, OK
, OK
, UNSAFE
,
// @ A B C D E F G
230 INPATH
, OK89AB
, OK89AB
, OK
, OK
, OK
, OK
, OK
,
// H I J K L M N O
232 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
// P Q R S T U V W
234 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
// X Y Z [ (backslash) ] ^ _
236 OK
, OK
, OK
, INPATH
, OK
, INPATH
, OK
, OK
,
// ` a b c d e f g
238 OK
, OK89AB
, OK89AB
, OK
, OK
, OK
, OK
, OK
,
// h i j k l m n o
240 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
// p q r s t u v w
242 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
243 // x y z { | } ~ 0x7f
244 OK
, OK
, OK
, OK
, OK
, OK
, OK
, UNSAFE
,
245 // 0x80 0x81 0x82 0x83 0x84 0x85 0x86 0x87
246 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
247 // 0x88 0x89 0x8a 0x8b 0x8c 0x8d 0x8e 0x8f
248 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
249 // 0x90 0x91 0x92 0x93 0x94 0x95 0x96 0x97
250 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
251 // 0x98 0x99 0x9a 0x9b 0x9c 0x9d 0x9e 0x9f
252 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
253 // 0xa0 0xa1 0xa2 0xa3 0xa4 0xa5 0xa6 0xa7
254 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
255 // 0xa8 0xa9 0xaa 0xab 0xac 0xad 0xae 0xaf
256 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
257 // 0xb0 0xb1 0xb2 0xb3 0xb4 0xb5 0xb6 0xb7
258 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
259 // 0xb8 0xb9 0xba 0xbb 0xbc 0xbd 0xbe 0xbf
260 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
// 0xc0 and 0xc1 are UNSAFE rather than SEQ2.
261 // 0xc0 0xc1 0xc2 0xc3 0xc4 0xc5 0xc6 0xc7
262 UNSAFE
, UNSAFE
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
,
263 // 0xc8 0xc9 0xca 0xcb 0xcc 0xcd 0xce 0xcf
264 SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
,
265 // 0xd0 0xd1 0xd2 0xd3 0xd4 0xd5 0xd6 0xd7
266 SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
,
267 // 0xd8 0xd9 0xda 0xdb 0xdc 0xdd 0xde 0xdf
268 SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
,
269 // 0xe0 0xe1 0xe2 0xe3 0xe4 0xe5 0xe6 0xe7
270 SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
,
271 // 0xe8 0xe9 0xea 0xeb 0xec 0xed 0xee 0xef
272 SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
,
// Only 0xf0-0xf4 are SEQ4; 0xf5 and above are UNSAFE.
273 // 0xf0 0xf1 0xf2 0xf3 0xf4 0xf5 0xf6 0xf7
274 SEQ4
, SEQ4
, SEQ4
, SEQ4
, SEQ4
, UNSAFE
, UNSAFE
, UNSAFE
,
275 // 0xf8 0xf9 0xfa 0xfb 0xfc 0xfd 0xfe 0xff
276 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
279 // Test if the 3 characters of s from offset i are '%', one of [89abAB]
// s: string to inspect; i: offset of the first of three characters to test.
// Reads s[i], s[i + 1] and s[i + 2] without any bounds check of its own, so
// the caller must ensure those three characters exist.
282 encoded_ucont(const std::string
& s
, size_t i
)
// True when s[i] is '%', s[i + 1] is classified OK89AB in url_chars, and
// s[i + 2] satisfies C_isxdigit().
284 return s
[i
] == '%' &&
285 url_chars
[static_cast<unsigned char>(s
[i
+ 1])] == OK89AB
&&
286 C_isxdigit(s
[i
+ 2]);
291 * Undo RFC3986 escaping which doesn't affect semantics in practice, to make
292 * a prettier version of a URL to show the user, but which should still work
293 * if copied and pasted.
// url: the URL to prettify; it is taken by non-const reference and rebuilt by
// appending pieces of `in` to it.  Only '%' escapes located before
// pretty_limit (the first '?' or '#', if any) are considered, and the
// url_chars classification of the decoded byte selects how each is handled.
296 url_prettify(std::string
& url
)
298 size_t pcent
= url
.find('%');
299 // Fast path for URLs without a '%' in.
300 if (pcent
== std::string::npos
)
306 // Don't try to decode the query or fragment, and don't try to decode if
307 // there aren't 2 characters after the '%'.
// NOTE(review): url.size() - 2 is unsigned arithmetic, so it wraps to a huge
// value when url is shorter than 2 characters (e.g. a lone "%"); confirm that
// case is handled acceptably.
308 size_t pretty_limit
= std::min(url
.find_first_of("?#"), url
.size() - 2);
309 if (pcent
>= pretty_limit
)
// npos means the position of the first single '/' has not been looked up yet.
312 size_t slash
= std::string::npos
;
316 url
.reserve(in
.size());
318 // We've checked there are at least two bytes after the '%' already.
319 if (C_isxdigit(in
[pcent
+ 1]) && C_isxdigit(in
[pcent
+ 2])) {
320 unsigned char ch
= hex_decode(in
[pcent
+ 1], in
[pcent
+ 2]);
// Dispatch on the classification of the decoded byte.
322 switch (url_chars
[ch
]) {
// Needs one further %XX triplet accepted by encoded_ucont() straight after
// this escape.
327 if (in
.size() - (pcent
+ 2) < 3 ||
328 !encoded_ucont(in
, pcent
+ 3)) {
332 url
.append(in
, start
, pcent
- start
);
335 ch
= hex_decode(in
[pcent
+ 1], in
[pcent
+ 2]);
// Needs two further triplets accepted by encoded_ucont().  A lead byte of
// 0xe0 is also rejected when the next triplet's first hex digit is '8' or '9'
// (encoded_ucont() has already limited that digit to [89abAB]) - presumably
// because that would be an overlong encoding.
339 if (in
.size() - (pcent
+ 2) < 3 * 2 ||
340 !encoded_ucont(in
, pcent
+ 3) ||
341 !encoded_ucont(in
, pcent
+ 6) ||
342 (ch
== 0xe0 && in
[pcent
+ 4] <= '9')) {
346 url
.append(in
, start
, pcent
- start
);
349 ch
= hex_decode(in
[pcent
+ 1], in
[pcent
+ 2]);
352 ch
= hex_decode(in
[pcent
+ 1], in
[pcent
+ 2]);
// Needs three further triplets accepted by encoded_ucont().  Also rejected:
// lead byte 0xf0 when the next triplet's first hex digit is '8', and lead byte
// 0xf4 unless that digit is '8' - presumably overlong encodings and values
// beyond the Unicode range respectively.
356 if (in
.size() - (pcent
+ 2) < 3 * 3 ||
357 !encoded_ucont(in
, pcent
+ 3) ||
358 !encoded_ucont(in
, pcent
+ 6) ||
359 !encoded_ucont(in
, pcent
+ 9) ||
360 (ch
== 0xf0 && in
[pcent
+ 4] == '8') ||
361 (ch
== 0xf4 && in
[pcent
+ 4] >= '9')) {
365 url
.append(in
, start
, pcent
- start
);
368 ch
= hex_decode(in
[pcent
+ 1], in
[pcent
+ 2]);
371 ch
= hex_decode(in
[pcent
+ 1], in
[pcent
+ 2]);
374 ch
= hex_decode(in
[pcent
+ 1], in
[pcent
+ 2]);
378 // ':' is safe to decode if there is a single '/' earlier in
380 if (slash
== std::string::npos
) {
381 // Lazily set slash to the position of the first single '/'.
382 const char * d
= in
.data();
// Look for a '/' between offset `slash` and pretty_limit.
385 const void* s
= std::memchr(d
+ slash
, '/',
386 pretty_limit
- slash
);
// Convert the pointer returned by memchr() back into an offset within `in`.
391 slash
= reinterpret_cast<const char *>(s
) - d
;
// The test below holds for a '/' which is the last character or is not
// followed by another '/'; the while loop advances `slash` past a run of
// consecutive '/' characters.
392 if (slash
== in
.size() - 1 || d
[slash
+ 1] != '/')
395 while (++slash
< in
.size() - 1 && d
[slash
] == '/') { }
// Only treated as safe when this '%' comes after the slash position found.
398 safe
= (pcent
> slash
);
403 url
.append(in
, start
, pcent
- start
);
// Find the next '%' to consider.
413 pcent
= in
.find('%', pcent
);
// No further '%' before pretty_limit (this also covers npos): copy the rest
// of `in` from `start` onwards unchanged.
415 if (pcent
>= pretty_limit
) {
416 url
.append(in
, start
, std::string::npos
);
422 #endif // OMEGA_INCLUDED_URLDECODE_H