2 * @brief URL decoding as described by RFC3986.
4 /* Copyright (C) 2011,2012,2015 Olly Betts
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #ifndef OMEGA_INCLUDED_URLDECODE_H
26 #define OMEGA_INCLUDED_URLDECODE_H
32 #include "stringutils.h"
34 struct CGIParameterHandler
{
35 void operator()(const std::string
&, const std::string
&) const;
40 url_decode(const CGIParameterHandler
& handle_parameter
, I begin
, I end
)
42 bool seen_equals
= false;
44 while (begin
!= end
) {
45 unsigned char ch
= *begin
;
52 handle_parameter(var
, val
);
63 unsigned char hex1
= *begin
;
65 if (begin
== end
|| !C_isxdigit(hex1
)) {
72 unsigned char newch
= hex_digit(hex1
);
73 unsigned char hex2
= *begin
;
75 if (!C_isxdigit(hex2
)) {
83 ch
= (newch
<< 4) | hex_digit(hex2
);
101 handle_parameter(var
, val
);
107 void operator++(int);
110 CStringItor() : p(NULL
) { }
112 explicit CStringItor(const char * p_
) : p(p_
) {
116 unsigned char operator*() const { return *p
; }
118 CStringItor
& operator++() {
123 friend bool operator==(const CStringItor
& a
, const CStringItor
& b
);
124 friend bool operator!=(const CStringItor
& a
, const CStringItor
& b
);
128 operator==(const CStringItor
& a
, const CStringItor
& b
)
134 operator!=(const CStringItor
& a
, const CStringItor
& b
)
144 void operator++(int);
147 StdinItor() : current(EOF
) { }
149 explicit StdinItor(size_t count_
) : count(count_
), current(256) { }
151 unsigned char operator*() const {
153 current
= std::getchar();
157 StdinItor
& operator++() {
159 current
= std::getchar();
165 friend bool operator==(const StdinItor
& a
, const StdinItor
& b
);
166 friend bool operator!=(const StdinItor
& a
, const StdinItor
& b
);
170 operator==(const StdinItor
& a
, const StdinItor
& b
)
172 return a
.current
== b
.current
;
176 operator!=(const StdinItor
& a
, const StdinItor
& b
)
181 // First group is RFC3986 reserved "gen-delims", except []@: (which are safe
182 // to decode if they occur after the "authority").
184 // Second group is RFC3986 reserved "sub-delims", except !$'()*,; (which are
185 // actually safe to decode in practice) and &+= (which are OK to decode if they
186 // aren't in the "query" part).
188 // We also need to leave an encoded "%" alone. We should probably leave an
189 // encoded "/" alone too (though we shouldn't encounter one in a database
190 // created by omindex, unless it was in the base URL specified by the user).
192 // This prettifying is aimed at URLs produced by omindex, so we don't currently
193 // try to decode the query or fragment parts of the URL at all. We can probably
194 // safely decode the query in a similar way, but also leaving &+= alone.
201 // Always safe (and 8, 9, a, b, A or B).
205 // Start of a 2 byte UTF-8 sequence.
207 // Start of a 3 byte UTF-8 sequence.
209 // Start of a 4 byte UTF-8 sequence.
213 static const char url_chars
[256] = {
215 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
217 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
219 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
221 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
223 OK
, OK
, OK
, UNSAFE
, OK
, UNSAFE
, OK
, OK
,
225 OK
, OK
, OK
, OK
, OK
, OK
, OK
, UNSAFE
,
227 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
229 OK89AB
, OK89AB
, INPATH
, OK
, OK
, OK
, OK
, UNSAFE
,
231 INPATH
, OK89AB
, OK89AB
, OK
, OK
, OK
, OK
, OK
,
233 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
235 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
237 OK
, OK
, OK
, INPATH
, OK
, INPATH
, OK
, OK
,
239 OK
, OK89AB
, OK89AB
, OK
, OK
, OK
, OK
, OK
,
241 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
243 OK
, OK
, OK
, OK
, OK
, OK
, OK
, OK
,
244 // x y z { | } ~ 0x7f
245 OK
, OK
, OK
, OK
, OK
, OK
, OK
, UNSAFE
,
246 // 0x80 0x81 0x82 0x83 0x84 0x85 0x86 0x87
247 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
248 // 0x88 0x89 0x8a 0x8b 0x8c 0x8d 0x8e 0x8f
249 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
250 // 0x90 0x91 0x92 0x93 0x94 0x95 0x96 0x97
251 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
252 // 0x98 0x99 0x9a 0x9b 0x9c 0x9d 0x9e 0x9f
253 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
254 // 0xa0 0xa1 0xa2 0xa3 0xa4 0xa5 0xa6 0xa7
255 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
256 // 0xa8 0xa9 0xaa 0xab 0xac 0xad 0xae 0xaf
257 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
258 // 0xb0 0xb1 0xb2 0xb3 0xb4 0xb5 0xb6 0xb7
259 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
260 // 0xb8 0xb9 0xba 0xbb 0xbc 0xbd 0xbe 0xbf
261 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
,
262 // 0xc0 0xc1 0xc2 0xc3 0xc4 0xc5 0xc6 0xc7
263 UNSAFE
, UNSAFE
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
,
264 // 0xc8 0xc9 0xca 0xcb 0xcc 0xcd 0xce 0xcf
265 SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
,
266 // 0xd0 0xd1 0xd2 0xd3 0xd4 0xd5 0xd6 0xd7
267 SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
,
268 // 0xd8 0xd9 0xda 0xdb 0xdc 0xdd 0xde 0xdf
269 SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
, SEQ2
,
270 // 0xe0 0xe1 0xe2 0xe3 0xe4 0xe5 0xe6 0xe7
271 SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
,
272 // 0xe8 0xe9 0xea 0xeb 0xec 0xed 0xee 0xef
273 SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
, SEQ3
,
274 // 0xf0 0xf1 0xf2 0xf3 0xf4 0xf5 0xf6 0xf7
275 SEQ4
, SEQ4
, SEQ4
, SEQ4
, SEQ4
, UNSAFE
, UNSAFE
, UNSAFE
,
276 // 0xf8 0xf9 0xfa 0xfb 0xfc 0xfd 0xfe 0xff
277 UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
, UNSAFE
280 // Test if the 3 characters of s from offset i are '%', one of [89abAB]
283 encoded_ucont(const std::string
& s
, size_t i
)
285 return s
[i
] == '%' &&
286 url_chars
[static_cast<unsigned char>(s
[i
+ 1])] == OK89AB
&&
287 C_isxdigit(s
[i
+ 2]);
292 * Undo RFC3986 escaping which doesn't affect semantics in practice, to make
293 * a prettier version of a URL to show the user, but which should still work
294 * if copied and pasted.
297 url_prettify(std::string
& url
)
299 size_t pcent
= url
.find('%');
300 // Fast path for URLs without a '%' in.
301 if (pcent
== std::string::npos
)
307 // Don't try to decode the query or fragment, and don't try to decode if
308 // there aren't 2 characters after the '%'.
309 size_t pretty_limit
= std::min(url
.find_first_of("?#"), url
.size() - 2);
310 if (pcent
>= pretty_limit
)
313 size_t slash
= std::string::npos
;
317 url
.reserve(in
.size());
319 // We've checked there are at least two bytes after the '%' already.
320 if (C_isxdigit(in
[pcent
+ 1]) && C_isxdigit(in
[pcent
+ 2])) {
321 int ch
= (hex_digit(in
[pcent
+ 1]) << 4);
322 ch
|= hex_digit(in
[pcent
+ 2]);
324 switch (url_chars
[ch
]) {
329 if (in
.size() - (pcent
+ 2) < 3 ||
330 !encoded_ucont(in
, pcent
+ 3)) {
334 url
.append(in
, start
, pcent
- start
);
337 ch
= (hex_digit(in
[pcent
+ 1]) << 4);
338 ch
|= hex_digit(in
[pcent
+ 2]);
342 if (in
.size() - (pcent
+ 2) < 3 * 2 ||
343 !encoded_ucont(in
, pcent
+ 3) ||
344 !encoded_ucont(in
, pcent
+ 6) ||
345 (ch
== 0xe0 && in
[pcent
+ 4] <= '9')) {
349 url
.append(in
, start
, pcent
- start
);
352 ch
= (hex_digit(in
[pcent
+ 1]) << 4);
353 ch
|= hex_digit(in
[pcent
+ 2]);
356 ch
= (hex_digit(in
[pcent
+ 1]) << 4);
357 ch
|= hex_digit(in
[pcent
+ 2]);
361 if (in
.size() - (pcent
+ 2) < 3 * 3 ||
362 !encoded_ucont(in
, pcent
+ 3) ||
363 !encoded_ucont(in
, pcent
+ 6) ||
364 !encoded_ucont(in
, pcent
+ 9) ||
365 (ch
== 0xf0 && in
[pcent
+ 4] == '8') ||
366 (ch
== 0xf4 && in
[pcent
+ 4] >= '9')) {
370 url
.append(in
, start
, pcent
- start
);
373 ch
= (hex_digit(in
[pcent
+ 1]) << 4);
374 ch
|= hex_digit(in
[pcent
+ 2]);
377 ch
= (hex_digit(in
[pcent
+ 1]) << 4);
378 ch
|= hex_digit(in
[pcent
+ 2]);
381 ch
= (hex_digit(in
[pcent
+ 1]) << 4);
382 ch
|= hex_digit(in
[pcent
+ 2]);
386 // ':' is safe to decode if there is a single '/' earlier in
388 if (slash
== std::string::npos
) {
389 // Lazily set slash to the position of the first single '/'.
390 const char * d
= in
.data();
393 const void* s
= std::memchr(d
+ slash
, '/',
394 pretty_limit
- slash
);
399 slash
= reinterpret_cast<const char *>(s
) - d
;
400 if (slash
== in
.size() - 1 || d
[slash
+ 1] != '/')
403 while (++slash
< in
.size() - 1 && d
[slash
] == '/') { }
406 safe
= (pcent
> slash
);
411 url
.append(in
, start
, pcent
- start
);
421 pcent
= in
.find('%', pcent
);
423 if (pcent
>= pretty_limit
) {
424 url
.append(in
, start
, std::string::npos
);
430 #endif // OMEGA_INCLUDED_URLDECODE_H