bootstrap with newer autotools
[xapian.git] / xapian-applications / omega / urldecode.h
blobc51fcab3c44f798b06ed4ac10744d11e9dc412c5
1 /** @file
2 * @brief URL decoding as described by RFC3986.
3 */
4 /* Copyright (C) 2011,2012,2015,2022 Olly Betts
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
25 #ifndef OMEGA_INCLUDED_URLDECODE_H
26 #define OMEGA_INCLUDED_URLDECODE_H
28 #include <algorithm>
29 #include <cstdio>
30 #include <cstring>
31 #include <string>
32 #include "stringutils.h"
/** Callback functor which url_decode() invokes once per decoded parameter.
 *
 *  Only the call operator's declaration lives in this header; its definition
 *  is provided elsewhere.  url_decode() passes the decoded parameter name as
 *  the first argument and the decoded value as the second.
 */
struct CGIParameterHandler {
    void operator()(const std::string&, const std::string&) const;
};
38 template<typename I>
39 inline void
40 url_decode(const CGIParameterHandler & handle_parameter, I begin, I end)
42 bool seen_equals = false;
43 std::string var, val;
44 while (begin != end) {
45 unsigned char ch = *begin;
46 ++begin;
47 process_ch:
48 if (ch == '&') {
49 if (!seen_equals)
50 swap(var, val);
51 if (!var.empty())
52 handle_parameter(var, val);
53 var.resize(0);
54 val.resize(0);
55 seen_equals = false;
56 continue;
59 switch (ch) {
60 case '%': {
61 if (begin == end)
62 break;
63 unsigned char hex1 = *begin;
64 ++begin;
65 if (begin == end || !C_isxdigit(hex1)) {
66 val += ch;
67 ch = hex1;
68 if (begin == end)
69 break;
70 goto process_ch;
72 unsigned char hex2 = *begin;
73 ++begin;
74 if (!C_isxdigit(hex2)) {
75 val += ch;
76 val += hex1;
77 ch = hex2;
78 if (begin == end)
79 break;
80 goto process_ch;
82 ch = hex_decode(hex1, hex2);
83 break;
85 case '+':
86 ch = ' ';
87 break;
88 case '=':
89 if (seen_equals)
90 break;
91 seen_equals = true;
92 swap(var, val);
93 continue;
95 val += ch;
97 if (!seen_equals)
98 swap(var, val);
99 if (!var.empty())
100 handle_parameter(var, val);
/** Minimal forward iterator over the bytes of a nul-terminated C string.
 *
 *  The end of the string is represented by a null internal pointer, so a
 *  default-constructed CStringItor serves as the end iterator for any string.
 *  Provides just the operations url_decode() needs.
 */
class CStringItor {
    /// Current position, or nullptr once the terminating nul is reached.
    const char* p = nullptr;

    // Post-increment is private and not defined in this header, so it can't
    // be used by clients of the class.
    void operator++(int);

  public:
    /// Construct an end iterator.
    CStringItor() { }

    /** Construct an iterator at the start of @a p_.
     *
     *  An empty string gives an iterator equal to the end iterator.  A null
     *  pointer is treated the same way rather than being dereferenced.
     */
    explicit CStringItor(const char * p_) : p(p_) {
        if (!p || !*p) p = nullptr;
    }

    /// Current byte; must not be called on an end iterator.
    unsigned char operator*() const { return *p; }

    /// Advance one byte, becoming the end iterator at the terminating nul.
    CStringItor & operator++() {
        if (!*++p) p = nullptr;
        return *this;
    }

    friend bool operator==(const CStringItor& a, const CStringItor& b);
    friend bool operator!=(const CStringItor& a, const CStringItor& b);
};

/// Two iterators are equal if they point at the same byte or are both at end.
inline bool
operator==(const CStringItor& a, const CStringItor& b)
{
    return a.p == b.p;
}

inline bool
operator!=(const CStringItor& a, const CStringItor& b)
{
    return !(a == b);
}
/** Minimal forward iterator which reads a bounded number of bytes from stdin.
 *
 *  StdinItor(n) yields at most @a n bytes, stopping early if stdin reaches
 *  end-of-file.  A default-constructed StdinItor is the end iterator.
 *  Provides just the operations url_decode() needs.
 *
 *  The first byte is read when the iterator is constructed, so that:
 *   - exactly @a n bytes (not n + 1) are consumed from stdin at most;
 *   - StdinItor(0), or an iterator on an already exhausted stdin, compares
 *     equal to the end iterator straight away instead of yielding a bogus
 *     byte (EOF converted to unsigned char).
 */
class StdinItor {
    /// Number of bytes which may still be read from stdin.
    size_t count = 0;

    /// The current byte (0-255), or EOF when there is nothing more to yield.
    int current = EOF;

    // Post-increment is private and not defined in this header, so it can't
    // be used by clients of the class.
    void operator++(int);

    /// Read the next byte if the budget allows, otherwise become "at end".
    void advance() {
        if (count == 0) {
            current = EOF;
        } else {
            --count;
            current = std::getchar();
        }
    }

  public:
    /// Construct an end iterator.
    StdinItor() { }

    /// Construct an iterator over (at most) the next @a count_ bytes of stdin.
    explicit StdinItor(size_t count_) : count(count_) { advance(); }

    /// Current byte; must not be called on an iterator which is at end.
    unsigned char operator*() const {
        return static_cast<unsigned char>(current);
    }

    /// Move on to the next byte, or to the end once the budget is used up.
    StdinItor & operator++() {
        advance();
        return *this;
    }

    friend bool operator==(const StdinItor& a, const StdinItor& b);
    friend bool operator!=(const StdinItor& a, const StdinItor& b);
};

/** Compare the current values only.
 *
 *  This is only meaningful for comparing against the end iterator, which is
 *  how url_decode() uses it.
 */
inline bool
operator==(const StdinItor& a, const StdinItor& b)
{
    return a.current == b.current;
}

inline bool
operator!=(const StdinItor& a, const StdinItor& b)
{
    return !(a == b);
}
180 // First group is RFC3986 reserved "gen-delims", except []@: (which are safe
181 // to decode if they occur after the "authority").
183 // Second group is RFC3986 reserved "sub-delims", except !$'()*,; (which are
184 // actually safe to decode in practice) and &+= (which are OK to decode if they
185 // aren't in the "query" part).
187 // We also need to leave an encoded "%" alone. We should probably leave an
188 // encoded "/" alone too (though we shouldn't encounter one in a database
189 // created by omindex, unless it was in the base URL specified by the user).
191 // This prettifying is aimed at URLs produced by omindex, so we don't currently
192 // try to decode the query or fragment parts of the URL at all. We can probably
193 // safely decode the query in a similar way, but also leaving &+= alone.
/// Classes of byte value, used as the entries of url_chars below.
enum {
    // Always unsafe.
    UNSAFE,
    // Always safe.
    OK,
    // Always safe (and 8, 9, a, b, A or B).
    OK89AB,
    // Safe after a '/'.
    INPATH,
    // Start of a 2 byte UTF-8 sequence.
    SEQ2,
    // Start of a 3 byte UTF-8 sequence.
    SEQ3,
    // Start of a 4 byte UTF-8 sequence.
    SEQ4
};

/** Classification of each byte value for url_prettify().
 *
 *  Indexed by the byte which a "%XX" escape decodes to; the entry says
 *  whether (and under what condition) that escape may be replaced by the
 *  byte itself.  The OK89AB class doubles as a test for the characters
 *  8, 9, a, b, A and B, which encoded_ucont() uses to recognise the first
 *  hex digit of an encoded UTF-8 continuation byte.
 */
static const char url_chars[256] = {
    // 0x00-0x07
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x08-0x0f
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x10-0x17
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x18-0x1f
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // ' '	!	"	#	$	%	&	'
    OK,     OK,     OK,     UNSAFE, OK,     UNSAFE, OK,     OK,
    // (	)	*	+	,	-	.	/
    OK,     OK,     OK,     OK,     OK,     OK,     OK,     UNSAFE,
    // 0	1	2	3	4	5	6	7
    OK,     OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // 8	9	:	;	<	=	>	?
    OK89AB, OK89AB, INPATH, OK,     OK,     OK,     OK,     UNSAFE,
    // @	A	B	C	D	E	F	G
    INPATH, OK89AB, OK89AB, OK,     OK,     OK,     OK,     OK,
    // H	I	J	K	L	M	N	O
    OK,     OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // P	Q	R	S	T	U	V	W
    OK,     OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // X	Y	Z	[	\	]	^	_
    OK,     OK,     OK,     INPATH, OK,     INPATH, OK,     OK,
    // `	a	b	c	d	e	f	g
    OK,     OK89AB, OK89AB, OK,     OK,     OK,     OK,     OK,
    // h	i	j	k	l	m	n	o
    OK,     OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // p	q	r	s	t	u	v	w
    OK,     OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // x	y	z	{	|	}	~	0x7f
    OK,     OK,     OK,     OK,     OK,     OK,     OK,     UNSAFE,
    // 0x80 0x81 0x82 0x83 0x84 0x85 0x86 0x87
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x88 0x89 0x8a 0x8b 0x8c 0x8d 0x8e 0x8f
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x90 0x91 0x92 0x93 0x94 0x95 0x96 0x97
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x98 0x99 0x9a 0x9b 0x9c 0x9d 0x9e 0x9f
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xa0 0xa1 0xa2 0xa3 0xa4 0xa5 0xa6 0xa7
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xa8 0xa9 0xaa 0xab 0xac 0xad 0xae 0xaf
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xb0 0xb1 0xb2 0xb3 0xb4 0xb5 0xb6 0xb7
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xb8 0xb9 0xba 0xbb 0xbc 0xbd 0xbe 0xbf
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xc0 0xc1 0xc2 0xc3 0xc4 0xc5 0xc6 0xc7
    UNSAFE, UNSAFE, SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xc8 0xc9 0xca 0xcb 0xcc 0xcd 0xce 0xcf
    SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xd0 0xd1 0xd2 0xd3 0xd4 0xd5 0xd6 0xd7
    SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xd8 0xd9 0xda 0xdb 0xdc 0xdd 0xde 0xdf
    SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xe0 0xe1 0xe2 0xe3 0xe4 0xe5 0xe6 0xe7
    SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,
    // 0xe8 0xe9 0xea 0xeb 0xec 0xed 0xee 0xef
    SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,
    // 0xf0 0xf1 0xf2 0xf3 0xf4 0xf5 0xf6 0xf7
    SEQ4,   SEQ4,   SEQ4,   SEQ4,   SEQ4,   UNSAFE, UNSAFE, UNSAFE,
    // 0xf8 0xf9 0xfa 0xfb 0xfc 0xfd 0xfe 0xff
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE
};
279 // Test if the 3 characters of s from offset i are '%', one of [89abAB]
280 // and a hex digit.
281 inline bool
282 encoded_ucont(const std::string & s, size_t i)
284 return s[i] == '%' &&
285 url_chars[static_cast<unsigned char>(s[i + 1])] == OK89AB &&
286 C_isxdigit(s[i + 2]);
289 /** Prettify a URL.
291 * Undo RFC3986 escaping which doesn't affect semantics in practice, to make
292 * a prettier version of a URL to show the user, but which should still work
293 * if copied and pasted.
295 inline void
296 url_prettify(std::string & url)
298 size_t pcent = url.find('%');
299 // Fast path for URLs without a '%' in.
300 if (pcent == std::string::npos)
301 return;
303 if (url.size() < 3)
304 return;
306 // Don't try to decode the query or fragment, and don't try to decode if
307 // there aren't 2 characters after the '%'.
308 size_t pretty_limit = std::min(url.find_first_of("?#"), url.size() - 2);
309 if (pcent >= pretty_limit)
310 return;
312 size_t slash = std::string::npos;
313 size_t start = 0;
314 std::string in;
315 swap(in, url);
316 url.reserve(in.size());
317 while (true) {
318 // We've checked there are at least two bytes after the '%' already.
319 if (C_isxdigit(in[pcent + 1]) && C_isxdigit(in[pcent + 2])) {
320 unsigned char ch = hex_decode(in[pcent + 1], in[pcent + 2]);
321 bool safe = true;
322 switch (url_chars[ch]) {
323 case UNSAFE:
324 safe = false;
325 break;
326 case SEQ2:
327 if (in.size() - (pcent + 2) < 3 ||
328 !encoded_ucont(in, pcent + 3)) {
329 safe = false;
330 break;
332 url.append(in, start, pcent - start);
333 url += char(ch);
334 pcent += 3;
335 ch = hex_decode(in[pcent + 1], in[pcent + 2]);
336 start = pcent;
337 break;
338 case SEQ3:
339 if (in.size() - (pcent + 2) < 3 * 2 ||
340 !encoded_ucont(in, pcent + 3) ||
341 !encoded_ucont(in, pcent + 6) ||
342 (ch == 0xe0 && in[pcent + 4] <= '9')) {
343 safe = false;
344 break;
346 url.append(in, start, pcent - start);
347 url += char(ch);
348 pcent += 3;
349 ch = hex_decode(in[pcent + 1], in[pcent + 2]);
350 url += char(ch);
351 pcent += 3;
352 ch = hex_decode(in[pcent + 1], in[pcent + 2]);
353 start = pcent;
354 break;
355 case SEQ4:
356 if (in.size() - (pcent + 2) < 3 * 3 ||
357 !encoded_ucont(in, pcent + 3) ||
358 !encoded_ucont(in, pcent + 6) ||
359 !encoded_ucont(in, pcent + 9) ||
360 (ch == 0xf0 && in[pcent + 4] == '8') ||
361 (ch == 0xf4 && in[pcent + 4] >= '9')) {
362 safe = false;
363 break;
365 url.append(in, start, pcent - start);
366 url += char(ch);
367 pcent += 3;
368 ch = hex_decode(in[pcent + 1], in[pcent + 2]);
369 url += char(ch);
370 pcent += 3;
371 ch = hex_decode(in[pcent + 1], in[pcent + 2]);
372 url += char(ch);
373 pcent += 3;
374 ch = hex_decode(in[pcent + 1], in[pcent + 2]);
375 start = pcent;
376 break;
377 case INPATH:
378 // ':' is safe to decode if there is a single '/' earlier in
379 // the URL.
380 if (slash == std::string::npos) {
381 // Lazily set slash to the position of the first single '/'.
382 const char * d = in.data();
383 slash = 0;
384 while (true) {
385 const void* s = std::memchr(d + slash, '/',
386 pretty_limit - slash);
387 if (s == NULL) {
388 slash = in.size();
389 break;
391 slash = reinterpret_cast<const char *>(s) - d;
392 if (slash == in.size() - 1 || d[slash + 1] != '/')
393 break;
394 ++slash;
395 while (++slash < in.size() - 1 && d[slash] == '/') { }
398 safe = (pcent > slash);
399 break;
402 if (safe) {
403 url.append(in, start, pcent - start);
404 url += char(ch);
405 pcent += 3;
406 start = pcent;
407 } else {
408 pcent += 3;
410 } else {
411 ++pcent;
413 pcent = in.find('%', pcent);
415 if (pcent >= pretty_limit) {
416 url.append(in, start, std::string::npos);
417 return;
422 #endif // OMEGA_INCLUDED_URLDECODE_H