xapian-applications/omega/urldecode.h

   1 /** @file
   2  * @brief URL decoding as described by RFC3986.
   3  */
   4 /* Copyright (C) 2011,2012,2015,2022 Olly Betts
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 #ifndef OMEGA_INCLUDED_URLDECODE_H
  26 #define OMEGA_INCLUDED_URLDECODE_H
  27
  28 #include <algorithm>
  29 #include <cstdio>
  30 #include <cstring>
  31 #include <string>
  32 #include "stringutils.h"
  33
/** Callback invoked by url_decode() for each decoded CGI parameter.
 *
 *  Only the declaration lives in this header; operator() is defined
 *  elsewhere.
 */
struct CGIParameterHandler {
    /// Called with the decoded parameter name and the decoded value.
    void operator()(const std::string&, const std::string&) const;
};
  37
  38 template<typename I>
  39 inline void
  40 url_decode(const CGIParameterHandler & handle_parameter, I begin, I end)
  41 {
  42     bool seen_equals = false;
  43     std::string var, val;
  44     while (begin != end) {
  45         unsigned char ch = *begin;
  46         ++begin;
  47 process_ch:
  48         if (ch == '&') {
  49             if (!seen_equals)
  50                 swap(var, val);
  51             if (!var.empty())
  52                 handle_parameter(var, val);
  53             var.resize(0);
  54             val.resize(0);
  55             seen_equals = false;
  56             continue;
  57         }
  58
  59         switch (ch) {
  60             case '%': {
  61                 if (begin == end)
  62                     break;
  63                 unsigned char hex1 = *begin;
  64                 ++begin;
  65                 if (begin == end || !C_isxdigit(hex1)) {
  66                     val += ch;
  67                     ch = hex1;
  68                     if (begin == end)
  69                         break;
  70                     goto process_ch;
  71                 }
  72                 unsigned char hex2 = *begin;
  73                 ++begin;
  74                 if (!C_isxdigit(hex2)) {
  75                     val += ch;
  76                     val += hex1;
  77                     ch = hex2;
  78                     if (begin == end)
  79                         break;
  80                     goto process_ch;
  81                 }
  82                 ch = hex_decode(hex1, hex2);
  83                 break;
  84             }
  85             case '+':
  86                 ch = ' ';
  87                 break;
  88             case '=':
  89                 if (seen_equals)
  90                     break;
  91                 seen_equals = true;
  92                 swap(var, val);
  93                 continue;
  94         }
  95         val += ch;
  96     }
  97     if (!seen_equals)
  98         swap(var, val);
  99     if (!var.empty())
 100         handle_parameter(var, val);
 101 }
 102
 103 class CStringItor {
 104     const char* p = NULL;
 105
 106     void operator++(int);
 107
 108   public:
 109     CStringItor() { }
 110
 111     explicit CStringItor(const char * p_) : p(p_) {
 112         if (!*p) p = NULL;
 113     }
 114
 115     unsigned char operator*() const { return *p; }
 116
 117     CStringItor & operator++() {
 118         if (!*++p) p = NULL;
 119         return *this;
 120     }
 121
 122     friend bool operator==(const CStringItor& a, const CStringItor& b);
 123     friend bool operator!=(const CStringItor& a, const CStringItor& b);
 124 };
 125
 126 inline bool
 127 operator==(const CStringItor& a, const CStringItor& b)
 128 {
 129     return a.p == b.p;
 130 }
 131
 132 inline bool
 133 operator!=(const CStringItor& a, const CStringItor& b)
 134 {
 135     return !(a == b);
 136 }
 137
 138 class StdinItor {
 139     size_t count;
 140
 141     mutable int current = EOF;
 142
 143     void operator++(int);
 144
 145   public:
 146     StdinItor() { }
 147
 148     explicit StdinItor(size_t count_) : count(count_), current(256) { }
 149
 150     unsigned char operator*() const {
 151         if (current == 256)
 152             current = std::getchar();
 153         return current;
 154     }
 155
 156     StdinItor & operator++() {
 157         if (count--)
 158             current = std::getchar();
 159         else
 160             current = EOF;
 161         return *this;
 162     }
 163
 164     friend bool operator==(const StdinItor& a, const StdinItor& b);
 165     friend bool operator!=(const StdinItor& a, const StdinItor& b);
 166 };
 167
 168 inline bool
 169 operator==(const StdinItor& a, const StdinItor& b)
 170 {
 171     return a.current == b.current;
 172 }
 173
 174 inline bool
 175 operator!=(const StdinItor& a, const StdinItor& b)
 176 {
 177     return !(a == b);
 178 }
 179
// First group is RFC3986 reserved "gen-delims", except []@: (which are safe
// to decode if they occur after the "authority").
 182 //
 183 // Second group is RFC3986 reserved "sub-delims", except !$'()*,; (which are
 184 // actually safe to decode in practice) and &+= (which are OK to decode if they
 185 // aren't in the "query" part).
 186 //
 187 // We also need to leave an encoded "%" alone.  We should probably leave an
 188 // encoded "/" alone too (though we shouldn't encounter one in a database
 189 // created by omindex, unless it was in the base URL specified by the user).
 190 //
 191 // This prettifying is aimed at URLs produced by omindex, so we don't currently
 192 // try to decode the query or fragment parts of the URL at all.  We can probably
 193 // safely decode the query in a similar way, but also leaving &+= alone.
 194
 195 enum {
 196     // Always unsafe.
 197     UNSAFE,
 198     // Always safe.
 199     OK,
 200     // Always safe (and 8, 9, a, b, A or B).
 201     OK89AB,
 202     // Safe after a '/'.
 203     INPATH,
 204     // Start of a 2 byte UTF-8 sequence.
 205     SEQ2,
 206     // Start of a 3 byte UTF-8 sequence.
 207     SEQ3,
 208     // Start of a 4 byte UTF-8 sequence.
 209     SEQ4
 210 };
 211
// Classification of each possible decoded byte value, indexed by the byte
// value (as an unsigned char).  Each entry is one of the enum values above.
//
// url_prettify() looks up the byte a "%XX" escape decodes to in order to
// decide whether to decode it; encoded_ucont() uses the OK89AB entries to
// recognise the first hex digit of an encoded UTF-8 continuation byte.
//
// Bytes 0x80-0xbf are UNSAFE here because they're only decoded as part of a
// sequence started by a SEQ2, SEQ3 or SEQ4 byte.  0xc0, 0xc1 and 0xf5-0xff
// can't start a valid UTF-8 sequence, so they're UNSAFE too.
static const char url_chars[256] = {
    // 0x00-0x07
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x08-0x0f
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x10-0x17
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x18-0x1f
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // ' '      !       "       #       $       %       &       '
    OK,         OK,     OK,     UNSAFE, OK,     UNSAFE, OK,     OK,
    // (        )       *       +       ,       -       .       /
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     UNSAFE,
    // 0        1       2       3       4       5       6       7
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // 8        9       :       ;       <       =       >       ?
    OK89AB,     OK89AB, INPATH, OK,     OK,     OK,     OK,     UNSAFE,
    // @        A       B       C       D       E       F       G
    INPATH,     OK89AB, OK89AB, OK,     OK,     OK,     OK,     OK,
    // H        I       J       K       L       M       N       O
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // P        Q       R       S       T       U       V       W
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // X        Y       Z       [       \       ]       ^       _
    OK,         OK,     OK,     INPATH, OK,     INPATH, OK,     OK,
    // `        a       b       c       d       e       f       g
    OK,         OK89AB, OK89AB, OK,     OK,     OK,     OK,     OK,
    // h        i       j       k       l       m       n       o
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // p        q       r       s       t       u       v       w
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // x        y       z       {       |       }       ~       0x7f
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     UNSAFE,
    // 0x80     0x81    0x82    0x83    0x84    0x85    0x86    0x87
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x88     0x89    0x8a    0x8b    0x8c    0x8d    0x8e    0x8f
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x90     0x91    0x92    0x93    0x94    0x95    0x96    0x97
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x98     0x99    0x9a    0x9b    0x9c    0x9d    0x9e    0x9f
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xa0     0xa1    0xa2    0xa3    0xa4    0xa5    0xa6    0xa7
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xa8     0xa9    0xaa    0xab    0xac    0xad    0xae    0xaf
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xb0     0xb1    0xb2    0xb3    0xb4    0xb5    0xb6    0xb7
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xb8     0xb9    0xba    0xbb    0xbc    0xbd    0xbe    0xbf
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xc0     0xc1    0xc2    0xc3    0xc4    0xc5    0xc6    0xc7
    UNSAFE,     UNSAFE, SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xc8     0xc9    0xca    0xcb    0xcc    0xcd    0xce    0xcf
    SEQ2,       SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xd0     0xd1    0xd2    0xd3    0xd4    0xd5    0xd6    0xd7
    SEQ2,       SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xd8     0xd9    0xda    0xdb    0xdc    0xdd    0xde    0xdf
    SEQ2,       SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xe0     0xe1    0xe2    0xe3    0xe4    0xe5    0xe6    0xe7
    SEQ3,       SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,
    // 0xe8     0xe9    0xea    0xeb    0xec    0xed    0xee    0xef
    SEQ3,       SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,
    // 0xf0     0xf1    0xf2    0xf3    0xf4    0xf5    0xf6    0xf7
    SEQ4,       SEQ4,   SEQ4,   SEQ4,   SEQ4,   UNSAFE, UNSAFE, UNSAFE,
    // 0xf8     0xf9    0xfa    0xfb    0xfc    0xfd    0xfe    0xff
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE
};
 278
 279 // Test if the 3 characters of s from offset i are '%', one of [89abAB]
 280 // and a hex digit.
 281 inline bool
 282 encoded_ucont(const std::string & s, size_t i)
 283 {
 284     return s[i] == '%' &&
 285         url_chars[static_cast<unsigned char>(s[i + 1])] == OK89AB &&
 286         C_isxdigit(s[i + 2]);
 287 }
 288
 289 /** Prettify a URL.
 290  *
 291  *  Undo RFC3986 escaping which doesn't affect semantics in practice, to make
 292  *  a prettier version of a URL to show the user, but which should still work
 293  *  if copied and pasted.
 294  */
 295 inline void
 296 url_prettify(std::string & url)
 297 {
 298     size_t pcent = url.find('%');
 299     // Fast path for URLs without a '%' in.
 300     if (pcent == std::string::npos)
 301         return;
 302
 303     if (url.size() < 3)
 304         return;
 305
 306     // Don't try to decode the query or fragment, and don't try to decode if
 307     // there aren't 2 characters after the '%'.
 308     size_t pretty_limit = std::min(url.find_first_of("?#"), url.size() - 2);
 309     if (pcent >= pretty_limit)
 310         return;
 311
 312     size_t slash = std::string::npos;
 313     size_t start = 0;
 314     std::string in;
 315     swap(in, url);
 316     url.reserve(in.size());
 317     while (true) {
 318         // We've checked there are at least two bytes after the '%' already.
 319         if (C_isxdigit(in[pcent + 1]) && C_isxdigit(in[pcent + 2])) {
 320             unsigned char ch = hex_decode(in[pcent + 1], in[pcent + 2]);
 321             bool safe = true;
 322             switch (url_chars[ch]) {
 323                 case UNSAFE:
 324                     safe = false;
 325                     break;
 326                 case SEQ2:
 327                     if (in.size() - (pcent + 2) < 3 ||
 328                         !encoded_ucont(in, pcent + 3)) {
 329                         safe = false;
 330                         break;
 331                     }
 332                     url.append(in, start, pcent - start);
 333                     url += char(ch);
 334                     pcent += 3;
 335                     ch = hex_decode(in[pcent + 1], in[pcent + 2]);
 336                     start = pcent;
 337                     break;
 338                 case SEQ3:
 339                     if (in.size() - (pcent + 2) < 3 * 2 ||
 340                         !encoded_ucont(in, pcent + 3) ||
 341                         !encoded_ucont(in, pcent + 6) ||
 342                         (ch == 0xe0 && in[pcent + 4] <= '9')) {
 343                         safe = false;
 344                         break;
 345                     }
 346                     url.append(in, start, pcent - start);
 347                     url += char(ch);
 348                     pcent += 3;
 349                     ch = hex_decode(in[pcent + 1], in[pcent + 2]);
 350                     url += char(ch);
 351                     pcent += 3;
 352                     ch = hex_decode(in[pcent + 1], in[pcent + 2]);
 353                     start = pcent;
 354                     break;
 355                 case SEQ4:
 356                     if (in.size() - (pcent + 2) < 3 * 3 ||
 357                         !encoded_ucont(in, pcent + 3) ||
 358                         !encoded_ucont(in, pcent + 6) ||
 359                         !encoded_ucont(in, pcent + 9) ||
 360                         (ch == 0xf0 && in[pcent + 4] == '8') ||
 361                         (ch == 0xf4 && in[pcent + 4] >= '9')) {
 362                         safe = false;
 363                         break;
 364                     }
 365                     url.append(in, start, pcent - start);
 366                     url += char(ch);
 367                     pcent += 3;
 368                     ch = hex_decode(in[pcent + 1], in[pcent + 2]);
 369                     url += char(ch);
 370                     pcent += 3;
 371                     ch = hex_decode(in[pcent + 1], in[pcent + 2]);
 372                     url += char(ch);
 373                     pcent += 3;
 374                     ch = hex_decode(in[pcent + 1], in[pcent + 2]);
 375                     start = pcent;
 376                     break;
 377                 case INPATH:
 378                     // ':' is safe to decode if there is a single '/' earlier in
 379                     // the URL.
 380                     if (slash == std::string::npos) {
 381                         // Lazily set slash to the position of the first single '/'.
 382                         const char * d = in.data();
 383                         slash = 0;
 384                         while (true) {
 385                             const void* s = std::memchr(d + slash, '/',
 386                                                         pretty_limit - slash);
 387                             if (s == NULL) {
 388                                 slash = in.size();
 389                                 break;
 390                             }
 391                             slash = reinterpret_cast<const char *>(s) - d;
 392                             if (slash == in.size() - 1 || d[slash + 1] != '/')
 393                                 break;
 394                             ++slash;
 395                             while (++slash < in.size() - 1 && d[slash] == '/') { }
 396                         }
 397                     }
 398                     safe = (pcent > slash);
 399                     break;
 400             }
 401
 402             if (safe) {
 403                 url.append(in, start, pcent - start);
 404                 url += char(ch);
 405                 pcent += 3;
 406                 start = pcent;
 407             } else {
 408                 pcent += 3;
 409             }
 410         } else {
 411             ++pcent;
 412         }
 413         pcent = in.find('%', pcent);
 414
 415         if (pcent >= pretty_limit) {
 416             url.append(in, start, std::string::npos);
 417             return;
 418         }
 419     }
 420 }
 421
 422 #endif // OMEGA_INCLUDED_URLDECODE_H