src/protocol/uri.c

   1 /* URL parser and translator; implementation of RFC 2396. */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <ctype.h>
   8 #include <errno.h>
   9 #ifdef HAVE_IDNA_H
  10 #include <idna.h>
  11 #endif
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <string.h>
  15 #include <sys/types.h>
  16 #ifdef HAVE_NETDB_H
  17 #include <netdb.h> /* OS/2 needs this after sys/types.h */
  18 #endif
  19
  20 #ifdef HAVE_SYS_SOCKET_H
  21 #include <sys/socket.h>
  22 #endif
  23 #ifdef HAVE_NETINET_IN_H
  24 #include <netinet/in.h>
  25 #endif
  26 #ifdef HAVE_ARPA_INET_H
  27 #include <arpa/inet.h>
  28 #endif
  29
  30 #include "elinks.h"
  31
  32 #include "main/object.h"
  33 #include "protocol/protocol.h"
  34 #include "protocol/uri.h"
  35 #include "util/conv.h"
  36 #include "util/error.h"
  37 #include "util/file.h"
  38 #include "util/hash.h"
  39 #include "util/memory.h"
  40 #include "util/string.h"
  41
  42
  43 static inline int
  44 end_of_dir(unsigned char c)
  45 {
  46         /* This used to check for c == ';' as well.  But section 3.3
  47          * of RFC 2396 explicitly says that parameters in a path
  48          * segment "are not significant to the parsing of relative
  49          * references."  */
  50         return c == POST_CHAR || c == '#' || c == '?';
  51 }
  52
  53 static inline int
  54 is_uri_dir_sep(const struct uri *uri, unsigned char pos)
  55 {
  56         return (uri->protocol == PROTOCOL_FILE ? dir_sep(pos) : pos == '/');
  57 }
  58
  59
  60 int
  61 is_in_domain(unsigned char *domain, unsigned char *server, int server_len)
  62 {
  63         int domain_len = strlen(domain);
  64         int len;
  65
  66         if (domain_len > server_len)
  67                 return 0;
  68
  69         if (domain_len == server_len)
  70                 return !c_strncasecmp(domain, server, server_len);
  71
  72         len = server_len - domain_len;
  73         if (server[len - 1] != '.')
  74                 return 0;
  75
  76         return !c_strncasecmp(domain, server + len, domain_len);
  77 }
  78
  79 int
  80 is_ip_address(const unsigned char *address, int addresslen)
  81 {
  82         /* The @address has well defined limits so it would be a shame to
  83          * allocate it. */
  84         unsigned char buffer[IP_ADDRESS_BUFFER_SIZE];
  85
  86         if (addresslen >= sizeof(buffer))
  87                 return 0;
  88
  89         safe_strncpy(buffer, address, addresslen + 1);
  90
  91 #ifdef HAVE_INET_PTON
  92 #ifdef CONFIG_IPV6
  93         {
  94                 struct sockaddr_in6 addr6;
  95
  96                 if (inet_pton(AF_INET6, buffer, &addr6.sin6_addr) > 0)
  97                         return 1;
  98         }
  99 #endif /* CONFIG_IPV6 */
 100         {
 101                 struct in_addr addr4;
 102
 103                 if (inet_pton(AF_INET, buffer, &addr4) > 0)
 104                         return 1;
 105         }
 106
 107         return 0;
 108 #else
 109         /* FIXME: Is this ever the case? */
 110         return 0;
 111 #endif /* HAVE_INET_PTON */
 112 }
 113
 114
 115 int
 116 end_with_known_tld(const unsigned char *s, int slen)
 117 {
 118         int i;
 119         static const unsigned char *const tld[] =
 120         { "com", "edu", "net",
 121           "org", "gov", "mil",
 122           "int", "biz", "arpa",
 123           "aero", "coop",
 124           "info", "museum",
 125           "name", "pro", NULL };
 126
 127         if (!slen) return -1;
 128         if (slen < 0) slen = strlen(s);
 129
 130         for (i = 0; tld[i]; i++) {
 131                 int tldlen = strlen(tld[i]);
 132                 int pos = slen - tldlen;
 133
 134                 if (pos >= 0 && !c_strncasecmp(&s[pos], tld[i], tldlen))
 135                         return pos;
 136         }
 137
 138         return -1;
 139 }
 140
 141 /* XXX: this function writes to @name. */
 142 static int
 143 check_whether_file_exists(unsigned char *name)
 144 {
 145         /* Check POST_CHAR etc ... */
 146         static const unsigned char chars[] = POST_CHAR_S "#?";
 147         int i;
 148         int namelen = strlen(name);
 149
 150         if (file_exists(name))
 151                 return namelen;
 152
 153         for (i = 0; i < sizeof(chars) - 1; i++) {
 154                 unsigned char *pos = memchr(name, chars[i], namelen);
 155                 int exists;
 156
 157                 if (!pos) continue;
 158
 159                 *pos = 0;
 160                 exists = file_exists(name);
 161                 *pos = chars[i];
 162
 163                 if (exists) {
 164                         return pos - name;
 165                 }
 166         }
 167
 168         return -1;
 169 }
 170
 171 /* Encodes URIs without encoding stuff like fragments and query separators. */
 172 static void
 173 encode_file_uri_string(struct string *string, unsigned char *uristring)
 174 {
 175         int filenamelen = check_whether_file_exists(uristring);
 176
 177         encode_uri_string(string, uristring, filenamelen, 0);
 178         if (filenamelen > 0) add_to_string(string, uristring + filenamelen);
 179 }
 180
 181
 182 static inline int
 183 get_protocol_length(const unsigned char *url)
 184 {
 185         unsigned char *end = (unsigned char *) url;
 186
 187         /* Seek the end of the protocol name if any. */
 188         /* RFC1738:
 189          * scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
 190          * (but per its recommendations we accept "upalpha" too) */
 191         while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.')
 192                 end++;
 193
 194         /* Now we make something to support our "IP version in protocol scheme
 195          * name" hack and silently chop off the last digit if it's there. The
 196          * IETF's not gonna notice I hope or it'd be going after us hard. */
 197         if (end != url && isdigit(end[-1]))
 198                 end--;
 199
 200         /* Also return 0 if there's no protocol name (@end == @url). */
 201         return (*end == ':' || isdigit(*end)) ? end - url : 0;
 202 }
 203
/* Split @uristring into its components and record them in @uri.
 *
 * @uri is cleared first.  What gets filled in afterwards are pointers into
 * @uristring together with lengths; nothing is copied and @uristring is not
 * written to by this function.
 *
 * Returns URI_ERRNO_OK when parsing went fine, otherwise the uri_errno value
 * for the first problem that was met.  On such an early return @uri is left
 * filled in only as far as parsing got. */
enum uri_errno
parse_uri(struct uri *uri, unsigned char *uristring)
{
        unsigned char *prefix_end, *host_end;
#ifdef CONFIG_IPV6
        unsigned char *lbracket, *rbracket;
#endif

        assertm(uristring != NULL, "No uri to parse.");
        memset(uri, 0, sizeof(*uri));

        /* Nothing to do for an empty url. */
        /* NOTE(review): this path returns a plain 0 rather than a named
         * uri_errno value; which enumerator 0 corresponds to is not visible
         * from here -- confirm it is the intended one. */
        if_assert_failed return 0;
        if (!*uristring) return URI_ERRNO_EMPTY;

        uri->string = uristring;
        uri->protocollen = get_protocol_length(uristring);

        /* Invalid */
        if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;

        /* Figure out whether the protocol is known */
        uri->protocol = get_protocol(struri(uri), uri->protocollen);

        prefix_end = uristring + uri->protocollen; /* ':' */

        /* Check if there's a digit after the protocol name. */
        if (isdigit(*prefix_end)) {
                uri->ip_family = uristring[uri->protocollen] - '0';
                prefix_end++;
        }
        if (*prefix_end != ':')
                return URI_ERRNO_INVALID_PROTOCOL;
        prefix_end++;

        /* Skip slashes */

        if (prefix_end[0] == '/' && prefix_end[1] == '/') {
                if (prefix_end[2] == '/'
                    && get_protocol_need_slash_after_host(uri->protocol))
                        return URI_ERRNO_TOO_MANY_SLASHES;

                prefix_end += 2;

        } else if (get_protocol_need_slashes(uri->protocol)) {
                return URI_ERRNO_NO_SLASHES;
        }

        if (get_protocol_free_syntax(uri->protocol)) {
                /* No further structure is looked for: everything after the
                 * prefix is the data. */
                uri->data = prefix_end;
                uri->datalen = strlen(prefix_end);
                return URI_ERRNO_OK;

        } else if (uri->protocol == PROTOCOL_FILE) {
                /* The data ends at the first '#' or POST_CHAR, or at the end
                 * of the string if there is neither. */
                int datalen = strcspn(prefix_end, "#" POST_CHAR_S);
                unsigned char *frag_or_post = prefix_end + datalen;

                /* Extract the fragment part. */
                /* NOTE(review): strcspn() returns a size_t, so @datalen is
                 * not expected to be negative and the else branch below looks
                 * unreachable for ordinary input. */
                if (datalen >= 0) {
                        if (*frag_or_post == '#') {
                                uri->fragment = frag_or_post + 1;
                                uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
                                frag_or_post = uri->fragment + uri->fragmentlen;
                        }
                        if (*frag_or_post == POST_CHAR) {
                                uri->post = frag_or_post + 1;
                        }
                } else {
                        datalen = strlen(prefix_end);
                }

                /* A bit of a special case, but using the "normal" host
                 * parsing seems a bit scary at this point. (see bug 107). */
                /* Only the 9 characters of "localhost" are skipped, so the
                 * data starts with the '/' that followed it. */
                if (datalen > 9 && !c_strncasecmp(prefix_end, "localhost/", 10)) {
                        prefix_end += 9;
                        datalen -= 9;
                }

                uri->data = prefix_end;
                uri->datalen = datalen;

                return URI_ERRNO_OK;
        }

        /* Isolate host */

#ifdef CONFIG_IPV6
        /* Get brackets enclosing IPv6 address */
        lbracket = strchr(prefix_end, '[');
        if (lbracket) {
                rbracket = strchr(lbracket, ']');
                /* [address] is handled only inside of hostname part (surprisingly). */
                if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/"))
                        uri->ipv6 = 1;
                else
                        lbracket = rbracket = NULL;
        } else {
                rbracket = NULL;
        }
#endif

        /* Possibly skip auth part */
        host_end = prefix_end + strcspn(prefix_end, "@");

        /* There is auth info only if an '@' shows up before the first '/'. */
        if (prefix_end + strcspn(prefix_end, "/") > host_end
            && *host_end) { /* we have auth info here */
                unsigned char *user_end;

                /* Allow '@' in the password component */
                /* Keep moving on to the next '@' for as long as there is one
                 * in front of the next '/' or '?', so that the last of them
                 * ends the auth part. */
                while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?"))
                        host_end = host_end + 1 + strcspn(host_end + 1, "@");

                user_end = strchr(prefix_end, ':');

                if (!user_end || user_end > host_end) {
                        /* No ':' inside the auth part: user name only. */
                        uri->user = prefix_end;
                        uri->userlen = host_end - prefix_end;
                } else {
                        /* The first ':' separates user name and password. */
                        uri->user = prefix_end;
                        uri->userlen = user_end - prefix_end;
                        uri->password = user_end + 1;
                        uri->passwordlen = host_end - user_end - 1;
                }
                prefix_end = host_end + 1;
        }

        /* Find the end of the host part.  For a bracketed IPv6 address the
         * search starts at the closing bracket, so that the ':'s inside the
         * address are not mistaken for the port separator. */
#ifdef CONFIG_IPV6
        if (uri->ipv6)
                host_end = rbracket + strcspn(rbracket, ":/?");
        else
#endif
                host_end = prefix_end + strcspn(prefix_end, ":/?");

#ifdef CONFIG_IPV6
        if (uri->ipv6) {
                int addrlen = rbracket - lbracket - 1;

                /* Check for valid length.
                 * addrlen >= sizeof(hostbuf) is theoretically impossible
                 * but I keep the test in case of... Safer, imho --Zas */
                assertm(addrlen >= 0 && addrlen < NI_MAXHOST,
                        "parse_uri(): addrlen value is bad (%d) for URL '%s'. "
                        "Problems are likely to be encountered. Please report "
                        "this, it is a security bug!", addrlen, uristring);
                if_assert_failed return URI_ERRNO_IPV6_SECURITY;

                /* The host is the address without the enclosing brackets. */
                uri->host = lbracket + 1;
                uri->hostlen = addrlen;
        } else
#endif
        {
                uri->host = prefix_end;
                uri->hostlen = host_end - prefix_end;

                /* Trim trailing '.'s */
                /* (Nothing is actually trimmed; a host name ending with '.'
                 * is reported as an error.) */
                if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
                        return URI_ERRNO_TRAILING_DOTS;
        }

        if (*host_end == ':') { /* we have port here */
                /* The port runs up to the next '/' or the end of string. */
                unsigned char *port_end = host_end + 1 + strcspn(host_end + 1, "/");

                host_end++;

                uri->port = host_end;
                uri->portlen = port_end - host_end;

                if (uri->portlen == 0)
                        return URI_ERRNO_NO_PORT_COLON;

                /* We only use 8 bits for portlen so better check */
                if (uri->portlen != port_end - host_end)
                        return URI_ERRNO_INVALID_PORT;

                /* test if port is number */
                /* TODO: possibly lookup for the service otherwise? --pasky */
                /* On success this loop leaves @host_end at @port_end. */
                for (; host_end < port_end; host_end++)
                        if (!isdigit(*host_end))
                                return URI_ERRNO_INVALID_PORT;

                /* Check valid port value, and let show an error message
                 * about invalid url syntax. */
                if (uri->port && uri->portlen) {
                        int n;

                        errno = 0;
                        n = strtol(uri->port, NULL, 10);
                        if (errno || !uri_port_is_valid(n))
                                return URI_ERRNO_INVALID_PORT;
                }
        }

        if (*host_end == '/') {
                /* The data starts after the slash that ends the host part. */
                host_end++;

        } else if (get_protocol_need_slash_after_host(uri->protocol)) {
                /* The need for slash after the host component depends on the
                 * need for a host component. -- The dangerous mind of Jonah */
                if (!uri->hostlen)
                        return URI_ERRNO_NO_HOST;

                return URI_ERRNO_NO_HOST_SLASH;
        }

        /* Look for #fragment or POST_CHAR */
        prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S);
        uri->data = host_end;
        uri->datalen = prefix_end - host_end;

        if (*prefix_end == '#') {
                uri->fragment = prefix_end + 1;
                uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
                prefix_end = uri->fragment + uri->fragmentlen;
        }

        /* No length is recorded for the post data; it is simply the rest of
         * the string. */
        if (*prefix_end == POST_CHAR) {
                uri->post = prefix_end + 1;
        }

        return URI_ERRNO_OK;
}
 425
 426 int
 427 get_uri_port(const struct uri *uri)
 428 {
 429         if (uri->port && uri->portlen) {
 430                 const unsigned char *end = uri->port;
 431                 int port = strtol(uri->port, (char **) &end, 10);
 432
 433                 if (end != uri->port) {
 434                         assert(uri_port_is_valid(port));
 435                         return port;
 436                 }
 437         }
 438
 439         return get_protocol_port(uri->protocol);
 440 }
 441
 442 #define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))
 443
 444 static inline int
 445 compare_component(const unsigned char *a, int alen,
 446                   const unsigned char *b, int blen)
 447 {
 448         /* Check that the length and the strings are both set or unset */
 449         if (alen != blen || !!a != !!b) return 0;
 450
 451         /* Both are unset so that will make a perfect match */
 452         if (!a || !alen) return 1;
 453
 454         /* Let the higher forces decide */
 455         return !memcmp(a, b, blen);
 456 }
 457
 458 #define wants(x) (components & (x))
 459
 460 int
 461 compare_uri(const struct uri *a, const struct uri *b,
 462             enum uri_component components)
 463 {
 464         if (a == b) return 1;
 465         if (!components) return 0;
 466
 467         assertm(can_compare_uri_components(components),
 468                 "compare_uri() is a work in progress. Component unsupported");
 469
 470         return (!wants(URI_PROTOCOL) || a->protocol == b->protocol)
 471                 && (!wants(URI_IP_FAMILY) || a->ip_family == b->ip_family)
 472                 && (!wants(URI_USER)
 473                     || compare_component(a->user, a->userlen, b->user, b->userlen))
 474                 && (!wants(URI_PASSWORD)
 475                     || compare_component(a->password, a->passwordlen, b->password, b->passwordlen))
 476                 && (!wants(URI_HOST)
 477                     || compare_component(a->host, a->hostlen, b->host, b->hostlen))
 478                 && (!wants(URI_PORT)
 479                     || compare_component(a->port, a->portlen, b->port, b->portlen))
 480                 && (!wants(URI_DATA)
 481                     || compare_component(a->data, a->datalen, b->data, b->datalen))
 482                 && (!wants(URI_FRAGMENT)
 483                     || compare_component(a->fragment, a->fragmentlen, b->fragment, b->fragmentlen))
 484                 && (!wants(URI_POST)
 485                     || compare_component(a->post, a->post ? strlen(a->post) : 0, b->post, b->post ? strlen(b->post) : 0));
 486 }
 487
 488
/* Append the parts of @uri that are selected by the @components bit mask to
 * @string, putting the separators between them back in.
 *
 * For a URI with unknown protocol @components is ignored and the whole URI
 * string is appended.  URI_PATH, URI_FILENAME and URI_QUERY end the function
 * right after their part has been added, so nothing that would come after it
 * (fragment, post data) is appended together with them.
 *
 * Returns @string, or on the early exits whatever the last add_*to_string()
 * call returned.  The return values of the other add_*to_string() calls are
 * not checked.
 *
 * We might need something more intelligent than this Swiss army knife. */
struct string *
add_uri_to_string(struct string *string, const struct uri *uri,
                  enum uri_component components)
{
        /* Custom or unknown keep the URI untouched. */
        if (uri->protocol == PROTOCOL_UNKNOWN)
                return add_to_string(string, struri(uri));

        if (wants(URI_PROTOCOL)) {
                add_bytes_to_string(string, uri->string, uri->protocollen);
                /* The IP family digit directly follows the protocol name. */
                if (wants(URI_IP_FAMILY) && uri->ip_family)
                        add_long_to_string(string, uri->ip_family);
                add_char_to_string(string, ':');
                if (get_protocol_need_slashes(uri->protocol))
                        add_to_string(string, "//");
        }

        if (wants(URI_USER) && uri->userlen) {
                add_bytes_to_string(string, uri->user, uri->userlen);

                if (wants(URI_PASSWORD) && uri->passwordlen) {
                        add_char_to_string(string, ':');
                        add_bytes_to_string(string, uri->password,
                                                    uri->passwordlen);
                }

                add_char_to_string(string, '@');

        } else if (wants(URI_PASSWORD) && uri->passwordlen) {
                /* A password without user name is added bare, with neither
                 * ':' nor '@'. */
                add_bytes_to_string(string, uri->password, uri->passwordlen);
        }

        if (wants(URI_HOST) && uri->hostlen) {
                /* Cleared once the host has been added in IDN encoded form,
                 * so that it is not added a second time. */
                int add_host = 1;

#ifdef CONFIG_IPV6
                /* Rationale for wants(URI_PORT): The [notation] was invented
                 * so that you can have an IPv6 addy and a port together. So
                 * we want to use it when that happens, otherwise we need not
                 * bother (that happens only when we want it for DNS anyway).
                 * I insist on an implied elegancy of this way, but YMMV. ;-)
                 * --pasky */
                if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '[');
#endif
#ifdef CONFIG_IDN
                /* Support for the GNU International Domain Name library.
                 *
                 * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html
                 *
                 * Now it is probably not perfect because idna_to_ascii_lz()
                 * will be using a ``zero terminated input string encoded in
                 * the current locale's character set''. Anyway I don't know
                 * how to convert anything to UTF-8 or Unicode. --jonas */
                if (wants(URI_IDN)) {
                        /* idna_to_ascii_lz() needs a terminated string, so
                         * work on a copy of the host. */
                        unsigned char *host = memacpy(uri->host, uri->hostlen);

                        if (host) {
                                char *idname;
                                int code = idna_to_ascii_lz(host, &idname, 0);

                                /* FIXME: Return NULL if it coughed? --jonas */
                                if (code == IDNA_SUCCESS) {
                                        add_to_string(string, idname);
                                        /* @idname is released with plain
                                         * free(), unlike @host below. */
                                        free(idname);
                                        add_host = 0;
                                }

                                mem_free(host);
                        }
                }

#endif
                /* Also the fallback when the IDN conversion did not work. */
                if (add_host)
                        add_bytes_to_string(string, uri->host, uri->hostlen);

#ifdef CONFIG_IPV6
                if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, ']');
#endif
        }

        if (wants(URI_PORT) || wants(URI_DEFAULT_PORT)) {
                /* An explicitly given port always wins over the default. */
                if (uri->portlen) {
                        add_char_to_string(string, ':');
                        add_bytes_to_string(string, uri->port, uri->portlen);

                } else if (wants(URI_DEFAULT_PORT)
                           && uri->protocol != PROTOCOL_USER) {
                        /* For user protocols we don't know a default port.
                         * Should user protocols ports be configurable? */
                        int port = get_protocol_port(uri->protocol);

                        add_char_to_string(string, ':');
                        add_long_to_string(string, port);
                }
        }

        /* Only add slash if we need to separate */
        /* That is, if something besides the data and the port was asked
         * for as well. */
        if ((wants(URI_DATA) || wants(URI_POST) || components == URI_HTTP_REFERRER_HOST)
            && wants(~(URI_DATA | URI_PORT))
            && get_protocol_need_slash_after_host(uri->protocol))
                add_char_to_string(string, '/');

        if (wants(URI_DATA) && uri->datalen)
                add_bytes_to_string(string, uri->data, uri->datalen);

        /* We can not test uri->datalen here since we need to always
         * add '/'. */
        if (wants(URI_PATH) || wants(URI_FILENAME)) {
                const unsigned char *filename = uri->data;
                const unsigned char *pos;

                assertm(!wants(URI_FILENAME) || components == URI_FILENAME,
                        "URI_FILENAME should be used alone %d", components);

                if (wants(URI_PATH) && !is_uri_dir_sep(uri, *filename)) {
#ifdef CONFIG_OS_WIN32
                        if (uri->protocol != PROTOCOL_FILE)
#endif
                        /* FIXME: Add correct separator */
                        add_char_to_string(string, '/');
                }

                if (!uri->datalen) return string;

                /* Scan up to the end of the path.  For URI_FILENAME the start
                 * is moved past every directory separator on the way, which
                 * leaves only the last path segment. */
                for (pos = filename; *pos && !end_of_dir(*pos); pos++)
                        if (wants(URI_FILENAME) && is_uri_dir_sep(uri, *pos))
                                filename = pos + 1;

                return add_bytes_to_string(string, filename, pos - filename);
        }

        if (wants(URI_QUERY) && uri->datalen) {
                const unsigned char *query = memchr(uri->data, '?', uri->datalen);

                assertm(URI_QUERY == components,
                        "URI_QUERY should be used alone %d", components);

                if (!query) return string;

                /* Skip the '?' itself. */
                query++;
                /* Check fragment and POST_CHAR */
                return add_bytes_to_string(string, query, strcspn(query, "#" POST_CHAR_S));
        }

        if (wants(URI_FRAGMENT) && uri->fragmentlen) {
                add_char_to_string(string, '#');
                add_bytes_to_string(string, uri->fragment, uri->fragmentlen);
        }

        if (wants(URI_POST) && uri->post) {
                add_char_to_string(string, POST_CHAR);
                add_to_string(string, uri->post);

        } else if (wants(URI_POST_INFO) && uri->post) {
                /* Instead of the post data itself only add a short note
                 * telling what kind of data it starts with. */
                if (!strncmp(uri->post, "text/plain", 10)) {
                        add_to_string(string, " (PLAIN TEXT DATA)");

                } else if (!strncmp(uri->post, "multipart/form-data;", 20)) {
                        add_to_string(string, " (MULTIPART FORM DATA)");

                } else {
                        add_to_string(string, " (POST DATA)");
                }

        }

        return string;
}
 658
 659 #undef wants
 660
 661 unsigned char *
 662 get_uri_string(const struct uri *uri, enum uri_component components)
 663 {
 664         struct string string;
 665
 666         if (init_string(&string)
 667             && add_uri_to_string(&string, uri, components))
 668                 return string.source;
 669
 670         done_string(&string);
 671         return NULL;
 672 }
 673
 674
 675 struct string *
 676 add_string_uri_to_string(struct string *string, unsigned char *uristring,
 677                          enum uri_component components)
 678 {
 679         struct uri uri;
 680
 681         if (parse_uri(&uri, uristring) != URI_ERRNO_OK)
 682                 return NULL;
 683
 684         return add_uri_to_string(string, &uri, components);
 685 }
 686
 687
/* Shorthands for the two ways of calling normalize_uri(): with a plain
 * string that still has to be parsed, or with an already parsed URI, in which
 * case its own string is the one that gets normalized. */
#define normalize_uri_reparse(str)      normalize_uri(NULL, str)
#define normalize_uri_noparse(uri)      normalize_uri(uri, struri(uri))

/* Normalize @uristring in place: the protocol name and the host are turned
 * to lowercase and "." and ".." path segments are removed.  "//" is reduced
 * to "/" as well, unless the protocol wants double slashes to be kept.
 *
 * If @uri is NULL, @uristring is parsed here into a local struct uri.
 * Otherwise @uri is used as it is for the first round, without parsing
 * @uristring.  For as long as the protocol found is PROTOCOL_PROXY, the data
 * part is parsed again as a URI of its own (into the same struct uri), so it
 * is the innermost URI whose path gets cleaned up.
 *
 * Always returns @uristring, also when parsing failed or when there was
 * nothing to do. */
unsigned char *
normalize_uri(struct uri *uri, unsigned char *uristring)
{
        unsigned char *parse_string = uristring;
        unsigned char *src, *dest, *path;
        int need_slash = 0, keep_dslash = 1;
        int parse = (uri == NULL);
        struct uri uri_struct;

        if (!uri) uri = &uri_struct;

        /* We need to get the real (proxied) URI but lowercase relevant URI
         * parts along the way. */
        do {
                if (parse && parse_uri(uri, parse_string) != URI_ERRNO_OK)
                        return uristring;

                assert(uri->data);

                /* This is maybe not the right place but both join_urls() and
                 * get_translated_uri() through translate_url() calls this
                 * function and then it already works on and modifies an
                 * allocated copy. */
                convert_to_lowercase_locale_indep(uri->string, uri->protocollen);
                if (uri->hostlen) convert_to_lowercase_locale_indep(uri->host, uri->hostlen);

                /* Every further round works on the data of the previous one
                 * and so always has to parse. */
                parse = 1;
                parse_string = uri->data;
        } while (uri->protocol == PROTOCOL_PROXY);

        if (get_protocol_free_syntax(uri->protocol))
                return uristring;

        if (uri->protocol != PROTOCOL_UNKNOWN) {
                need_slash = get_protocol_need_slash_after_host(uri->protocol);
                keep_dslash = get_protocol_keep_double_slashes(uri->protocol);
        }

        /* If a slash separates host and data, step back onto it so that it
         * takes part in the handling of the segments below.
         * NOTE(review): this presumes that @need_slash is either 0 or 1. */
        path = uri->data - need_slash;
        /* @src reads and @dest writes within the same buffer; @dest never
         * gets ahead of @src. */
        dest = src = path;

        /* This loop mangles the URI string by removing ".." and "." segments.
         * However it must not alter "//" without reason; see bug 744.  */
        while (*dest) {
                /* If the following pieces are the LAST parts of URL, we remove
                 * them as well. See RFC 2396 section 5.2 for details. */

                if (end_of_dir(src[0])) {
                        /* URL data contains no more path. */
                        /* Move the rest (and the terminating NUL) down as it
                         * is; the regions may overlap, hence memmove(). */
                        memmove(dest, src, strlen(src) + 1);
                        break;
                }

                if (!is_uri_dir_sep(uri, src[0])) {
                        /* This is to reduce indentation */

                } else if (src[1] == '.') {
                        if (!src[2]) {
                                /* /. - skip the dot */
                                *dest++ = *src;
                                *dest = 0;
                                break;

                        } else if (is_uri_dir_sep(uri, src[2])) {
                                /* /./ - strip that.. */
                                src += 2;
                                continue;

                        } else if (src[2] == '.'
                                   && (is_uri_dir_sep(uri, src[3]) || !src[3])) {
                                /* /../ or /.. - skip it and preceding element.
                                 *
                                 * <path> "/foo/bar" <dest> ...
                                 * <src> ("/../" or "/..\0") ...
                                 *
                                 * Remove "bar" and the directory
                                 * separator that precedes it.  The
                                 * separator will be added back in the
                                 * next iteration unless another ".."
                                 * follows, in which case it will be
                                 * added later.  "bar" may be empty.  */

                                /* This stops at @path at the latest, so ".."
                                 * can not climb above the start of the
                                 * path. */
                                while (dest > path) {
                                        dest--;
                                        if (is_uri_dir_sep(uri, *dest)) break;
                                }

                                /* <path> "/foo" <dest> "/bar" ...
                                 * <src> ("/../" or "/..\0") ... */
                                if (!src[3]) {
                                        /* /.. - add ending slash and stop */
                                        *dest++ = *src;
                                        *dest = 0;
                                        break;
                                }

                                src += 3;
                                continue;
                        }

                } else if (is_uri_dir_sep(uri, src[1]) && !keep_dslash) {
                        /* // - ignore first '/'. */
                        src += 1;
                        continue;
                }

                /* We don't want to access memory past the NUL char. */
                /* So @dest only moves on after a non-NUL character was
                 * copied, which also makes the copied NUL end the loop. */
                *dest = *src++;
                if (*dest) dest++;
        }

        return uristring;
}
 804
 805 /* The 'file' scheme URI comes in and bastardized URI comes out which consists
 806  * of just the complete path to file/directory, which the dumb 'file' protocol
 807  * backend can understand. No host parts etc, that is what this function is
 808  * supposed to chew. */
 809 static struct uri *
 810 transform_file_url(struct uri *uri, const unsigned char *cwd)
 811 {
 812         unsigned char *path = uri->data;
 813
 814         assert(uri->protocol == PROTOCOL_FILE && uri->data);
 815
 816         /* Sort out the host part. We currently support only host "localhost"
 817          * (plus empty host part will be assumed to be "localhost" as well).
 818          * As our extensions, '.' will reference to the cwd on localhost
 819          * (originally, when the first thing after file:// wasn't "localhost/",
 820          * we assumed the cwd as well, and pretended that there's no host part
 821          * at all) and '..' to the directory parent to cwd. Another extension
 822          * is that if this is a DOS-like system, the first char in two-char
 823          * host part is uppercase letter and the second char is a colon, it is
 824          * assumed to be a local disk specification. */
 825         /* TODO: Use FTP for non-localhost hosts. --pasky */
 826
 827         /* For URL "file://", we open the current directory. Some other
 828          * browsers instead open root directory, but AFAIK the standard does
 829          * not specify that and this was the original behaviour and it is more
 830          * consistent with our file://./ notation. */
 831
 832         /* Who would name their file/dir '...' ? */
 833         if (*path == '.' || !*path) {
 834                 struct string dir;
 835
 836                 if (!init_string(&dir))
 837                         return NULL;
 838
 839                 encode_uri_string(&dir, cwd, -1, 0);
 840
 841                 /* Either we will end up with '//' and translate_directories()
 842                  * will shorten it or the '/' will mark the inserted cwd as a
 843                  * directory. */
 844                 if (*path == '.') *path = '/';
 845
 846                 /* Insert the current working directory. */
 847                 /* The offset is 7 == sizeof("file://") - 1. */
 848                 insert_in_string(&struri(uri), 7, dir.source, dir.length);
 849
 850                 done_string(&dir);
 851                 return uri;
 852         }
 853
 854 #ifdef DOS_FS
 855         if (isasciialpha(path[0]) && path[1] == ':' && dir_sep(path[2]))
 856                 return NULL;
 857 #endif
 858
 859         for (; *path && !dir_sep(*path); path++);
 860
 861         /* FIXME: We will in fact assume localhost even for non-local hosts,
 862          * until we will support the FTP transformation. --pasky */
 863
 864         memmove(uri->data, path, strlen(path) + 1);
 865         return uri;
 866 }
 867
 868 static unsigned char *translate_url(unsigned char *url, unsigned char *cwd);
 869
 870 unsigned char *
 871 join_urls(struct uri *base, unsigned char *rel)
 872 {
 873         unsigned char *uristring, *path;
 874         int add_slash = 0;
 875         int translate = 0;
 876         int length = 0;
 877
 878         /* See RFC 1808 */
 879         /* TODO: Support for ';' ? (see the RFC) --pasky */
 880
 881         /* For '#', '?' and '//' we could use get_uri_string() but it might be
 882          * too expensive since it uses granular allocation scheme. I wouldn't
 883          * personally mind tho' because it would be cleaner. --jonas */
 884         if (rel[0] == '#') {
 885                 /* Strip fragment and post part from the base URI and append
 886                  * the fragment string in @rel. */
 887                 length  = base->fragment
 888                         ? base->fragment - struri(base) - 1
 889                         : get_real_uri_length(base);
 890
 891         } else if (rel[0] == '?') {
 892                 /* Strip query, fragment and post part from the base URI and
 893                  * append the query string in @rel. */
 894                 length  = base->fragment ? base->fragment - struri(base) - 1
 895                                          : get_real_uri_length(base);
 896
 897                 uristring = memchr(base->data, '?', base->datalen);
 898                 if (uristring) length = uristring - struri(base);
 899
 900         } else if (rel[0] == '/' && rel[1] == '/') {
 901                 if (!get_protocol_need_slashes(base->protocol))
 902                         return NULL;
 903
 904                 /* Get `<protocol>:' from the base URI and append the `//' part
 905                  * from @rel. */
 906                 length = base->protocollen + 1;
 907
 908                 /* We need to sanitize the relative part and add stuff like
 909                  * host slash. */
 910                 translate = 1;
 911         }
 912
 913         /* If one of the tests above set @length to something useful */
 914         if (length) {
 915                 uristring = memacpy(struri(base), length);
 916                 if (!uristring) return NULL;
 917
 918                 add_to_strn(&uristring, rel);
 919
 920                 if (translate) {
 921                         unsigned char *translated;
 922
 923                         translated = translate_url(uristring, NULL);
 924                         mem_free(uristring);
 925                         return translated;
 926                 }
 927                 return normalize_uri_reparse(uristring);
 928         }
 929
 930         /* Check if there is some protocol name to go for */
 931         length = get_protocol_length(rel);
 932         if (length) {
 933                 switch (get_protocol(rel, length)) {
 934                 case PROTOCOL_UNKNOWN:
 935                 case PROTOCOL_PROXY:
 936                         /* Mysteriously proxy URIs are breaking here ... */
 937                         break;
 938
 939                 case PROTOCOL_FILE:
 940                         /* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
 941                          * to translate_url(). */
 942                 default:
 943                         uristring = translate_url(rel, NULL);
 944                         if (uristring) return uristring;
 945                 }
 946         }
 947
 948         assertm(base->data != NULL, "bad base url");
 949         if_assert_failed return NULL;
 950
 951         path = base->data;
 952
 953         /* Either is path blank, but we've slash char before, or path is not
 954          * blank, but doesn't start by a slash (if we'd just stay along with
 955          * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
 956          * should be enough, but I'm not sure and I don't want to break
 957          * anything --pasky). */
 958         /* We skip first char of URL ('/') in parse_url() (ARGH). This
 959          * is reason of all this bug-bearing magic.. */
 960         if (*path) {
 961                 if (!is_uri_dir_sep(base, *path)) path--;
 962         } else {
 963                 if (is_uri_dir_sep(base, path[-1])) path--;
 964         }
 965
 966         if (!is_uri_dir_sep(base, rel[0])) {
 967                 unsigned char *path_end;
 968
 969                 /* The URL is relative. */
 970
 971                 if (!*path) {
 972                         /* There's no path in the URL, but we're going to add
 973                          * something there, and the something doesn't start by
 974                          * a slash. So we need to insert a slash after the base
 975                          * URL. Clever, eh? ;) */
 976                         add_slash = 1;
 977                 }
 978
 979                 for (path_end = path; *path_end; path_end++) {
 980                         if (end_of_dir(*path_end)) break;
 981                         /* Modify the path pointer, so that it'll always point
 982                          * above the last '/' in the URL; later, we'll copy the
 983                          * URL only _TO_ this point, and anything after last
 984                          * slash will be substituted by 'rel'. */
 985                         if (is_uri_dir_sep(base, *path_end))
 986                                 path = path_end + 1;
 987                 }
 988         }
 989
 990         length = path - struri(base);
 991         uristring = mem_alloc(length + strlen(rel) + add_slash + 1);
 992         if (!uristring) return NULL;
 993
 994         memcpy(uristring, struri(base), length);
 995         if (add_slash) uristring[length] = '/';
 996         strcpy(uristring + length + add_slash, rel);
 997
 998         return normalize_uri_reparse(uristring);
 999 }
1000
1001
1002 /* Tries to figure out what protocol @newurl might be specifying by checking if
1003  * it exists as a file locally or by checking parts of the host name. */
1004 static enum protocol
1005 find_uri_protocol(unsigned char *newurl)
1006 {
1007         unsigned char *ch;
1008
1009         /* First see if it is a file so filenames that look like hostnames
1010          * won't confuse us below. */
1011         if (check_whether_file_exists(newurl) >= 0) return PROTOCOL_FILE;
1012
1013         /* Yes, it would be simpler to make test for IPv6 address first,
1014          * but it would result in confusing mix of ifdefs ;-). */
1015         /* FIXME: Ideas for improve protocol detection
1016          *
1017          * - Handle common hostnames. It could be part of the protocol backend
1018          *   structure. [ www -> http, irc -> irc, news -> nntp, ... ]
1019          *
1020          * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
1021          */
1022
1023         ch = newurl + strcspn(newurl, ".:/@");
1024         if (*ch == '@'
1025             || (*ch == ':' && *newurl != '[' && strchr(newurl, '@'))
1026             || !c_strncasecmp(newurl, "ftp.", 4)) {
1027                 /* Contains user/password/ftp-hostname */
1028                 return PROTOCOL_FTP;
1029
1030 #ifdef CONFIG_IPV6
1031         } else if (*newurl == '[' && *ch == ':') {
1032                 /* Candidate for IPv6 address */
1033                 unsigned char *bracket2, *colon2;
1034
1035                 ch++;
1036                 bracket2 = strchr(ch, ']');
1037                 colon2 = strchr(ch, ':');
1038                 if (bracket2 && colon2 && bracket2 > colon2)
1039                         return PROTOCOL_HTTP;
1040 #endif
1041
1042         } else if (*newurl != '.' && *ch == '.') {
1043                 /* Contains domain name? */
1044                 unsigned char *host_end, *domain;
1045                 unsigned char *ipscan;
1046
1047                 /* Process the hostname */
1048                 for (domain = ch + 1;
1049                         *(host_end = domain + strcspn(domain, ".:/?")) == '.';
1050                         domain = host_end + 1);
1051
1052                 /* It's IP? */
1053                 for (ipscan = ch; isdigit(*ipscan) || *ipscan == '.';
1054                         ipscan++);
1055
1056                 if (!*ipscan || *ipscan == ':' || *ipscan == '/')
1057                         return PROTOCOL_HTTP;
1058
1059                 /* It's two-letter or known TLD? */
1060                 if (host_end - domain == 2
1061                     || end_with_known_tld(domain, host_end - domain) >= 0)
1062                         return PROTOCOL_HTTP;
1063         }
1064
1065         return PROTOCOL_UNKNOWN;
1066 }
1067
1068
1069 #define MAX_TRANSLATION_ATTEMPTS        32
1070
1071 /* Returns an URI string that can be used internally. Adding protocol prefix,
1072  * missing slashes etc. */
1073 static unsigned char *
1074 translate_url(unsigned char *url, unsigned char *cwd)
1075 {
1076         unsigned char *newurl;
1077         struct uri uri;
1078         enum uri_errno uri_errno, prev_errno = URI_ERRNO_EMPTY;
1079         int retries = 0;
1080
1081         /* Strip starting spaces */
1082         while (*url == ' ') url++;
1083         if (!*url) return NULL;
1084
1085         newurl = expand_tilde(url); /* XXX: Post data copy. */
1086         if (!newurl) return NULL;
1087
1088 parse_uri:
1089         /* Yay a goto loop. If we get some URI parse error and try to
1090          * fix it we go back to here and try again. */
1091         /* Ordinary parse */
1092         uri_errno = parse_uri(&uri, newurl);
1093
1094         /* Bail out if the same error occurs twice */
1095         if (uri_errno == prev_errno || retries++ > MAX_TRANSLATION_ATTEMPTS) {
1096                 if (retries > MAX_TRANSLATION_ATTEMPTS) {
1097                         ERROR("Maximum number of parsing attempts exceeded "
1098                               "for %s.", url);
1099                 }
1100                 mem_free(newurl);
1101                 return NULL;
1102         }
1103
1104         prev_errno = uri_errno;
1105
1106         switch (uri_errno) {
1107         case URI_ERRNO_OK:
1108                 /* Fix translation of 1.2.3.4:5 so IP address part won't be
1109                  * interpreted as the protocol name. */
1110                 if (uri.protocol == PROTOCOL_UNKNOWN) {
1111                         enum protocol protocol = find_uri_protocol(newurl);
1112
1113                         /* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
1114                          * case. */
1115                         if (protocol != PROTOCOL_UNKNOWN) {
1116                                 struct string str;
1117
1118                                 if (!init_string(&str)) return NULL;
1119
1120                                 switch (protocol) {
1121                                 case PROTOCOL_FTP:
1122                                         add_to_string(&str, "ftp://");
1123                                         encode_uri_string(&str, newurl, -1, 0);
1124                                         break;
1125
1126                                 case PROTOCOL_HTTP:
1127                                         add_to_string(&str, "http://");
1128                                         add_to_string(&str, newurl);
1129                                         break;
1130
1131                                 case PROTOCOL_UNKNOWN:
1132                                         break;
1133
1134                                 case PROTOCOL_FILE:
1135                                 default:
1136                                         add_to_string(&str, "file://");
1137                                         if (!dir_sep(*newurl))
1138                                                 add_to_string(&str, "./");
1139
1140                                         add_to_string(&str, newurl);
1141                                 }
1142
1143                                 mem_free(newurl);
1144                                 newurl = str.source;
1145
1146                                 /* Work around the infinite loop prevention */
1147                                 prev_errno = URI_ERRNO_EMPTY;
1148                                 goto parse_uri;
1149                         }
1150                 }
1151
1152                 /* If file:// URI is transformed we need to reparse. */
1153                 if (uri.protocol == PROTOCOL_FILE && cwd && *cwd
1154                     && transform_file_url(&uri, cwd))
1155                         return normalize_uri_reparse(struri(&uri));
1156
1157                 /* Translate the proxied URI too if proxy:// */
1158                 if (uri.protocol == PROTOCOL_PROXY) {
1159                         unsigned char *data = translate_url(uri.data, cwd);
1160                         int pos = uri.data - struri(&uri);
1161
1162                         if (!data) break;
1163                         struri(&uri)[pos] = 0;
1164                         insert_in_string(&struri(&uri), pos, data, strlen(data));
1165                         mem_free(data);
1166                         return normalize_uri_reparse(struri(&uri));
1167                 }
1168
1169                 return normalize_uri_noparse(&uri);
1170
1171         case URI_ERRNO_TOO_MANY_SLASHES:
1172         {
1173                 unsigned char *from, *to;
1174
1175                 assert(uri.string[uri.protocollen] == ':'
1176                        && uri.string[uri.protocollen + 1] == '/'
1177                        && uri.string[uri.protocollen + 2] == '/');
1178
1179                 from = to = uri.string + uri.protocollen + 3;
1180                 while (*from == '/') from++;
1181
1182                 assert(to < from);
1183                 memmove(to, from, strlen(from) + 1);
1184                 goto parse_uri;
1185         }
1186         case URI_ERRNO_NO_SLASHES:
1187         {
1188                 /* Try prefix:some.url -> prefix://some.url.. */
1189                 int slashes = 2;
1190
1191                 /* Check if only one '/' is needed. */
1192                 if (uri.string[uri.protocollen + 1] == '/')
1193                         slashes--;
1194
1195                 insert_in_string(&newurl, uri.protocollen + 1, "//", slashes);
1196                 goto parse_uri;
1197         }
1198         case URI_ERRNO_TRAILING_DOTS:
1199         {
1200                 /* Trim trailing '.'s */
1201                 unsigned char *from = uri.host + uri.hostlen;
1202                 unsigned char *to = from;
1203
1204                 assert(uri.host < to && to[-1] == '.' && *from != '.');
1205
1206                 while (uri.host < to && to[-1] == '.') to--;
1207
1208                 assert(to < from);
1209                 memmove(to, from, strlen(from) + 1);
1210                 goto parse_uri;
1211         }
1212         case URI_ERRNO_NO_PORT_COLON:
1213                 assert(uri.portlen == 0
1214                        && uri.string < uri.port
1215                        && uri.port[-1] == ':');
1216
1217                 memmove(uri.port - 1, uri.port, strlen(uri.port) + 1);
1218                 goto parse_uri;
1219
1220         case URI_ERRNO_NO_HOST_SLASH:
1221         {
1222                 int offset = uri.port
1223                            ? uri.port + uri.portlen - struri(&uri)
1224                            : uri.host + uri.hostlen - struri(&uri) + uri.ipv6 /* ']' */;
1225
1226                 assertm(uri.host != NULL, "uri.host not set after no host slash error");
1227                 insert_in_string(&newurl, offset, "/", 1);
1228                 goto parse_uri;
1229         }
1230         case URI_ERRNO_INVALID_PROTOCOL:
1231         {
1232                 /* No protocol name */
1233                 enum protocol protocol = find_uri_protocol(newurl);
1234                 struct string str;
1235
1236                 if (!init_string(&str)) return NULL;
1237
1238                 switch (protocol) {
1239                         case PROTOCOL_FTP:
1240                                 add_to_string(&str, "ftp://");
1241                                 encode_uri_string(&str, newurl, -1, 0);
1242                                 break;
1243
1244                         case PROTOCOL_HTTP:
1245                                 add_to_string(&str, "http://");
1246                                 add_to_string(&str, newurl);
1247                                 break;
1248
1249                         case PROTOCOL_UNKNOWN:
1250                                 /* We default to file:// even though we already
1251                                  * tested if the file existed since it will give
1252                                  * a "No such file or directory" error.  which
1253                                  * might better hint the user that there was
1254                                  * problem figuring out the URI. */
1255                         case PROTOCOL_FILE:
1256                         default:
1257                                 add_to_string(&str, "file://");
1258                                 if (!dir_sep(*newurl))
1259                                         add_to_string(&str, "./");
1260
1261                                 encode_file_uri_string(&str, newurl);
1262                 }
1263
1264                 mem_free(newurl);
1265                 newurl = str.source;
1266
1267                 goto parse_uri;
1268         }
1269         case URI_ERRNO_EMPTY:
1270         case URI_ERRNO_IPV6_SECURITY:
1271         case URI_ERRNO_NO_HOST:
1272         case URI_ERRNO_INVALID_PORT:
1273         case URI_ERRNO_INVALID_PORT_RANGE:
1274                 /* None of these can be handled properly. */
1275                 break;
1276         }
1277
1278         mem_free(newurl);
1279         return NULL;
1280 }
1281
1282
1283 struct uri *
1284 get_composed_uri(struct uri *uri, enum uri_component components)
1285 {
1286         unsigned char *string;
1287
1288         assert(uri);
1289         if_assert_failed return NULL;
1290
1291         string = get_uri_string(uri, components);
1292         if (!string) return NULL;
1293
1294         uri = get_uri(string, 0);
1295         mem_free(string);
1296
1297         return uri;
1298 }
1299
1300 struct uri *
1301 get_translated_uri(unsigned char *uristring, unsigned char *cwd)
1302 {
1303         struct uri *uri;
1304
1305         uristring = translate_url(uristring, cwd);
1306         if (!uristring) return NULL;
1307
1308         uri = get_uri(uristring, 0);
1309         mem_free(uristring);
1310
1311         return uri;
1312 }
1313
1314
1315 unsigned char *
1316 get_extension_from_uri(struct uri *uri)
1317 {
1318         unsigned char *extension = NULL;
1319         int afterslash = 1;
1320         unsigned char *pos = uri->data;
1321
1322         assert(pos);
1323
1324         for (; *pos && !end_of_dir(*pos); pos++) {
1325                 if (!afterslash && !extension && *pos == '.') {
1326                         extension = pos;
1327                 } else if (is_uri_dir_sep(uri, *pos)) {
1328                         extension = NULL;
1329                         afterslash = 1;
1330                 } else {
1331                         afterslash = 0;
1332                 }
1333         }
1334
1335         if (extension && extension < pos)
1336                 return memacpy(extension, pos - extension);
1337
1338         return NULL;
1339 }
1340
/* URI encoding, escaping unallowed characters. */

/* Tell whether @c may go into an URI unescaped: whatever isident() accepts
 * plus the characters . ! ~ * ' ( ) -- see the unreserved characters of
 * RFC 2396, section 2.3.  Returns 1 for such characters and 0 otherwise. */
static inline int
safe_char(unsigned char c)
{
        switch (c) {
        case '.':
        case '!':
        case '~':
        case '*':
        case '\'':
        case '(':
        case ')':
                return 1;

        default:
                return isident(c) != 0;
        }
}
1349
/* Append the URI-encoded form of @name to @string.
 *
 * @namelen is the number of bytes of @name to encode; a negative value means
 * that @name is NUL-terminated and is to be encoded completely.  Characters
 * accepted by safe_char() are copied as they are, and so is '/' unless
 * @convert_slashes is set.  Every other byte is written as '%' followed by
 * two hex digits produced by hx().
 *
 * A space is not turned into '+'; doing that is probably correct only for
 * the query part of an URI. */
void
encode_uri_string(struct string *string, const unsigned char *name, int namelen,
                  int convert_slashes)
{
        unsigned char escape[4] = { '%', 0, 0, '\0' };
        int i;

        if (namelen < 0) namelen = strlen(name);

        for (i = 0; i < namelen; i++) {
                unsigned char c = name[i];

                if (safe_char(c) || (c == '/' && !convert_slashes)) {
                        add_char_to_string(string, c);
                        continue;
                }

                /* Hex it: high nibble first. */
                escape[1] = hx((c & 0xF0) >> 4);
                escape[2] = hx(c & 0xF);
                add_bytes_to_string(string, escape, sizeof(escape) - 1);
        }
}
1378
/* Append the URI-encoded form of @name to @string, leaving ':' and '\\'
 * unescaped in addition to the characters accepted by safe_char().
 *
 * @namelen is the number of bytes of @name to encode; a negative value means
 * that @name is NUL-terminated and is to be encoded completely.  Every byte
 * that is not copied as it is gets written as '%' followed by two hex digits
 * produced by hx(). */
void
encode_win32_uri_string(struct string *string, unsigned char *name, int namelen)
{
        unsigned char escape[4] = { '%', 0, 0, '\0' };
        int i;

        if (namelen < 0) namelen = strlen(name);

        for (i = 0; i < namelen; i++) {
                unsigned char c = name[i];

                if (safe_char(c) || c == ':' || c == '\\') {
                        add_char_to_string(string, c);
                        continue;
                }

                /* Hex it: high nibble first. */
                escape[1] = hx((c & 0xF0) >> 4);
                escape[2] = hx(c & 0xF);
                add_bytes_to_string(string, escape, sizeof(escape) - 1);
        }
}
1401
/* Decode the %XX escapes of the NUL-terminated string @src in place.
 *
 * This function is evil, it modifies its parameter.  But the decoded string
 * is never longer than the encoded one, so decoding in place is an efficient
 * way to do it. --Zas
 *
 * A '%' is only taken as an escape when unhx() accepts both characters
 * behind it and the resulting value is not zero; otherwise the '%' is copied
 * literally and decoding goes on with the next character.
 *
 * '+' is not turned into a space: as noted in encode_uri_string() that would
 * only be right for the query part of an URI. */
void
decode_uri(unsigned char *src)
{
        unsigned char *out = src;

        for (;;) {
                unsigned char c = *src++;

                if (c == '%') {
                        int hi = unhx(src[0]);
                        /* Only look at the second digit when the first one
                         * is valid, so nothing past the NUL is read. */
                        int lo = hi >= 0 ? unhx(src[1]) : -1;

                        if (lo >= 0) {
                                int value = (hi << 4) + lo;

                                if (value != 0) { /* don't allow %00 */
                                        c = (unsigned char) value;
                                        src += 2;
                                }
                        }
                }

                *out++ = c;
                if (c == '\0') break;
        }
}
1442
1443 void
1444 decode_uri_string(struct string *string)
1445 {
1446         decode_uri(string->source);
1447         string->length = strlen(string->source);
1448 }
1449
/* Decode the %XX escapes of @src in place like decode_uri() does, then
 * replace every character that is not printable, or that is a control
 * character, with '*'. */
void
decode_uri_for_display(unsigned char *src)
{
        unsigned char *cur;

        decode_uri(src);

        for (cur = src; *cur != '\0'; cur++) {
                if (isprint(*cur) && !iscntrl(*cur))
                        continue;

                *cur = '*';
        }
}
1459
1460 void
1461 decode_uri_string_for_display(struct string *string)
1462 {
1463         decode_uri_for_display(string->source);
1464         string->length = strlen(string->source);
1465 }
1466
1467
1468 /* URI list */
1469
1470 #define URI_LIST_GRANULARITY 0x3
1471
1472 #define realloc_uri_list(list) \
1473         mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \
1474                         URI_LIST_GRANULARITY)
1475
1476 struct uri *
1477 add_to_uri_list(struct uri_list *list, struct uri *uri)
1478 {
1479         if (!realloc_uri_list(list))
1480                 return NULL;
1481
1482         list->uris[list->size++] = get_uri_reference(uri);
1483
1484         return uri;
1485 };
1486
1487 void
1488 free_uri_list(struct uri_list *list)
1489 {
1490         struct uri *uri;
1491         int index;
1492
1493         if (!list->uris) return;
1494
1495         foreach_uri (uri, index, list) {
1496                 done_uri(uri);
1497         }
1498
1499         mem_free_set(&list->uris, NULL);
1500         list->size = 0;
1501 }
1502
1503 /* URI cache */
1504
/* One entry of the URI cache: a parsed URI together with the storage of the
 * string it was parsed from. */
struct uri_cache_entry {
        /* The parsed URI; get_uri() hands out a pointer to this member. */
        struct uri uri;
        /* The URI string.  get_uri_cache_entry() allocates the entry with
         * room for the whole string behind the one declared byte and copies
         * the string here. */
        unsigned char string[1];
};

/* The cache itself. */
struct uri_cache {
        /* Hash of the entries, looked up by URI string. */
        struct hash *map;
        /* Locked once for every entry added by get_uri_cache_entry() and
         * unlocked when done_uri() removes an entry; when it is no longer
         * in use done_uri() frees the hash. */
        struct object object;
};

/* The one cache instance used by get_uri() and done_uri(). */
static struct uri_cache uri_cache;
1516
#ifdef CONFIG_DEBUG
/* Debugging aid: report an internal error when the protocol or the host
 * part of @uri contains an uppercase letter according to c_isupper().
 * Without CONFIG_DEBUG the check expands to nothing. */
static inline void
check_uri_sanity(struct uri *uri)
{
        int found_upper = 0;
        int i;

        for (i = 0; !found_upper && i < uri->protocollen; i++) {
                if (c_isupper(uri->string[i])) found_upper = 1;
        }

        for (i = 0; !found_upper && i < uri->hostlen; i++) {
                if (c_isupper(uri->host[i])) found_upper = 1;
        }

        if (found_upper) {
                INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri));
        }
}
#else
#define check_uri_sanity(uri)
#endif
1536
1537 static inline struct uri_cache_entry *
1538 get_uri_cache_entry(unsigned char *string, int length)
1539 {
1540         struct uri_cache_entry *entry;
1541         struct hash_item *item;
1542
1543         assert(string && length > 0);
1544         if_assert_failed return NULL;
1545
1546         item = get_hash_item(uri_cache.map, string, length);
1547         if (item) return item->value;
1548
1549         /* Setup a new entry */
1550
1551         entry = mem_calloc(1, sizeof(*entry) + length);
1552         if (!entry) return NULL;
1553
1554         object_nolock(&entry->uri, "uri");
1555         memcpy(&entry->string, string, length);
1556         string = entry->string;
1557
1558         if (parse_uri(&entry->uri, string) != URI_ERRNO_OK
1559             || !add_hash_item(uri_cache.map, string, length, entry)) {
1560                 mem_free(entry);
1561                 return NULL;
1562         }
1563
1564         object_lock(&uri_cache);
1565
1566         return entry;
1567 }
1568
1569 struct uri *
1570 get_uri(unsigned char *string, enum uri_component components)
1571 {
1572         struct uri_cache_entry *entry;
1573
1574         assert(string);
1575
1576         if (components) {
1577                 struct uri uri;
1578
1579                 if (parse_uri(&uri, string) != URI_ERRNO_OK)
1580                         return NULL;
1581
1582                 return get_composed_uri(&uri, components);
1583         }
1584
1585         if (!is_object_used(&uri_cache)) {
1586                 uri_cache.map = init_hash8();
1587                 if (!uri_cache.map) return NULL;
1588                 object_nolock(&uri_cache, "uri_cache");
1589         }
1590
1591         entry = get_uri_cache_entry(string, strlen(string));
1592         if (!entry) {
1593                 if (!is_object_used(&uri_cache))
1594                         free_hash(&uri_cache.map);
1595                 return NULL;
1596         }
1597
1598         check_uri_sanity(&entry->uri);
1599         object_nolock(&entry->uri, "uri");
1600         object_lock(&entry->uri);
1601
1602         return &entry->uri;
1603 }
1604
1605 void
1606 done_uri(struct uri *uri)
1607 {
1608         unsigned char *string = struri(uri);
1609         int length = strlen(string);
1610         struct hash_item *item;
1611         struct uri_cache_entry *entry;
1612
1613         assert(is_object_used(&uri_cache));
1614
1615         object_unlock(uri);
1616         if (is_object_used(uri)) return;
1617
1618         item = get_hash_item(uri_cache.map, string, length);
1619         entry = item ? item->value : NULL;
1620
1621         assertm(entry != NULL, "Releasing unknown URI [%s]", string);
1622         del_hash_item(uri_cache.map, item);
1623         mem_free(entry);
1624
1625         /* Last URI frees the cache */
1626         object_unlock(&uri_cache);
1627         if (!is_object_used(&uri_cache))
1628                 free_hash(&uri_cache.map);
1629 }