1 /* URL parser and translator; implementation of RFC 2396. */
15 #include <sys/types.h>
17 #include <netdb.h> /* OS/2 needs this after sys/types.h */
20 #ifdef HAVE_SYS_SOCKET_H
21 #include <sys/socket.h>
23 #ifdef HAVE_NETINET_IN_H
24 #include <netinet/in.h>
26 #ifdef HAVE_ARPA_INET_H
27 #include <arpa/inet.h>
32 #include "main/object.h"
33 #include "protocol/protocol.h"
34 #include "protocol/uri.h"
35 #include "util/conv.h"
36 #include "util/error.h"
37 #include "util/file.h"
38 #include "util/hash.h"
39 #include "util/memory.h"
40 #include "util/string.h"
44 end_of_dir(unsigned char c
)
46 /* This used to check for c == ';' as well. But section 3.3
47 * of RFC 2396 explicitly says that parameters in a path
48 * segment "are not significant to the parsing of relative
50 return c
== POST_CHAR
|| c
== '#' || c
== '?';
54 is_uri_dir_sep(const struct uri
*uri
, unsigned char pos
)
56 return (uri
->protocol
== PROTOCOL_FILE
? dir_sep(pos
) : pos
== '/');
/* Check whether the host name @server belongs to @domain.
 *
 * @domain      NUL-terminated domain name.
 * @server      host name; only the first @server_len bytes are examined.
 * @server_len  number of bytes of @server to consider.
 *
 * Returns non-zero when @server is the same length as @domain and
 * c_strncasecmp() reports them equal, or when @server is longer and ends
 * in '.' followed by @domain. Returns zero otherwise. */
int
is_in_domain(unsigned char *domain, unsigned char *server, int server_len)
{
	int domain_len = strlen(domain);
	int len;

	/* A server name shorter than the domain can never be inside it. */
	if (domain_len > server_len)
		return 0;

	if (domain_len == server_len)
		return !c_strncasecmp(domain, server, server_len);

	/* Here server_len > domain_len, so len >= 1 and server[len - 1] is
	 * a valid index. The suffix only counts when it starts right after
	 * a '.', so that "example.org" does not match "badexample.org". */
	len = server_len - domain_len;
	if (server[len - 1] != '.')
		return 0;

	return !c_strncasecmp(domain, server + len, domain_len);
}
80 is_ip_address(const unsigned char *address
, int addresslen
)
82 /* The @address has well defined limits so it would be a shame to
84 unsigned char buffer
[IP_ADDRESS_BUFFER_SIZE
];
86 if (addresslen
>= sizeof(buffer
))
89 safe_strncpy(buffer
, address
, addresslen
+ 1);
94 struct sockaddr_in6 addr6
;
96 if (inet_pton(AF_INET6
, buffer
, &addr6
.sin6_addr
) > 0)
99 #endif /* CONFIG_IPV6 */
101 struct in_addr addr4
;
103 if (inet_pton(AF_INET
, buffer
, &addr4
) > 0)
109 /* FIXME: Is this ever the case? */
111 #endif /* HAVE_INET_PTON */
116 end_with_known_tld(const unsigned char *s
, int slen
)
119 static const unsigned char *const tld
[] =
120 { "com", "edu", "net",
122 "int", "biz", "arpa",
125 "name", "pro", NULL
};
127 if (!slen
) return -1;
128 if (slen
< 0) slen
= strlen(s
);
130 for (i
= 0; tld
[i
]; i
++) {
131 int tldlen
= strlen(tld
[i
]);
132 int pos
= slen
- tldlen
;
134 if (pos
>= 0 && !c_strncasecmp(&s
[pos
], tld
[i
], tldlen
))
141 /* XXX: this function writes to @name. */
143 check_whether_file_exists(unsigned char *name
)
145 /* Check POST_CHAR etc ... */
146 static const unsigned char chars
[] = POST_CHAR_S
"#?";
148 int namelen
= strlen(name
);
150 if (file_exists(name
))
153 for (i
= 0; i
< sizeof(chars
) - 1; i
++) {
154 unsigned char *pos
= memchr(name
, chars
[i
], namelen
);
160 exists
= file_exists(name
);
171 /* Encodes URIs without encoding stuff like fragments and query separators. */
/* Append @uristring to @string, URI-encoding only the leading part that
 * check_whether_file_exists() reports as the file name.
 *
 * The length returned by check_whether_file_exists() is handed to
 * encode_uri_string() as the number of bytes to encode (slashes are not
 * converted). When that length is positive, whatever follows it in
 * @uristring is appended verbatim, so the separators there stay unencoded.
 *
 * Note that check_whether_file_exists() is documented as writing to its
 * argument, hence @uristring is not const. */
void
encode_file_uri_string(struct string *string, unsigned char *uristring)
{
	int filenamelen = check_whether_file_exists(uristring);

	encode_uri_string(string, uristring, filenamelen, 0);
	if (filenamelen > 0)
		add_to_string(string, uristring + filenamelen);
}
/* Measure the protocol (scheme) name at the start of @url.
 *
 * @url  NUL-terminated URL string.
 *
 * Returns the number of bytes of the scheme name, not counting a single
 * trailing digit (the "IP version" suffix, as in "http4:"), or 0 when @url
 * does not start with a scheme name followed by ':' or by that digit. */
int
get_protocol_length(const unsigned char *url)
{
	const unsigned char *end = url;

	/* Seek the end of the protocol name if any. */
	/* RFC1738:
	 * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ]
	 * (but per its recommendations we accept "upalpha" too) */
	while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.')
		end++;

	/* Now we make something to support our "IP version in protocol scheme
	 * name" hack and silently chop off the last digit if it's there. The
	 * IETF's not gonna notice I hope or it'd be going after us hard. */
	if (end != url && isdigit(end[-1]))
		end--;

	/* Also return 0 if there's no protocol name (@end == @url). */
	return (*end == ':' || isdigit(*end)) ? (int) (end - url) : 0;
}
205 parse_uri(struct uri
*uri
, unsigned char *uristring
)
207 unsigned char *prefix_end
, *host_end
;
209 unsigned char *lbracket
, *rbracket
;
212 assertm(uristring
!= NULL
, "No uri to parse.");
213 memset(uri
, 0, sizeof(*uri
));
215 /* Nothing to do for an empty url. */
216 if_assert_failed
return 0;
217 if (!*uristring
) return URI_ERRNO_EMPTY
;
219 uri
->string
= uristring
;
220 uri
->protocollen
= get_protocol_length(uristring
);
223 if (!uri
->protocollen
) return URI_ERRNO_INVALID_PROTOCOL
;
225 /* Figure out whether the protocol is known */
226 uri
->protocol
= get_protocol(struri(uri
), uri
->protocollen
);
228 prefix_end
= uristring
+ uri
->protocollen
; /* ':' */
230 /* Check if there's a digit after the protocol name. */
231 if (isdigit(*prefix_end
)) {
232 uri
->ip_family
= uristring
[uri
->protocollen
] - '0';
235 if (*prefix_end
!= ':')
236 return URI_ERRNO_INVALID_PROTOCOL
;
241 if (prefix_end
[0] == '/' && prefix_end
[1] == '/') {
242 if (prefix_end
[2] == '/'
243 && get_protocol_need_slash_after_host(uri
->protocol
))
244 return URI_ERRNO_TOO_MANY_SLASHES
;
248 } else if (get_protocol_need_slashes(uri
->protocol
)) {
249 return URI_ERRNO_NO_SLASHES
;
252 if (get_protocol_free_syntax(uri
->protocol
)) {
253 uri
->data
= prefix_end
;
254 uri
->datalen
= strlen(prefix_end
);
257 } else if (uri
->protocol
== PROTOCOL_FILE
) {
258 int datalen
= strcspn(prefix_end
, "#" POST_CHAR_S
);
259 unsigned char *frag_or_post
= prefix_end
+ datalen
;
261 /* Extract the fragment part. */
263 if (*frag_or_post
== '#') {
264 uri
->fragment
= frag_or_post
+ 1;
265 uri
->fragmentlen
= strcspn(uri
->fragment
, POST_CHAR_S
);
266 frag_or_post
= uri
->fragment
+ uri
->fragmentlen
;
268 if (*frag_or_post
== POST_CHAR
) {
269 uri
->post
= frag_or_post
+ 1;
272 datalen
= strlen(prefix_end
);
275 /* A bit of a special case, but using the "normal" host
276 * parsing seems a bit scary at this point. (see bug 107). */
277 if (datalen
> 9 && !c_strncasecmp(prefix_end
, "localhost/", 10)) {
282 uri
->data
= prefix_end
;
283 uri
->datalen
= datalen
;
291 /* Get brackets enclosing IPv6 address */
292 lbracket
= strchr(prefix_end
, '[');
294 rbracket
= strchr(lbracket
, ']');
295 /* [address] is handled only inside of hostname part (surprisingly). */
296 if (rbracket
&& rbracket
< prefix_end
+ strcspn(prefix_end
, "/"))
299 lbracket
= rbracket
= NULL
;
305 /* Possibly skip auth part */
306 host_end
= prefix_end
+ strcspn(prefix_end
, "@");
308 if (prefix_end
+ strcspn(prefix_end
, "/") > host_end
309 && *host_end
) { /* we have auth info here */
310 unsigned char *user_end
;
312 /* Allow '@' in the password component */
313 while (strcspn(host_end
+ 1, "@") < strcspn(host_end
+ 1, "/?"))
314 host_end
= host_end
+ 1 + strcspn(host_end
+ 1, "@");
316 user_end
= strchr(prefix_end
, ':');
318 if (!user_end
|| user_end
> host_end
) {
319 uri
->user
= prefix_end
;
320 uri
->userlen
= host_end
- prefix_end
;
322 uri
->user
= prefix_end
;
323 uri
->userlen
= user_end
- prefix_end
;
324 uri
->password
= user_end
+ 1;
325 uri
->passwordlen
= host_end
- user_end
- 1;
327 prefix_end
= host_end
+ 1;
332 host_end
= rbracket
+ strcspn(rbracket
, ":/?");
335 host_end
= prefix_end
+ strcspn(prefix_end
, ":/?");
339 int addrlen
= rbracket
- lbracket
- 1;
341 /* Check for valid length.
342 * addrlen >= sizeof(hostbuf) is theorically impossible
343 * but i keep the test in case of... Safer, imho --Zas */
344 assertm(addrlen
>= 0 && addrlen
< NI_MAXHOST
,
345 "parse_uri(): addrlen value is bad (%d) for URL '%s'. "
346 "Problems are likely to be encountered. Please report "
347 "this, it is a security bug!", addrlen
, uristring
);
348 if_assert_failed
return URI_ERRNO_IPV6_SECURITY
;
350 uri
->host
= lbracket
+ 1;
351 uri
->hostlen
= addrlen
;
355 uri
->host
= prefix_end
;
356 uri
->hostlen
= host_end
- prefix_end
;
358 /* Trim trailing '.'s */
359 if (uri
->hostlen
&& uri
->host
[uri
->hostlen
- 1] == '.')
360 return URI_ERRNO_TRAILING_DOTS
;
363 if (*host_end
== ':') { /* we have port here */
364 unsigned char *port_end
= host_end
+ 1 + strcspn(host_end
+ 1, "/");
368 uri
->port
= host_end
;
369 uri
->portlen
= port_end
- host_end
;
371 if (uri
->portlen
== 0)
372 return URI_ERRNO_NO_PORT_COLON
;
374 /* We only use 8 bits for portlen so better check */
375 if (uri
->portlen
!= port_end
- host_end
)
376 return URI_ERRNO_INVALID_PORT
;
378 /* test if port is number */
379 /* TODO: possibly lookup for the service otherwise? --pasky */
380 for (; host_end
< port_end
; host_end
++)
381 if (!isdigit(*host_end
))
382 return URI_ERRNO_INVALID_PORT
;
384 /* Check valid port value, and let show an error message
385 * about invalid url syntax. */
386 if (uri
->port
&& uri
->portlen
) {
390 n
= strtol(uri
->port
, NULL
, 10);
391 if (errno
|| !uri_port_is_valid(n
))
392 return URI_ERRNO_INVALID_PORT
;
396 if (*host_end
== '/') {
399 } else if (get_protocol_need_slash_after_host(uri
->protocol
)) {
400 /* The need for slash after the host component depends on the
401 * need for a host component. -- The dangerous mind of Jonah */
403 return URI_ERRNO_NO_HOST
;
405 return URI_ERRNO_NO_HOST_SLASH
;
408 /* Look for #fragment or POST_CHAR */
409 prefix_end
= host_end
+ strcspn(host_end
, "#" POST_CHAR_S
);
410 uri
->data
= host_end
;
411 uri
->datalen
= prefix_end
- host_end
;
413 if (*prefix_end
== '#') {
414 uri
->fragment
= prefix_end
+ 1;
415 uri
->fragmentlen
= strcspn(uri
->fragment
, POST_CHAR_S
);
416 prefix_end
= uri
->fragment
+ uri
->fragmentlen
;
419 if (*prefix_end
== POST_CHAR
) {
420 uri
->post
= prefix_end
+ 1;
427 get_uri_port(const struct uri
*uri
)
429 if (uri
->port
&& uri
->portlen
) {
430 const unsigned char *end
= uri
->port
;
431 int port
= strtol(uri
->port
, (char **) &end
, 10);
433 if (end
!= uri
->port
) {
434 assert(uri_port_is_valid(port
));
439 return get_protocol_port(uri
->protocol
);
442 #define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))
/* Compare two URI components given as (pointer, length) pairs.
 *
 * @a, @alen  first component and its length in bytes; @a may be NULL.
 * @b, @blen  second component and its length in bytes; @b may be NULL.
 *
 * Returns 1 when the components match and 0 when they differ. Components
 * match when the lengths are equal, both pointers are set or both are
 * unset, and (for a non-empty, set component) the bytes are identical. */
int
compare_component(const unsigned char *a, int alen,
		  const unsigned char *b, int blen)
{
	/* Check that the length and the strings are both set or unset */
	if (alen != blen || !!a != !!b)
		return 0;

	/* Both are unset so that will make a perfect match */
	if (!a || !alen)
		return 1;

	/* Let the higher forces decide */
	return !memcmp(a, b, blen);
}
458 #define wants(x) (components & (x))
461 compare_uri(const struct uri
*a
, const struct uri
*b
,
462 enum uri_component components
)
464 if (a
== b
) return 1;
465 if (!components
) return 0;
467 assertm(can_compare_uri_components(components
),
468 "compare_uri() is a work in progress. Component unsupported");
470 return (!wants(URI_PROTOCOL
) || a
->protocol
== b
->protocol
)
471 && (!wants(URI_IP_FAMILY
) || a
->ip_family
== b
->ip_family
)
473 || compare_component(a
->user
, a
->userlen
, b
->user
, b
->userlen
))
474 && (!wants(URI_PASSWORD
)
475 || compare_component(a
->password
, a
->passwordlen
, b
->password
, b
->passwordlen
))
477 || compare_component(a
->host
, a
->hostlen
, b
->host
, b
->hostlen
))
479 || compare_component(a
->port
, a
->portlen
, b
->port
, b
->portlen
))
481 || compare_component(a
->data
, a
->datalen
, b
->data
, b
->datalen
))
482 && (!wants(URI_FRAGMENT
)
483 || compare_component(a
->fragment
, a
->fragmentlen
, b
->fragment
, b
->fragmentlen
))
485 || compare_component(a
->post
, a
->post
? strlen(a
->post
) : 0, b
->post
, b
->post
? strlen(b
->post
) : 0));
489 /* We might need something more intelligent than this Swiss army knife. */
491 add_uri_to_string(struct string
*string
, const struct uri
*uri
,
492 enum uri_component components
)
494 /* Custom or unknown keep the URI untouched. */
495 if (uri
->protocol
== PROTOCOL_UNKNOWN
)
496 return add_to_string(string
, struri(uri
));
498 if (wants(URI_PROTOCOL
)) {
499 add_bytes_to_string(string
, uri
->string
, uri
->protocollen
);
500 if (wants(URI_IP_FAMILY
) && uri
->ip_family
)
501 add_long_to_string(string
, uri
->ip_family
);
502 add_char_to_string(string
, ':');
503 if (get_protocol_need_slashes(uri
->protocol
))
504 add_to_string(string
, "//");
507 if (wants(URI_USER
) && uri
->userlen
) {
508 add_bytes_to_string(string
, uri
->user
, uri
->userlen
);
510 if (wants(URI_PASSWORD
) && uri
->passwordlen
) {
511 add_char_to_string(string
, ':');
512 add_bytes_to_string(string
, uri
->password
,
516 add_char_to_string(string
, '@');
518 } else if (wants(URI_PASSWORD
) && uri
->passwordlen
) {
519 add_bytes_to_string(string
, uri
->password
, uri
->passwordlen
);
522 if (wants(URI_HOST
) && uri
->hostlen
) {
526 /* Rationale for wants(URI_PORT): The [notation] was invented
527 * so that you can have an IPv6 addy and a port together. So
528 * we want to use it when that happens, otherwise we need not
529 * bother (that happens only when we want it for DNS anyway).
530 * I insist on an implied elegancy of this way, but YMMV. ;-)
532 if (uri
->ipv6
&& wants(URI_PORT
)) add_char_to_string(string
, '[');
535 /* Support for the GNU International Domain Name library.
537 * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html
539 * Now it is probably not perfect because idna_to_ascii_lz()
540 * will be using a ``zero terminated input string encoded in
541 * the current locale's character set''. Anyway I don't know
542 * how to convert anything to UTF-8 or Unicode. --jonas */
543 if (wants(URI_IDN
)) {
544 unsigned char *host
= memacpy(uri
->host
, uri
->hostlen
);
548 int code
= idna_to_ascii_lz(host
, &idname
, 0);
550 /* FIXME: Return NULL if it coughed? --jonas */
551 if (code
== IDNA_SUCCESS
) {
552 add_to_string(string
, idname
);
563 add_bytes_to_string(string
, uri
->host
, uri
->hostlen
);
566 if (uri
->ipv6
&& wants(URI_PORT
)) add_char_to_string(string
, ']');
570 if (wants(URI_PORT
) || wants(URI_DEFAULT_PORT
)) {
572 add_char_to_string(string
, ':');
573 add_bytes_to_string(string
, uri
->port
, uri
->portlen
);
575 } else if (wants(URI_DEFAULT_PORT
)
576 && uri
->protocol
!= PROTOCOL_USER
) {
577 /* For user protocols we don't know a default port.
578 * Should user protocols ports be configurable? */
579 int port
= get_protocol_port(uri
->protocol
);
581 add_char_to_string(string
, ':');
582 add_long_to_string(string
, port
);
586 /* Only add slash if we need to separate */
587 if ((wants(URI_DATA
) || wants(URI_POST
) || components
== URI_HTTP_REFERRER_HOST
)
588 && wants(~(URI_DATA
| URI_PORT
))
589 && get_protocol_need_slash_after_host(uri
->protocol
))
590 add_char_to_string(string
, '/');
592 if (wants(URI_DATA
) && uri
->datalen
)
593 add_bytes_to_string(string
, uri
->data
, uri
->datalen
);
595 /* We can not test uri->datalen here since we need to always
597 if (wants(URI_PATH
) || wants(URI_FILENAME
)) {
598 const unsigned char *filename
= uri
->data
;
599 const unsigned char *pos
;
601 assertm(!wants(URI_FILENAME
) || components
== URI_FILENAME
,
602 "URI_FILENAME should be used alone %d", components
);
604 if (wants(URI_PATH
) && !is_uri_dir_sep(uri
, *filename
)) {
605 #ifdef CONFIG_OS_WIN32
606 if (uri
->protocol
!= PROTOCOL_FILE
)
608 /* FIXME: Add correct separator */
609 add_char_to_string(string
, '/');
612 if (!uri
->datalen
) return string
;
614 for (pos
= filename
; *pos
&& !end_of_dir(*pos
); pos
++)
615 if (wants(URI_FILENAME
) && is_uri_dir_sep(uri
, *pos
))
618 return add_bytes_to_string(string
, filename
, pos
- filename
);
621 if (wants(URI_QUERY
) && uri
->datalen
) {
622 const unsigned char *query
= memchr(uri
->data
, '?', uri
->datalen
);
624 assertm(URI_QUERY
== components
,
625 "URI_QUERY should be used alone %d", components
);
627 if (!query
) return string
;
630 /* Check fragment and POST_CHAR */
631 return add_bytes_to_string(string
, query
, strcspn(query
, "#" POST_CHAR_S
));
634 if (wants(URI_FRAGMENT
) && uri
->fragmentlen
) {
635 add_char_to_string(string
, '#');
636 add_bytes_to_string(string
, uri
->fragment
, uri
->fragmentlen
);
639 if (wants(URI_POST
) && uri
->post
) {
640 add_char_to_string(string
, POST_CHAR
);
641 add_to_string(string
, uri
->post
);
643 } else if (wants(URI_POST_INFO
) && uri
->post
) {
644 if (!strncmp(uri
->post
, "text/plain", 10)) {
645 add_to_string(string
, " (PLAIN TEXT DATA)");
647 } else if (!strncmp(uri
->post
, "multipart/form-data;", 20)) {
648 add_to_string(string
, " (MULTIPART FORM DATA)");
651 add_to_string(string
, " (POST DATA)");
662 get_uri_string(const struct uri
*uri
, enum uri_component components
)
664 struct string string
;
666 if (init_string(&string
)
667 && add_uri_to_string(&string
, uri
, components
))
668 return string
.source
;
670 done_string(&string
);
676 add_string_uri_to_string(struct string
*string
, unsigned char *uristring
,
677 enum uri_component components
)
681 if (parse_uri(&uri
, uristring
) != URI_ERRNO_OK
)
684 return add_uri_to_string(string
, &uri
, components
);
688 #define normalize_uri_reparse(str) normalize_uri(NULL, str)
689 #define normalize_uri_noparse(uri) normalize_uri(uri, struri(uri))
692 normalize_uri(struct uri
*uri
, unsigned char *uristring
)
694 unsigned char *parse_string
= uristring
;
695 unsigned char *src
, *dest
, *path
;
696 int need_slash
= 0, keep_dslash
= 1;
697 int parse
= (uri
== NULL
);
698 struct uri uri_struct
;
700 if (!uri
) uri
= &uri_struct
;
702 /* We need to get the real (proxied) URI but lowercase relevant URI
703 * parts along the way. */
705 if (parse
&& parse_uri(uri
, parse_string
) != URI_ERRNO_OK
)
710 /* This is a maybe not the right place but both join_urls() and
711 * get_translated_uri() through translate_url() calls this
712 * function and then it already works on and modifies an
714 convert_to_lowercase_locale_indep(uri
->string
, uri
->protocollen
);
715 if (uri
->hostlen
) convert_to_lowercase_locale_indep(uri
->host
, uri
->hostlen
);
718 parse_string
= uri
->data
;
719 } while (uri
->protocol
== PROTOCOL_PROXY
);
721 if (get_protocol_free_syntax(uri
->protocol
))
724 if (uri
->protocol
!= PROTOCOL_UNKNOWN
) {
725 need_slash
= get_protocol_need_slash_after_host(uri
->protocol
);
726 keep_dslash
= get_protocol_keep_double_slashes(uri
->protocol
);
729 path
= uri
->data
- need_slash
;
732 /* This loop mangles the URI string by removing ".." and "." segments.
733 * However it must not alter "//" without reason; see bug 744. */
735 /* If the following pieces are the LAST parts of URL, we remove
736 * them as well. See RFC 2396 section 5.2 for details. */
738 if (end_of_dir(src
[0])) {
739 /* URL data contains no more path. */
740 memmove(dest
, src
, strlen(src
) + 1);
744 if (!is_uri_dir_sep(uri
, src
[0])) {
745 /* This is to reduce indentation */
747 } else if (src
[1] == '.') {
749 /* /. - skip the dot */
754 } else if (is_uri_dir_sep(uri
, src
[2])) {
755 /* /./ - strip that.. */
759 } else if (src
[2] == '.'
760 && (is_uri_dir_sep(uri
, src
[3]) || !src
[3])) {
761 /* /../ or /.. - skip it and preceding element.
763 * <path> "/foo/bar" <dest> ...
764 * <src> ("/../" or "/..\0") ...
766 * Remove "bar" and the directory
767 * separator that precedes it. The
768 * separator will be added back in the
769 * next iteration unless another ".."
770 * follows, in which case it will be
771 * added later. "bar" may be empty. */
773 while (dest
> path
) {
775 if (is_uri_dir_sep(uri
, *dest
)) break;
778 /* <path> "/foo" <dest> "/bar" ...
779 * <src> ("/../" or "/..\0") ... */
781 /* /.. - add ending slash and stop */
791 } else if (is_uri_dir_sep(uri
, src
[1]) && !keep_dslash
) {
792 /* // - ignore first '/'. */
797 /* We don't want to access memory past the NUL char. */
805 /* The 'file' scheme URI comes in and bastardized URI comes out which consists
806 * of just the complete path to file/directory, which the dumb 'file' protocol
807 * backend can understand. No host parts etc, that is what this function is
808 * supposed to chew. */
810 transform_file_url(struct uri
*uri
, const unsigned char *cwd
)
812 unsigned char *path
= uri
->data
;
814 assert(uri
->protocol
== PROTOCOL_FILE
&& uri
->data
);
816 /* Sort out the host part. We currently support only host "localhost"
817 * (plus empty host part will be assumed to be "localhost" as well).
818 * As our extensions, '.' will reference to the cwd on localhost
819 * (originally, when the first thing after file:// wasn't "localhost/",
820 * we assumed the cwd as well, and pretended that there's no host part
821 * at all) and '..' to the directory parent to cwd. Another extension
822 * is that if this is a DOS-like system, the first char in two-char
823 * host part is uppercase letter and the second char is a colon, it is
824 * assumed to be a local disk specification. */
825 /* TODO: Use FTP for non-localhost hosts. --pasky */
827 /* For URL "file://", we open the current directory. Some other
828 * browsers instead open root directory, but AFAIK the standard does
829 * not specify that and this was the original behaviour and it is more
830 * consistent with our file://./ notation. */
832 /* Who would name their file/dir '...' ? */
833 if (*path
== '.' || !*path
) {
836 if (!init_string(&dir
))
839 encode_uri_string(&dir
, cwd
, -1, 0);
841 /* Either we will end up with '//' and translate_directories()
842 * will shorten it or the '/' will mark the inserted cwd as a
844 if (*path
== '.') *path
= '/';
846 /* Insert the current working directory. */
847 /* The offset is 7 == sizeof("file://") - 1. */
848 insert_in_string(&struri(uri
), 7, dir
.source
, dir
.length
);
855 if (isasciialpha(path
[0]) && path
[1] == ':' && dir_sep(path
[2]))
859 for (; *path
&& !dir_sep(*path
); path
++);
861 /* FIXME: We will in fact assume localhost even for non-local hosts,
862 * until we will support the FTP transformation. --pasky */
864 memmove(uri
->data
, path
, strlen(path
) + 1);
868 static unsigned char *translate_url(unsigned char *url
, unsigned char *cwd
);
871 join_urls(struct uri
*base
, unsigned char *rel
)
873 unsigned char *uristring
, *path
;
879 /* TODO: Support for ';' ? (see the RFC) --pasky */
881 /* For '#', '?' and '//' we could use get_uri_string() but it might be
882 * too expensive since it uses granular allocation scheme. I wouldn't
883 * personally mind tho' because it would be cleaner. --jonas */
885 /* Strip fragment and post part from the base URI and append
886 * the fragment string in @rel. */
887 length
= base
->fragment
888 ? base
->fragment
- struri(base
) - 1
889 : get_real_uri_length(base
);
891 } else if (rel
[0] == '?') {
892 /* Strip query, fragment and post part from the base URI and
893 * append the query string in @rel. */
894 length
= base
->fragment
? base
->fragment
- struri(base
) - 1
895 : get_real_uri_length(base
);
897 uristring
= memchr(base
->data
, '?', base
->datalen
);
898 if (uristring
) length
= uristring
- struri(base
);
900 } else if (rel
[0] == '/' && rel
[1] == '/') {
901 if (!get_protocol_need_slashes(base
->protocol
))
904 /* Get `<protocol>:' from the base URI and append the `//' part
906 length
= base
->protocollen
+ 1;
908 /* We need to sanitize the relative part and add stuff like
913 /* If one of the tests above set @length to something useful */
915 uristring
= memacpy(struri(base
), length
);
916 if (!uristring
) return NULL
;
918 add_to_strn(&uristring
, rel
);
921 unsigned char *translated
;
923 translated
= translate_url(uristring
, NULL
);
927 return normalize_uri_reparse(uristring
);
930 /* Check if there is some protocol name to go for */
931 length
= get_protocol_length(rel
);
933 switch (get_protocol(rel
, length
)) {
934 case PROTOCOL_UNKNOWN
:
936 /* Mysteriously proxy URIs are breaking here ... */
940 /* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
941 * to translate_url(). */
943 uristring
= translate_url(rel
, NULL
);
944 if (uristring
) return uristring
;
948 assertm(base
->data
!= NULL
, "bad base url");
949 if_assert_failed
return NULL
;
953 /* Either is path blank, but we've slash char before, or path is not
954 * blank, but doesn't start by a slash (if we'd just stay along with
955 * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
956 * should be enough, but I'm not sure and I don't want to break
957 * anything --pasky). */
958 /* We skip first char of URL ('/') in parse_url() (ARGH). This
959 * is reason of all this bug-bearing magic.. */
961 if (!is_uri_dir_sep(base
, *path
)) path
--;
963 if (is_uri_dir_sep(base
, path
[-1])) path
--;
966 if (!is_uri_dir_sep(base
, rel
[0])) {
967 unsigned char *path_end
;
969 /* The URL is relative. */
972 /* There's no path in the URL, but we're going to add
973 * something there, and the something doesn't start by
974 * a slash. So we need to insert a slash after the base
975 * URL. Clever, eh? ;) */
979 for (path_end
= path
; *path_end
; path_end
++) {
980 if (end_of_dir(*path_end
)) break;
981 /* Modify the path pointer, so that it'll always point
982 * above the last '/' in the URL; later, we'll copy the
983 * URL only _TO_ this point, and anything after last
984 * slash will be substituted by 'rel'. */
985 if (is_uri_dir_sep(base
, *path_end
))
990 length
= path
- struri(base
);
991 uristring
= mem_alloc(length
+ strlen(rel
) + add_slash
+ 1);
992 if (!uristring
) return NULL
;
994 memcpy(uristring
, struri(base
), length
);
995 if (add_slash
) uristring
[length
] = '/';
996 strcpy(uristring
+ length
+ add_slash
, rel
);
998 return normalize_uri_reparse(uristring
);
1002 /* Tries to figure out what protocol @newurl might be specifying by checking if
1003 * it exists as a file locally or by checking parts of the host name. */
1004 static enum protocol
1005 find_uri_protocol(unsigned char *newurl
)
1009 /* First see if it is a file so filenames that look like hostnames
1010 * won't confuse us below. */
1011 if (check_whether_file_exists(newurl
) >= 0) return PROTOCOL_FILE
;
1013 /* Yes, it would be simpler to make test for IPv6 address first,
1014 * but it would result in confusing mix of ifdefs ;-). */
1015 /* FIXME: Ideas for improve protocol detection
1017 * - Handle common hostnames. It could be part of the protocol backend
1018 * structure. [ www -> http, irc -> irc, news -> nntp, ... ]
1020 * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
1023 ch
= newurl
+ strcspn(newurl
, ".:/@");
1025 || (*ch
== ':' && *newurl
!= '[' && strchr(newurl
, '@'))
1026 || !c_strncasecmp(newurl
, "ftp.", 4)) {
1027 /* Contains user/password/ftp-hostname */
1028 return PROTOCOL_FTP
;
1031 } else if (*newurl
== '[' && *ch
== ':') {
1032 /* Candidate for IPv6 address */
1033 unsigned char *bracket2
, *colon2
;
1036 bracket2
= strchr(ch
, ']');
1037 colon2
= strchr(ch
, ':');
1038 if (bracket2
&& colon2
&& bracket2
> colon2
)
1039 return PROTOCOL_HTTP
;
1042 } else if (*newurl
!= '.' && *ch
== '.') {
1043 /* Contains domain name? */
1044 unsigned char *host_end
, *domain
;
1045 unsigned char *ipscan
;
1047 /* Process the hostname */
1048 for (domain
= ch
+ 1;
1049 *(host_end
= domain
+ strcspn(domain
, ".:/?")) == '.';
1050 domain
= host_end
+ 1);
1053 for (ipscan
= ch
; isdigit(*ipscan
) || *ipscan
== '.';
1056 if (!*ipscan
|| *ipscan
== ':' || *ipscan
== '/')
1057 return PROTOCOL_HTTP
;
1059 /* It's two-letter or known TLD? */
1060 if (host_end
- domain
== 2
1061 || end_with_known_tld(domain
, host_end
- domain
) >= 0)
1062 return PROTOCOL_HTTP
;
1065 return PROTOCOL_UNKNOWN
;
1069 #define MAX_TRANSLATION_ATTEMPTS 32
1071 /* Returns an URI string that can be used internally. Adding protocol prefix,
1072 * missing slashes etc. */
1073 static unsigned char *
1074 translate_url(unsigned char *url
, unsigned char *cwd
)
1076 unsigned char *newurl
;
1078 enum uri_errno uri_errno
, prev_errno
= URI_ERRNO_EMPTY
;
1081 /* Strip starting spaces */
1082 while (*url
== ' ') url
++;
1083 if (!*url
) return NULL
;
1085 newurl
= expand_tilde(url
); /* XXX: Post data copy. */
1086 if (!newurl
) return NULL
;
1089 /* Yay a goto loop. If we get some URI parse error and try to
1090 * fix it we go back to here and try again. */
1091 /* Ordinary parse */
1092 uri_errno
= parse_uri(&uri
, newurl
);
1094 /* Bail out if the same error occurs twice */
1095 if (uri_errno
== prev_errno
|| retries
++ > MAX_TRANSLATION_ATTEMPTS
) {
1096 if (retries
> MAX_TRANSLATION_ATTEMPTS
) {
1097 ERROR("Maximum number of parsing attempts exceeded "
1104 prev_errno
= uri_errno
;
1106 switch (uri_errno
) {
1108 /* Fix translation of 1.2.3.4:5 so IP address part won't be
1109 * interpreted as the protocol name. */
1110 if (uri
.protocol
== PROTOCOL_UNKNOWN
) {
1111 enum protocol protocol
= find_uri_protocol(newurl
);
1113 /* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
1115 if (protocol
!= PROTOCOL_UNKNOWN
) {
1118 if (!init_string(&str
)) return NULL
;
1122 add_to_string(&str
, "ftp://");
1123 encode_uri_string(&str
, newurl
, -1, 0);
1127 add_to_string(&str
, "http://");
1128 add_to_string(&str
, newurl
);
1131 case PROTOCOL_UNKNOWN
:
1136 add_to_string(&str
, "file://");
1137 if (!dir_sep(*newurl
))
1138 add_to_string(&str
, "./");
1140 add_to_string(&str
, newurl
);
1144 newurl
= str
.source
;
1146 /* Work around the infinite loop prevention */
1147 prev_errno
= URI_ERRNO_EMPTY
;
1152 /* If file:// URI is transformed we need to reparse. */
1153 if (uri
.protocol
== PROTOCOL_FILE
&& cwd
&& *cwd
1154 && transform_file_url(&uri
, cwd
))
1155 return normalize_uri_reparse(struri(&uri
));
1157 /* Translate the proxied URI too if proxy:// */
1158 if (uri
.protocol
== PROTOCOL_PROXY
) {
1159 unsigned char *data
= translate_url(uri
.data
, cwd
);
1160 int pos
= uri
.data
- struri(&uri
);
1163 struri(&uri
)[pos
] = 0;
1164 insert_in_string(&struri(&uri
), pos
, data
, strlen(data
));
1166 return normalize_uri_reparse(struri(&uri
));
1169 return normalize_uri_noparse(&uri
);
1171 case URI_ERRNO_TOO_MANY_SLASHES
:
1173 unsigned char *from
, *to
;
1175 assert(uri
.string
[uri
.protocollen
] == ':'
1176 && uri
.string
[uri
.protocollen
+ 1] == '/'
1177 && uri
.string
[uri
.protocollen
+ 2] == '/');
1179 from
= to
= uri
.string
+ uri
.protocollen
+ 3;
1180 while (*from
== '/') from
++;
1183 memmove(to
, from
, strlen(from
) + 1);
1186 case URI_ERRNO_NO_SLASHES
:
1188 /* Try prefix:some.url -> prefix://some.url.. */
1191 /* Check if only one '/' is needed. */
1192 if (uri
.string
[uri
.protocollen
+ 1] == '/')
1195 insert_in_string(&newurl
, uri
.protocollen
+ 1, "//", slashes
);
1198 case URI_ERRNO_TRAILING_DOTS
:
1200 /* Trim trailing '.'s */
1201 unsigned char *from
= uri
.host
+ uri
.hostlen
;
1202 unsigned char *to
= from
;
1204 assert(uri
.host
< to
&& to
[-1] == '.' && *from
!= '.');
1206 while (uri
.host
< to
&& to
[-1] == '.') to
--;
1209 memmove(to
, from
, strlen(from
) + 1);
1212 case URI_ERRNO_NO_PORT_COLON
:
1213 assert(uri
.portlen
== 0
1214 && uri
.string
< uri
.port
1215 && uri
.port
[-1] == ':');
1217 memmove(uri
.port
- 1, uri
.port
, strlen(uri
.port
) + 1);
1220 case URI_ERRNO_NO_HOST_SLASH
:
1222 int offset
= uri
.port
1223 ? uri
.port
+ uri
.portlen
- struri(&uri
)
1224 : uri
.host
+ uri
.hostlen
- struri(&uri
) + uri
.ipv6
/* ']' */;
1226 assertm(uri
.host
!= NULL
, "uri.host not set after no host slash error");
1227 insert_in_string(&newurl
, offset
, "/", 1);
1230 case URI_ERRNO_INVALID_PROTOCOL
:
1232 /* No protocol name */
1233 enum protocol protocol
= find_uri_protocol(newurl
);
1236 if (!init_string(&str
)) return NULL
;
1240 add_to_string(&str
, "ftp://");
1241 encode_uri_string(&str
, newurl
, -1, 0);
1245 add_to_string(&str
, "http://");
1246 add_to_string(&str
, newurl
);
1249 case PROTOCOL_UNKNOWN
:
1250 /* We default to file:// even though we already
1251 * tested if the file existed since it will give
1252 * a "No such file or directory" error. which
1253 * might better hint the user that there was
1254 * problem figuring out the URI. */
1257 add_to_string(&str
, "file://");
1258 if (!dir_sep(*newurl
))
1259 add_to_string(&str
, "./");
1261 encode_file_uri_string(&str
, newurl
);
1265 newurl
= str
.source
;
1269 case URI_ERRNO_EMPTY
:
1270 case URI_ERRNO_IPV6_SECURITY
:
1271 case URI_ERRNO_NO_HOST
:
1272 case URI_ERRNO_INVALID_PORT
:
1273 case URI_ERRNO_INVALID_PORT_RANGE
:
1274 /* None of these can be handled properly. */
1284 get_composed_uri(struct uri
*uri
, enum uri_component components
)
1286 unsigned char *string
;
1289 if_assert_failed
return NULL
;
1291 string
= get_uri_string(uri
, components
);
1292 if (!string
) return NULL
;
1294 uri
= get_uri(string
, 0);
1301 get_translated_uri(unsigned char *uristring
, unsigned char *cwd
)
1305 uristring
= translate_url(uristring
, cwd
);
1306 if (!uristring
) return NULL
;
1308 uri
= get_uri(uristring
, 0);
1309 mem_free(uristring
);
/* Return the file name extension found in the data part of @uri.
 *
 * Walks uri->data until the terminating NUL or the first character for
 * which end_of_dir() is true, looking at '.' characters and at directory
 * separators as defined by is_uri_dir_sep(). If an extension position was
 * recorded and lies before the stop position, the span [extension, pos)
 * is handed to memacpy() and its result is returned.
 *
 * NOTE(review): the statements that update `extension` and `afterslash`
 * are not visible in this excerpt; presumably the extension begins at a
 * '.' that does not directly follow a separator -- confirm. */
1316 get_extension_from_uri(struct uri
*uri
)
1318 unsigned char *extension
= NULL
;
1320 unsigned char *pos
= uri
->data
;
/* Stop at the end of the string or at the end of the path part. */
1324 for (; *pos
&& !end_of_dir(*pos
); pos
++) {
/* A '.' is only considered while `afterslash` is clear and no
 * extension has been recorded yet. */
1325 if (!afterslash
&& !extension
&& *pos
== '.') {
1327 } else if (is_uri_dir_sep(uri
, *pos
)) {
/* Only a non-empty span in front of the stop position is returned. */
1335 if (extension
&& extension
< pos
)
1336 return memacpy(extension
, pos
- extension
);
1341 /* URI encoding, escaping unallowed characters. */
/* Tell whether @c may be copied into a URI without being escaped.
 *
 * Accepts everything isident() accepts, plus the characters
 * . ! ~ * ' ( ) listed here.
 * Returns 1 if @c is safe and 0 otherwise. */
static inline int
safe_char(unsigned char c)
{
	/* RFC 2396, Page 8, Section 2.3 ;-) */
	if (isident(c))
		return 1;

	switch (c) {
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 1;
	default:
		return 0;
	}
}
/* Append @name to @string, percent-encoding every unsafe character.
 *
 * @string           destination; encoded bytes are appended to it
 * @name             the text to encode
 * @namelen          number of bytes of @name to encode; a negative value
 *                   means "use strlen(@name)"
 * @convert_slashes  if zero, '/' is copied through unchanged; if non-zero
 *                   it is treated like any other character
 *
 * Characters accepted by safe_char() are copied verbatim. Every other
 * byte is emitted as '%' followed by the hx() digits of its high and low
 * nibble. */
void
encode_uri_string(struct string *string, const unsigned char *name, int namelen,
		  int convert_slashes)
{
	/* "%XX" plus a terminating NUL; only the first three bytes are
	 * ever appended. */
	unsigned char n[4];
	const unsigned char *end;

	n[0] = '%';
	n[3] = '\0';

	if (namelen < 0) namelen = strlen(name);

	for (end = name + namelen; name < end; name++) {
		/* A space is deliberately NOT turned into '+' here: that
		 * is probably correct only for the query part of a URI. */
		if (safe_char(*name) || (!convert_slashes && *name == '/')) {
			add_char_to_string(string, *name);
			continue;
		}

		/* Hex it. */
		n[1] = hx((((int) *name) & 0xF0) >> 4);
		n[2] = hx(((int) *name) & 0xF);
		add_bytes_to_string(string, n, sizeof(n) - 1);
	}
}
/* Append @name to @string, percent-encoding unsafe characters but letting
 * ':' and '\\' through unchanged in addition to what safe_char() accepts.
 *
 * @string   destination; encoded bytes are appended to it
 * @name     the text to encode
 * @namelen  number of bytes of @name to encode; a negative value means
 *           "use strlen(@name)" */
void
encode_win32_uri_string(struct string *string, unsigned char *name, int namelen)
{
	/* "%XX" plus a terminating NUL; the NUL itself is never appended. */
	unsigned char escape[4];
	unsigned char *stop;

	escape[0] = '%';
	escape[3] = '\0';

	if (namelen < 0)
		namelen = strlen(name);

	stop = name + namelen;
	while (name < stop) {
		if (safe_char(*name) || *name == ':' || *name == '\\') {
			add_char_to_string(string, *name);
		} else {
			/* High nibble first, then low nibble. */
			escape[1] = hx((((int) *name) & 0xF0) >> 4);
			escape[2] = hx(((int) *name) & 0xF);
			add_bytes_to_string(string, escape, sizeof(escape) - 1);
		}
		name++;
	}
}
1402 /* This function is evil, it modifies its parameter. */
1403 /* XXX: but decoded string is _never_ longer than encoded string so it's an
1404 * efficient way to do that, imho. --Zas */
/* Decode escape sequences in @src, writing the result back into the same
 * buffer (the write cursor `dst` starts at @src).
 *
 * For an escape, the two following characters are converted with unhx()
 * and combined into a single byte value (x1 << 4) + x2; a resulting value
 * of zero is not accepted. The loop runs until the NUL terminator has
 * been processed.
 *
 * NOTE(review): the lines that read the next character, detect the escape
 * introducer, validate the unhx() results and store through `dst` are not
 * visible in this excerpt, and neither is whether the '+' branch below is
 * compiled in -- confirm against the full file. */
1406 decode_uri(unsigned char *src
)
1408 unsigned char *dst
= src
;
/* First hex digit of the escape. */
1415 int x1
= unhx(*src
);
/* Second hex digit of the escape. */
1418 int x2
= unhx(*(src
+ 1));
/* Combine both nibbles into one byte value. */
1421 x1
= (x1
<< 4) + x2
;
1422 if (x1
!= 0) { /* don't allow %00 */
1423 c
= (unsigned char) x1
;
1430 } else if (c
== '+') {
1431 /* As the comment in encode_uri_string suggests, '+'
1432 * should only be decoded in the query part of a URI
1433 * (should that be 'URL'?). I'm not bold enough to
1434 * disable this code, tho. -- Miciah */
1440 } while (c
!= '\0');
1444 decode_uri_string(struct string
*string
)
1446 decode_uri(string
->source
);
1447 string
->length
= strlen(string
->source
);
/* Display-oriented processing of @src: singles out every character that
 * is not printable according to isprint() or that iscntrl() reports as a
 * control character.
 *
 * NOTE(review): neither a decoding step nor the action taken for such
 * characters is visible in this excerpt; presumably the string is decoded
 * first and offending characters are replaced by a placeholder --
 * confirm. */
1451 decode_uri_for_display(unsigned char *src
)
1456 if (!isprint(*src
) || iscntrl(*src
))
1461 decode_uri_string_for_display(struct string
*string
)
1463 decode_uri_for_display(string
->source
);
1464 string
->length
= strlen(string
->source
);
/* Granularity argument handed to mem_align_alloc() when a URI list has to
 * grow. */
#define URI_LIST_GRANULARITY 0x3

/* Make room for one more element in (list)->uris: asks mem_align_alloc()
 * to go from (list)->size to (list)->size + 1 elements. Evaluates to the
 * result of mem_align_alloc(). */
#define realloc_uri_list(list) \
	mem_align_alloc(&(list)->uris, \
			(list)->size, \
			(list)->size + 1, \
			URI_LIST_GRANULARITY)
1477 add_to_uri_list(struct uri_list
*list
, struct uri
*uri
)
1479 if (!realloc_uri_list(list
))
1482 list
->uris
[list
->size
++] = get_uri_reference(uri
);
/* Tear down the contents of @list.
 *
 * Does nothing when list->uris is NULL. Otherwise every stored URI is
 * visited with foreach_uri() and the array itself is released through
 * mem_free_set(), which is also passed NULL as the new value.
 *
 * NOTE(review): the loop body is not visible in this excerpt; presumably
 * it drops the reference taken when the URI was added -- confirm. */
1488 free_uri_list(struct uri_list
*list
)
/* Nothing was ever stored. */
1493 if (!list
->uris
) return;
1495 foreach_uri (uri
, index
, list
) {
1499 mem_free_set(&list
->uris
, NULL
);
/* One slot of the URI cache. get_uri_cache_entry() allocates it with
 * `length` extra bytes beyond sizeof(*entry) and copies the URI text into
 * `string`, so the one-element array is really the start of a
 * variable-sized buffer. */
1505 struct uri_cache_entry
{
/* Private copy of the URI text; it is what gets parsed and what is used
 * as the hash key in get_uri_cache_entry(). */
1507 unsigned char string
[1];
/* Use tracking for the cache as a whole: the cache is locked once for
 * every entry added in get_uri_cache_entry() and unlocked again in
 * done_uri(). */
1512 struct object object
;
/* The single, file-local URI cache. Its `map` hash is created on first
 * use in get_uri() and freed again once the cache is no longer in use. */
1515 static struct uri_cache uri_cache
;
/* Sanity check for a parsed URI: scans the protocol part (the first
 * uri->protocollen bytes of uri->string) and the host part (uri->hostlen
 * bytes starting at uri->host) and reports an INTERNAL() error naming the
 * URI if c_isupper() flags any of those bytes. */
1519 check_uri_sanity(struct uri
*uri
)
/* Protocol part. */
1523 for (pos
= 0; pos
< uri
->protocollen
; pos
++)
1524 if (c_isupper(uri
->string
[pos
])) goto error
;
/* Host part. */
1527 for (pos
= 0; pos
< uri
->hostlen
; pos
++)
1528 if (c_isupper(uri
->host
[pos
])) goto error
;
1531 INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri
));
/* Empty variant of check_uri_sanity().
 * NOTE(review): presumably selected by a preprocessor conditional that is
 * not visible in this excerpt -- confirm. */
1534 #define check_uri_sanity(uri)
1537 static inline struct uri_cache_entry
*
1538 get_uri_cache_entry(unsigned char *string
, int length
)
1540 struct uri_cache_entry
*entry
;
1541 struct hash_item
*item
;
1543 assert(string
&& length
> 0);
1544 if_assert_failed
return NULL
;
1546 item
= get_hash_item(uri_cache
.map
, string
, length
);
1547 if (item
) return item
->value
;
1549 /* Setup a new entry */
1551 entry
= mem_calloc(1, sizeof(*entry
) + length
);
1552 if (!entry
) return NULL
;
1554 object_nolock(&entry
->uri
, "uri");
1555 memcpy(&entry
->string
, string
, length
);
1556 string
= entry
->string
;
1558 if (parse_uri(&entry
->uri
, string
) != URI_ERRNO_OK
1559 || !add_hash_item(uri_cache
.map
, string
, length
, entry
)) {
1564 object_lock(&uri_cache
);
/* Obtain the URI object for @string.
 *
 * Two paths are visible. One parses @string into a local struct uri and
 * returns get_composed_uri() for the requested @components. The other
 * goes through the URI cache: the cache hash is created on first use, the
 * entry is fetched (or created) with get_uri_cache_entry(), and the
 * entry's URI is sanity-checked and locked with object_lock().
 *
 * NOTE(review): the condition choosing between the two paths, the check
 * on `entry` and the final return statement are not visible in this
 * excerpt; presumably a non-zero @components selects the first path and
 * the locked &entry->uri is returned -- confirm. */
1570 get_uri(unsigned char *string
, enum uri_component components
)
1572 struct uri_cache_entry
*entry
;
1579 if (parse_uri(&uri
, string
) != URI_ERRNO_OK
)
1582 return get_composed_uri(&uri
, components
);
/* First user: bring the cache hash into existence. */
1585 if (!is_object_used(&uri_cache
)) {
1586 uri_cache
.map
= init_hash8();
1587 if (!uri_cache
.map
) return NULL
;
1588 object_nolock(&uri_cache
, "uri_cache");
1591 entry
= get_uri_cache_entry(string
, strlen(string
));
/* Do not keep the hash around while the cache has no users. */
1593 if (!is_object_used(&uri_cache
))
1594 free_hash(&uri_cache
.map
);
1598 check_uri_sanity(&entry
->uri
);
1599 object_nolock(&entry
->uri
, "uri");
/* Account for the reference being handed out. */
1600 object_lock(&entry
->uri
);
/* Give back a URI that came from the URI cache.
 *
 * Returns early while @uri is still in use. Otherwise the cache entry
 * keyed by the URI's string is looked up and removed from uri_cache.map,
 * one lock on the cache is released, and the hash is freed once the cache
 * itself is no longer in use.
 *
 * NOTE(review): the statement that drops the caller's reference on @uri
 * and the one that frees the entry are not visible in this excerpt --
 * confirm. */
1606 done_uri(struct uri
*uri
)
1608 unsigned char *string
= struri(uri
);
1609 int length
= strlen(string
);
1610 struct hash_item
*item
;
1611 struct uri_cache_entry
*entry
;
/* A URI can only be released while the cache is alive. */
1613 assert(is_object_used(&uri_cache
));
/* Somebody else still holds a reference. */
1616 if (is_object_used(uri
)) return;
1618 item
= get_hash_item(uri_cache
.map
, string
, length
);
1619 entry
= item
? item
->value
: NULL
;
1621 assertm(entry
!= NULL
, "Releasing unknown URI [%s]", string
);
1622 del_hash_item(uri_cache
.map
, item
);
1625 /* Last URI frees the cache */
1626 object_unlock(&uri_cache
);
1627 if (!is_object_used(&uri_cache
))
1628 free_hash(&uri_cache
.map
);