[uri] Make URI parsing handle IPv6 addresses
[gpxe.git] / src / core / uri.c
blob1cbf1f1405b04074c63bf28fede3f3032cc95785
1 /*
2 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 FILE_LICENCE ( GPL2_OR_LATER );
21 /** @file
23 * Uniform Resource Identifiers
27 #include <stdint.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <libgen.h>
31 #include <ctype.h>
32 #include <gpxe/vsprintf.h>
33 #include <gpxe/uri.h>
35 /**
36 * Dump URI for debugging
38 * @v uri URI
40 static void dump_uri ( struct uri *uri ) {
41 if ( ! uri )
42 return;
43 if ( uri->scheme )
44 DBG ( " scheme \"%s\"", uri->scheme );
45 if ( uri->opaque )
46 DBG ( " opaque \"%s\"", uri->opaque );
47 if ( uri->user )
48 DBG ( " user \"%s\"", uri->user );
49 if ( uri->password )
50 DBG ( " password \"%s\"", uri->password );
51 if ( uri->host )
52 DBG ( " host \"%s\"", uri->host );
53 if ( uri->port )
54 DBG ( " port \"%s\"", uri->port );
55 if ( uri->path )
56 DBG ( " path \"%s\"", uri->path );
57 if ( uri->query )
58 DBG ( " query \"%s\"", uri->query );
59 if ( uri->fragment )
60 DBG ( " fragment \"%s\"", uri->fragment );
63 /**
64 * Parse URI
66 * @v uri_string URI as a string
67 * @ret uri URI
69 * Splits a URI into its component parts. The return URI structure is
70 * dynamically allocated and must eventually be freed by calling
71 * uri_put().
73 struct uri * parse_uri ( const char *uri_string ) {
74 struct uri *uri;
75 char *raw;
76 char *tmp;
77 char *path = NULL;
78 char *authority = NULL;
79 int i;
80 size_t raw_len;
82 /* Allocate space for URI struct and a copy of the string */
83 raw_len = ( strlen ( uri_string ) + 1 /* NUL */ );
84 uri = zalloc ( sizeof ( *uri ) + raw_len );
85 if ( ! uri )
86 return NULL;
87 raw = ( ( ( char * ) uri ) + sizeof ( *uri ) );
89 /* Copy in the raw string */
90 memcpy ( raw, uri_string, raw_len );
92 /* Start by chopping off the fragment, if it exists */
93 if ( ( tmp = strchr ( raw, '#' ) ) ) {
94 *(tmp++) = '\0';
95 uri->fragment = tmp;
98 /* Identify absolute/relative URI. We ignore schemes that are
99 * apparently only a single character long, since otherwise we
100 * misinterpret a DOS-style path name ("C:\path\to\file") as a
101 * URI with scheme="C",opaque="\path\to\file".
103 if ( ( tmp = strchr ( raw, ':' ) ) && ( tmp > ( raw + 1 ) ) ) {
104 /* Absolute URI: identify hierarchical/opaque */
105 uri->scheme = raw;
106 *(tmp++) = '\0';
107 if ( *tmp == '/' ) {
108 /* Absolute URI with hierarchical part */
109 path = tmp;
110 } else {
111 /* Absolute URI with opaque part */
112 uri->opaque = tmp;
114 } else {
115 /* Relative URI */
116 path = raw;
119 /* If we don't have a path (i.e. we have an absolute URI with
120 * an opaque portion, we're already finished processing
122 if ( ! path )
123 goto done;
125 /* Chop off the query, if it exists */
126 if ( ( tmp = strchr ( path, '?' ) ) ) {
127 *(tmp++) = '\0';
128 uri->query = tmp;
131 /* Identify net/absolute/relative path */
132 if ( strncmp ( path, "//", 2 ) != 0 ) {
133 /* Absolute/relative path */
134 uri->path = path;
135 } else {
136 /* Net path. If this is terminated by the first '/'
137 * of an absolute path, then we have no space for a
138 * terminator after the authority field, so shuffle
139 * the authority down by one byte, overwriting one of
140 * the two slashes.
142 authority = ( path + 2 );
143 if ( ( tmp = strchr ( authority, '/' ) ) ) {
144 /* Shuffle down */
145 uri->path = tmp;
146 memmove ( ( authority - 1 ), authority,
147 ( tmp - authority ) );
148 authority--;
149 *(--tmp) = '\0';
152 /* Split authority into user[:password] and host[:port] portions */
153 if ( ( tmp = strchr ( authority, '@' ) ) ) {
154 /* Has user[:password] */
155 *(tmp++) = '\0';
156 uri->host = tmp;
157 uri->user = authority;
158 if ( ( tmp = strchr ( authority, ':' ) ) ) {
159 /* Has password */
160 *(tmp++) = '\0';
161 uri->password = tmp;
163 } else {
164 /* No user:password */
165 uri->host = authority;
168 /* Split host into host[:port] */
169 if ( ( tmp = strchr ( uri->host, ':' ) ) ) {
170 /* Make sure an IPv6 address isn't broken up. */
171 if ( ( strchr ( uri->host, '[' ) == 0 ) ||
172 ( tmp > strchr ( uri->host, ']' ) ) ) {
173 *(tmp++) = '\0';
174 uri->port = tmp;
178 /* Handle IPv6 case. */
179 if ( ( uri->host <= strchr ( uri->host, '[' ) ) &&
180 ( tmp = strchr ( uri->host, ']' ) ) ) {
181 uri->host++;
182 *(tmp) = 0;
186 /* Decode fields that should be decoded */
187 for ( i = URI_FIRST_FIELD; i <= URI_LAST_FIELD; i++ ) {
188 const char *field = uri_get_field ( uri, i );
189 if ( field && ( URI_ENCODED & ( 1 << i ) ) )
190 uri_decode ( field, ( char * ) field,
191 strlen ( field ) + 1 /* NUL */ );
194 done:
195 DBG ( "URI \"%s\" split into", uri_string );
196 dump_uri ( uri );
197 DBG ( "\n" );
199 return uri;
203 * Get port from URI
205 * @v uri URI, or NULL
206 * @v default_port Default port to use if none specified in URI
207 * @ret port Port
209 unsigned int uri_port ( struct uri *uri, unsigned int default_port ) {
210 if ( ( ! uri ) || ( ! uri->port ) )
211 return default_port;
212 return ( strtoul ( uri->port, NULL, 0 ) );
216 * Unparse URI
218 * @v buf Buffer to fill with URI string
219 * @v size Size of buffer
220 * @v uri URI to write into buffer, or NULL
221 * @v fields Bitmask of fields to include in URI string, or URI_ALL
222 * @ret len Length of URI string
224 int unparse_uri ( char *buf, size_t size, struct uri *uri,
225 unsigned int fields ) {
226 /* List of characters that typically go before certain fields */
227 static char separators[] = { /* scheme */ 0, /* opaque */ ':',
228 /* user */ 0, /* password */ ':',
229 /* host */ '@', /* port */ ':',
230 /* path */ 0, /* query */ '?',
231 /* fragment */ '#' };
232 int used = 0;
233 int i;
235 DBG ( "URI unparsing" );
236 dump_uri ( uri );
237 DBG ( "\n" );
239 /* Ensure buffer is NUL-terminated */
240 if ( size )
241 buf[0] = '\0';
243 /* Special-case NULL URI */
244 if ( ! uri )
245 return 0;
247 /* Iterate through requested fields */
248 for ( i = URI_FIRST_FIELD; i <= URI_LAST_FIELD; i++ ) {
249 const char *field = uri_get_field ( uri, i );
250 char sep = separators[i];
252 /* Ensure `fields' only contains bits for fields that exist */
253 if ( ! field )
254 fields &= ~( 1 << i );
256 /* Store this field if we were asked to */
257 if ( fields & ( 1 << i ) ) {
258 /* Print :// if we're non-opaque and had a scheme */
259 if ( ( fields & URI_SCHEME_BIT ) &&
260 ( i > URI_OPAQUE ) ) {
261 used += ssnprintf ( buf + used, size - used,
262 "://" );
263 /* Only print :// once */
264 fields &= ~URI_SCHEME_BIT;
267 /* Only print separator if an earlier field exists */
268 if ( sep && ( fields & ( ( 1 << i ) - 1 ) ) )
269 used += ssnprintf ( buf + used, size - used,
270 "%c", sep );
272 /* Print contents of field, possibly encoded */
273 if ( URI_ENCODED & ( 1 << i ) )
274 used += uri_encode ( field, buf + used,
275 size - used, i );
276 else
277 used += ssnprintf ( buf + used, size - used,
278 "%s", field );
282 return used;
286 * Duplicate URI
288 * @v uri URI
289 * @ret uri Duplicate URI
291 * Creates a modifiable copy of a URI.
293 struct uri * uri_dup ( struct uri *uri ) {
294 size_t len = ( unparse_uri ( NULL, 0, uri, URI_ALL ) + 1 );
295 char buf[len];
297 unparse_uri ( buf, len, uri, URI_ALL );
298 return parse_uri ( buf );
/**
 * Resolve base+relative path
 *
 * @v base_path		Base path
 * @v relative_path	Relative path
 * @ret resolved_path	Resolved path, or NULL on allocation failure
 *
 * Takes a base path (e.g. "/var/lib/tftpboot/vmlinuz") and a relative
 * path (e.g. "initrd.gz") and produces a new path
 * (e.g. "/var/lib/tftpboot/initrd.gz").  Note that any non-directory
 * portion of the base path will automatically be stripped; this
 * matches the semantics used when resolving the path component of
 * URIs.
 *
 * The result is allocated by strdup() or asprintf(); the caller is
 * responsible for freeing it.
 */
char * resolve_path ( const char *base_path,
		      const char *relative_path ) {
	size_t base_len = ( strlen ( base_path ) + 1 /* NUL */ );
	char base_path_copy[base_len];
	char *dir = base_path_copy;
	const char *separator;
	char *resolved;

	/* If relative path is absolute, just re-use it */
	if ( relative_path[0] == '/' )
		return strdup ( relative_path );

	/* dirname() may modify its argument, so work on a copy */
	memcpy ( dir, base_path, base_len );
	dir = dirname ( dir );

	/* Consume leading "./" and "../" elements (and a trailing
	 * "." or "..").  Anything else that merely starts with a dot
	 * (e.g. ".hidden" or "...") is an ordinary name and ends the
	 * scan untouched.
	 */
	while ( relative_path[0] == '.' ) {
		if ( relative_path[1] == '\0' ) {
			/* Trailing "." */
			relative_path += 1;
		} else if ( relative_path[1] == '/' ) {
			/* "./" */
			relative_path += 2;
		} else if ( ( relative_path[1] == '.' ) &&
			    ( relative_path[2] == '\0' ) ) {
			/* Trailing ".." */
			dir = dirname ( dir );
			relative_path += 2;
		} else if ( ( relative_path[1] == '.' ) &&
			    ( relative_path[2] == '/' ) ) {
			/* "../" */
			dir = dirname ( dir );
			relative_path += 3;
		} else {
			break;
		}
	}

	/* Join with a '/' unless the directory already ends in one */
	separator = ( ( dir[ strlen ( dir ) - 1 ] == '/' ) ? "" : "/" );

	/* Create and return new path */
	if ( asprintf ( &resolved, "%s%s%s", dir, separator,
			relative_path ) < 0 )
		return NULL;

	return resolved;
}
364 * Resolve base+relative URI
366 * @v base_uri Base URI, or NULL
367 * @v relative_uri Relative URI
368 * @ret resolved_uri Resolved URI
370 * Takes a base URI (e.g. "http://etherboot.org/kernels/vmlinuz" and a
371 * relative URI (e.g. "../initrds/initrd.gz") and produces a new URI
372 * (e.g. "http://etherboot.org/initrds/initrd.gz").
374 struct uri * resolve_uri ( struct uri *base_uri,
375 struct uri *relative_uri ) {
376 struct uri tmp_uri;
377 char *tmp_path = NULL;
378 struct uri *new_uri;
380 /* If relative URI is absolute, just re-use it */
381 if ( uri_is_absolute ( relative_uri ) || ( ! base_uri ) )
382 return uri_get ( relative_uri );
384 /* Mangle URI */
385 memcpy ( &tmp_uri, base_uri, sizeof ( tmp_uri ) );
386 if ( relative_uri->path ) {
387 tmp_path = resolve_path ( ( base_uri->path ?
388 base_uri->path : "/" ),
389 relative_uri->path );
390 tmp_uri.path = tmp_path;
391 tmp_uri.query = relative_uri->query;
392 tmp_uri.fragment = relative_uri->fragment;
393 } else if ( relative_uri->query ) {
394 tmp_uri.query = relative_uri->query;
395 tmp_uri.fragment = relative_uri->fragment;
396 } else if ( relative_uri->fragment ) {
397 tmp_uri.fragment = relative_uri->fragment;
400 /* Create demangled URI */
401 new_uri = uri_dup ( &tmp_uri );
402 free ( tmp_path );
403 return new_uri;
407 * Test for unreserved URI characters
409 * @v c Character to test
410 * @v field Field of URI in which character lies
411 * @ret is_unreserved Character is an unreserved character
413 static int is_unreserved_uri_char ( int c, int field ) {
414 /* According to RFC3986, the unreserved character set is
416 * A-Z a-z 0-9 - _ . ~
418 * but we also pass & ; = in queries, / in paths,
419 * and everything in opaques
421 int ok = ( isupper ( c ) || islower ( c ) || isdigit ( c ) ||
422 ( c == '-' ) || ( c == '_' ) ||
423 ( c == '.' ) || ( c == '~' ) );
425 /* : is valid for an IPv6 host address */
426 if ( field == URI_HOST )
427 ok = ok || (c == ':');
429 if ( field == URI_QUERY )
430 ok = ok || ( c == ';' ) || ( c == '&' ) || ( c == '=' );
432 if ( field == URI_PATH )
433 ok = ok || ( c == '/' );
435 if ( field == URI_OPAQUE )
436 ok = 1;
438 return ok;
/**
 * URI-encode string
 *
 * @v raw_string	String to be URI-encoded
 * @v buf		Buffer to contain encoded string
 * @v len		Length of buffer
 * @v field		Field of URI in which string lies
 * @ret len		Length of encoded string (excluding NUL)
 *
 * Characters accepted by is_unreserved_uri_char() for this field are
 * copied as-is; every other character is written as "%XX".
 */
size_t uri_encode ( const char *raw_string, char *buf, ssize_t len,
		    int field ) {
	const char *src;
	ssize_t remaining = len;
	size_t used;
	unsigned char c;

	/* Start from an empty string if there is any room at all */
	if ( len > 0 ) {
		buf[0] = '\0';
	}

	for ( src = raw_string ; ( c = *src ) != '\0' ; src++ ) {
		if ( ! is_unreserved_uri_char ( c, field ) ) {
			used = ssnprintf ( buf, remaining, "%%%02X", c );
		} else {
			used = ssnprintf ( buf, remaining, "%c", c );
		}
		/* Advance by what this character produced */
		buf += used;
		remaining -= used;
	}

	return ( len - remaining );
}
/**
 * Convert a hexadecimal digit to its numeric value
 *
 * @v c			Character
 * @ret value		Value (0-15), or -1 if not a hexadecimal digit
 */
static int uri_hex_value ( char c ) {
	if ( ( c >= '0' ) && ( c <= '9' ) )
		return ( c - '0' );
	if ( ( c >= 'a' ) && ( c <= 'f' ) )
		return ( c - 'a' + 10 );
	if ( ( c >= 'A' ) && ( c <= 'F' ) )
		return ( c - 'A' + 10 );
	return -1;
}

/**
 * Decode URI-encoded string
 *
 * @v encoded_string	URI-encoded string
 * @v buf		Buffer to contain decoded string
 * @v len		Length of buffer
 * @ret len		Length of the complete decoded string (excluding
 *			NUL), even if it did not fit into the buffer
 *
 * This function may be used in-place, with @a buf the same as
 * @a encoded_string.
 *
 * A '%' followed by one or two hexadecimal digits is replaced by the
 * byte they encode.  A '%' that is not followed by a hexadecimal
 * digit is copied literally.  Output beyond the buffer is discarded;
 * the result is NUL-terminated whenever @a len is positive.
 */
size_t uri_decode ( const char *encoded_string, char *buf, ssize_t len ) {
	ssize_t remaining;
	unsigned char c;
	int digit;
	int count;

	for ( remaining = len; *encoded_string; remaining-- ) {
		/* The input byte is always consumed before the
		 * output byte is stored, so in-place decoding never
		 * overwrites input that has not been read yet.
		 */
		c = *(encoded_string++);
		if ( ( c == '%' ) &&
		     ( uri_hex_value ( *encoded_string ) >= 0 ) ) {
			/* Accumulate up to two hex digits */
			c = 0;
			for ( count = 0 ; count < 2 ; count++ ) {
				digit = uri_hex_value ( *encoded_string );
				if ( digit < 0 )
					break;
				c = ( ( c << 4 ) | digit );
				encoded_string++;
			}
		}
		/* Always leave room for the terminating NUL */
		if ( remaining > 1 )
			*buf++ = c;
	}

	if ( len > 0 )
		*buf = 0;

	return ( len - remaining );
}