src/core/uri.c

   1 /*
   2  * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public License as
   6  * published by the Free Software Foundation; either version 2 of the
   7  * License, or any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17  */
  18
  19 FILE_LICENCE ( GPL2_OR_LATER );
  20
  21 /** @file
  22  *
  23  * Uniform Resource Identifiers
  24  *
  25  */
  26
  27 #include <stdint.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <libgen.h>
  31 #include <ctype.h>
  32 #include <gpxe/vsprintf.h>
  33 #include <gpxe/uri.h>
  34
  35 /**
  36  * Dump URI for debugging
  37  *
  38  * @v uri               URI
  39  */
  40 static void dump_uri ( struct uri *uri ) {
  41         if ( ! uri )
  42                 return;
  43         if ( uri->scheme )
  44                 DBG ( " scheme \"%s\"", uri->scheme );
  45         if ( uri->opaque )
  46                 DBG ( " opaque \"%s\"", uri->opaque );
  47         if ( uri->user )
  48                 DBG ( " user \"%s\"", uri->user );
  49         if ( uri->password )
  50                 DBG ( " password \"%s\"", uri->password );
  51         if ( uri->host )
  52                 DBG ( " host \"%s\"", uri->host );
  53         if ( uri->port )
  54                 DBG ( " port \"%s\"", uri->port );
  55         if ( uri->path )
  56                 DBG ( " path \"%s\"", uri->path );
  57         if ( uri->query )
  58                 DBG ( " query \"%s\"", uri->query );
  59         if ( uri->fragment )
  60                 DBG ( " fragment \"%s\"", uri->fragment );
  61 }
  62
  63 /**
  64  * Parse URI
  65  *
  66  * @v uri_string        URI as a string
  67  * @ret uri             URI
  68  *
  69  * Splits a URI into its component parts.  The return URI structure is
  70  * dynamically allocated and must eventually be freed by calling
  71  * uri_put().
  72  */
  73 struct uri * parse_uri ( const char *uri_string ) {
  74         struct uri *uri;
  75         char *raw;
  76         char *tmp;
  77         char *path = NULL;
  78         char *authority = NULL;
  79         int i;
  80         size_t raw_len;
  81
  82         /* Allocate space for URI struct and a copy of the string */
  83         raw_len = ( strlen ( uri_string ) + 1 /* NUL */ );
  84         uri = zalloc ( sizeof ( *uri ) + raw_len );
  85         if ( ! uri )
  86                 return NULL;
  87         raw = ( ( ( char * ) uri ) + sizeof ( *uri ) );
  88
  89         /* Copy in the raw string */
  90         memcpy ( raw, uri_string, raw_len );
  91
  92         /* Start by chopping off the fragment, if it exists */
  93         if ( ( tmp = strchr ( raw, '#' ) ) ) {
  94                 *(tmp++) = '\0';
  95                 uri->fragment = tmp;
  96         }
  97
  98         /* Identify absolute/relative URI.  We ignore schemes that are
  99          * apparently only a single character long, since otherwise we
 100          * misinterpret a DOS-style path name ("C:\path\to\file") as a
 101          * URI with scheme="C",opaque="\path\to\file".
 102          */
 103         if ( ( tmp = strchr ( raw, ':' ) ) && ( tmp > ( raw + 1 ) ) ) {
 104                 /* Absolute URI: identify hierarchical/opaque */
 105                 uri->scheme = raw;
 106                 *(tmp++) = '\0';
 107                 if ( *tmp == '/' ) {
 108                         /* Absolute URI with hierarchical part */
 109                         path = tmp;
 110                 } else {
 111                         /* Absolute URI with opaque part */
 112                         uri->opaque = tmp;
 113                 }
 114         } else {
 115                 /* Relative URI */
 116                 path = raw;
 117         }
 118
 119         /* If we don't have a path (i.e. we have an absolute URI with
 120          * an opaque portion, we're already finished processing
 121          */
 122         if ( ! path )
 123                 goto done;
 124
 125         /* Chop off the query, if it exists */
 126         if ( ( tmp = strchr ( path, '?' ) ) ) {
 127                 *(tmp++) = '\0';
 128                 uri->query = tmp;
 129         }
 130
 131         /* Identify net/absolute/relative path */
 132         if ( strncmp ( path, "//", 2 ) != 0 ) {
 133                 /* Absolute/relative path */
 134                 uri->path = path;
 135         } else {
 136                 /* Net path.  If this is terminated by the first '/'
 137                  * of an absolute path, then we have no space for a
 138                  * terminator after the authority field, so shuffle
 139                  * the authority down by one byte, overwriting one of
 140                  * the two slashes.
 141                  */
 142                 authority = ( path + 2 );
 143                 if ( ( tmp = strchr ( authority, '/' ) ) ) {
 144                         /* Shuffle down */
 145                         uri->path = tmp;
 146                         memmove ( ( authority - 1 ), authority,
 147                                   ( tmp - authority ) );
 148                         authority--;
 149                         *(--tmp) = '\0';
 150                 }
 151
 152                 /* Split authority into user[:password] and host[:port] portions */
 153                 if ( ( tmp = strchr ( authority, '@' ) ) ) {
 154                         /* Has user[:password] */
 155                         *(tmp++) = '\0';
 156                         uri->host = tmp;
 157                         uri->user = authority;
 158                         if ( ( tmp = strchr ( authority, ':' ) ) ) {
 159                                 /* Has password */
 160                                 *(tmp++) = '\0';
 161                                 uri->password = tmp;
 162                         }
 163                 } else {
 164                         /* No user:password */
 165                         uri->host = authority;
 166                 }
 167
 168                 /* Split host into host[:port] */
 169                 if ( ( tmp = strchr ( uri->host, ':' ) ) ) {
 170                         /* Make sure an IPv6 address isn't broken up. */
 171                         if ( ( strchr ( uri->host, '[' ) == 0 ) ||
 172                              ( tmp > strchr ( uri->host, ']' ) ) ) {
 173                                 *(tmp++) = '\0';
 174                                 uri->port = tmp;
 175                         }
 176                 }
 177
 178                 /* Handle IPv6 case. */
 179                 if ( ( uri->host <= strchr ( uri->host, '[' ) ) &&
 180                      ( tmp = strchr ( uri->host, ']' ) ) ) {
 181                         uri->host++;
 182                         *(tmp) = 0;
 183                 }
 184         }
 185
 186         /* Decode fields that should be decoded */
 187         for ( i = URI_FIRST_FIELD; i <= URI_LAST_FIELD; i++ ) {
 188                 const char *field = uri_get_field ( uri, i );
 189                 if ( field && ( URI_ENCODED & ( 1 << i ) ) )
 190                         uri_decode ( field, ( char * ) field,
 191                                      strlen ( field ) + 1 /* NUL */ );
 192         }
 193
 194  done:
 195         DBG ( "URI \"%s\" split into", uri_string );
 196         dump_uri ( uri );
 197         DBG ( "\n" );
 198
 199         return uri;
 200 }
 201
 202 /**
 203  * Get port from URI
 204  *
 205  * @v uri               URI, or NULL
 206  * @v default_port      Default port to use if none specified in URI
 207  * @ret port            Port
 208  */
 209 unsigned int uri_port ( struct uri *uri, unsigned int default_port ) {
 210         if ( ( ! uri ) || ( ! uri->port ) )
 211                 return default_port;
 212         return ( strtoul ( uri->port, NULL, 0 ) );
 213 }
 214
 215 /**
 216  * Unparse URI
 217  *
 218  * @v buf               Buffer to fill with URI string
 219  * @v size              Size of buffer
 220  * @v uri               URI to write into buffer, or NULL
 221  * @v fields            Bitmask of fields to include in URI string, or URI_ALL
 222  * @ret len             Length of URI string
 223  */
 224 int unparse_uri ( char *buf, size_t size, struct uri *uri,
 225                   unsigned int fields ) {
 226         /* List of characters that typically go before certain fields */
 227         static char separators[] = { /* scheme */ 0, /* opaque */ ':',
 228                                      /* user */ 0, /* password */ ':',
 229                                      /* host */ '@', /* port */ ':',
 230                                      /* path */ 0, /* query */ '?',
 231                                      /* fragment */ '#' };
 232         int used = 0;
 233         int i;
 234
 235         DBG ( "URI unparsing" );
 236         dump_uri ( uri );
 237         DBG ( "\n" );
 238
 239         /* Ensure buffer is NUL-terminated */
 240         if ( size )
 241                 buf[0] = '\0';
 242
 243         /* Special-case NULL URI */
 244         if ( ! uri )
 245                 return 0;
 246
 247         /* Iterate through requested fields */
 248         for ( i = URI_FIRST_FIELD; i <= URI_LAST_FIELD; i++ ) {
 249                 const char *field = uri_get_field ( uri, i );
 250                 char sep = separators[i];
 251
 252                 /* Ensure `fields' only contains bits for fields that exist */
 253                 if ( ! field )
 254                         fields &= ~( 1 << i );
 255
 256                 /* Store this field if we were asked to */
 257                 if ( fields & ( 1 << i ) ) {
 258                         /* Print :// if we're non-opaque and had a scheme */
 259                         if ( ( fields & URI_SCHEME_BIT ) &&
 260                              ( i > URI_OPAQUE ) ) {
 261                                 used += ssnprintf ( buf + used, size - used,
 262                                                     "://" );
 263                                 /* Only print :// once */
 264                                 fields &= ~URI_SCHEME_BIT;
 265                         }
 266
 267                         /* Only print separator if an earlier field exists */
 268                         if ( sep && ( fields & ( ( 1 << i ) - 1 ) ) )
 269                                 used += ssnprintf ( buf + used, size - used,
 270                                                     "%c", sep );
 271
 272                         /* Print contents of field, possibly encoded */
 273                         if ( URI_ENCODED & ( 1 << i ) )
 274                                 used += uri_encode ( field, buf + used,
 275                                                      size - used, i );
 276                         else
 277                                 used += ssnprintf ( buf + used, size - used,
 278                                                     "%s", field );
 279                 }
 280         }
 281
 282         return used;
 283 }
 284
 285 /**
 286  * Duplicate URI
 287  *
 288  * @v uri               URI
 289  * @ret uri             Duplicate URI
 290  *
 291  * Creates a modifiable copy of a URI.
 292  */
 293 struct uri * uri_dup ( struct uri *uri ) {
 294         size_t len = ( unparse_uri ( NULL, 0, uri, URI_ALL ) + 1 );
 295         char buf[len];
 296
 297         unparse_uri ( buf, len, uri, URI_ALL );
 298         return parse_uri ( buf );
 299 }
 300
 301 /**
 302  * Resolve base+relative path
 303  *
 304  * @v base_uri          Base path
 305  * @v relative_uri      Relative path
 306  * @ret resolved_uri    Resolved path
 307  *
 308  * Takes a base path (e.g. "/var/lib/tftpboot/vmlinuz" and a relative
 309  * path (e.g. "initrd.gz") and produces a new path
 310  * (e.g. "/var/lib/tftpboot/initrd.gz").  Note that any non-directory
 311  * portion of the base path will automatically be stripped; this
 312  * matches the semantics used when resolving the path component of
 313  * URIs.
 314  */
 315 char * resolve_path ( const char *base_path,
 316                       const char *relative_path ) {
 317         size_t base_len = ( strlen ( base_path ) + 1 );
 318         char base_path_copy[base_len];
 319         char *base_tmp = base_path_copy;
 320         char *resolved;
 321
 322         /* If relative path is absolute, just re-use it */
 323         if ( relative_path[0] == '/' )
 324                 return strdup ( relative_path );
 325
 326         /* Create modifiable copy of path for dirname() */
 327         memcpy ( base_tmp, base_path, base_len );
 328         base_tmp = dirname ( base_tmp );
 329
 330         /* Process "./" and "../" elements */
 331         while ( *relative_path == '.' ) {
 332                 relative_path++;
 333                 if ( *relative_path == 0 ) {
 334                         /* Do nothing */
 335                 } else if ( *relative_path == '/' ) {
 336                         relative_path++;
 337                 } else if ( *relative_path == '.' ) {
 338                         relative_path++;
 339                         if ( *relative_path == 0 ) {
 340                                 base_tmp = dirname ( base_tmp );
 341                         } else if ( *relative_path == '/' ) {
 342                                 base_tmp = dirname ( base_tmp );
 343                                 relative_path++;
 344                         } else {
 345                                 relative_path -= 2;
 346                                 break;
 347                         }
 348                 } else {
 349                         relative_path--;
 350                         break;
 351                 }
 352         }
 353
 354         /* Create and return new path */
 355         if ( asprintf ( &resolved, "%s%s%s", base_tmp,
 356                         ( ( base_tmp[ strlen ( base_tmp ) - 1 ] == '/' ) ?
 357                           "" : "/" ), relative_path ) < 0 )
 358                 return NULL;
 359
 360         return resolved;
 361 }
 362
 363 /**
 364  * Resolve base+relative URI
 365  *
 366  * @v base_uri          Base URI, or NULL
 367  * @v relative_uri      Relative URI
 368  * @ret resolved_uri    Resolved URI
 369  *
 370  * Takes a base URI (e.g. "http://etherboot.org/kernels/vmlinuz" and a
 371  * relative URI (e.g. "../initrds/initrd.gz") and produces a new URI
 372  * (e.g. "http://etherboot.org/initrds/initrd.gz").
 373  */
 374 struct uri * resolve_uri ( struct uri *base_uri,
 375                            struct uri *relative_uri ) {
 376         struct uri tmp_uri;
 377         char *tmp_path = NULL;
 378         struct uri *new_uri;
 379
 380         /* If relative URI is absolute, just re-use it */
 381         if ( uri_is_absolute ( relative_uri ) || ( ! base_uri ) )
 382                 return uri_get ( relative_uri );
 383
 384         /* Mangle URI */
 385         memcpy ( &tmp_uri, base_uri, sizeof ( tmp_uri ) );
 386         if ( relative_uri->path ) {
 387                 tmp_path = resolve_path ( ( base_uri->path ?
 388                                             base_uri->path : "/" ),
 389                                           relative_uri->path );
 390                 tmp_uri.path = tmp_path;
 391                 tmp_uri.query = relative_uri->query;
 392                 tmp_uri.fragment = relative_uri->fragment;
 393         } else if ( relative_uri->query ) {
 394                 tmp_uri.query = relative_uri->query;
 395                 tmp_uri.fragment = relative_uri->fragment;
 396         } else if ( relative_uri->fragment ) {
 397                 tmp_uri.fragment = relative_uri->fragment;
 398         }
 399
 400         /* Create demangled URI */
 401         new_uri = uri_dup ( &tmp_uri );
 402         free ( tmp_path );
 403         return new_uri;
 404 }
 405
 406 /**
 407  * Test for unreserved URI characters
 408  *
 409  * @v c                 Character to test
 410  * @v field             Field of URI in which character lies
 411  * @ret is_unreserved   Character is an unreserved character
 412  */
 413 static int is_unreserved_uri_char ( int c, int field ) {
 414         /* According to RFC3986, the unreserved character set is
 415          *
 416          * A-Z a-z 0-9 - _ . ~
 417          *
 418          * but we also pass & ; = in queries, / in paths,
 419          * and everything in opaques
 420          */
 421         int ok = ( isupper ( c ) || islower ( c ) || isdigit ( c ) ||
 422                     ( c == '-' ) || ( c == '_' ) ||
 423                     ( c == '.' ) || ( c == '~' ) );
 424
 425         /* : is valid for an IPv6 host address */
 426         if ( field == URI_HOST )
 427                 ok = ok || (c == ':');
 428
 429         if ( field == URI_QUERY )
 430                 ok = ok || ( c == ';' ) || ( c == '&' ) || ( c == '=' );
 431
 432         if ( field == URI_PATH )
 433                 ok = ok || ( c == '/' );
 434
 435         if ( field == URI_OPAQUE )
 436                 ok = 1;
 437
 438         return ok;
 439 }
 440
 441 /**
 442  * URI-encode string
 443  *
 444  * @v raw_string        String to be URI-encoded
 445  * @v buf               Buffer to contain encoded string
 446  * @v len               Length of buffer
 447  * @v field             Field of URI in which string lies
 448  * @ret len             Length of encoded string (excluding NUL)
 449  */
 450 size_t uri_encode ( const char *raw_string, char *buf, ssize_t len,
 451                     int field ) {
 452         ssize_t remaining = len;
 453         size_t used;
 454         unsigned char c;
 455
 456         if ( len > 0 )
 457                 buf[0] = '\0';
 458
 459         while ( ( c = *(raw_string++) ) ) {
 460                 if ( is_unreserved_uri_char ( c, field ) ) {
 461                         used = ssnprintf ( buf, remaining, "%c", c );
 462                 } else {
 463                         used = ssnprintf ( buf, remaining, "%%%02X", c );
 464                 }
 465                 buf += used;
 466                 remaining -= used;
 467         }
 468
 469         return ( len - remaining );
 470 }
 471
 472 /**
 473  * Decode URI-encoded string
 474  *
 475  * @v encoded_string    URI-encoded string
 476  * @v buf               Buffer to contain decoded string
 477  * @v len               Length of buffer
 478  * @ret len             Length of decoded string (excluding NUL)
 479  *
 480  * This function may be used in-place, with @a buf the same as
 481  * @a encoded_string.
 482  */
 483 size_t uri_decode ( const char *encoded_string, char *buf, ssize_t len ) {
 484         ssize_t remaining;
 485         char hexbuf[3];
 486         char *hexbuf_end;
 487         unsigned char c;
 488
 489         for ( remaining = len; *encoded_string; remaining-- ) {
 490                 if ( *encoded_string == '%' ) {
 491                         encoded_string++;
 492                         snprintf ( hexbuf, sizeof ( hexbuf ), "%s",
 493                                    encoded_string );
 494                         c = strtoul ( hexbuf, &hexbuf_end, 16 );
 495                         encoded_string += ( hexbuf_end - hexbuf );
 496                 } else {
 497                         c = *(encoded_string++);
 498                 }
 499                 if ( remaining > 1 )
 500                         *buf++ = c;
 501         }
 502
 503         if ( len )
 504                 *buf = 0;
 505
 506         return ( len - remaining );
 507 }