src/core/uri.c - ipxe - Git at Google

 /*
  * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation; either version 2 of the
  * License, or any later version.
  *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  * 02110-1301, USA.
  *
  * You can also choose to distribute this program under the terms of
  * the Unmodified Binary Distribution Licence (as given in the file
  * COPYING.UBDL), provided that you have satisfied its requirements.
  */

 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

 /** @file
  *
  * Uniform Resource Identifiers
  *
  */

 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <libgen.h>
 #include <ctype.h>
 #include <ipxe/vsprintf.h>
 #include <ipxe/params.h>
 #include <ipxe/tcpip.h>
 #include <ipxe/uri.h>

 /**
  * Decode URI field
  *
  * @v encoded		Encoded field
  * @v buf		Data buffer
  * @v len		Length
  * @ret len		Length of data
  *
  * URI decoding can never increase the length of a string; we can
  * therefore safely decode in place.
  */
 size_t uri_decode ( const char *encoded, void *buf, size_t len ) {
 	uint8_t *out = buf;
 	unsigned int count = 0;
 	char hexbuf[3];
 	char *hexbuf_end;
 	char c;
 	char decoded;
 	unsigned int skip;

 	/* Copy string, decoding escaped characters as necessary */
 	while ( ( c = *(encoded++) ) ) {
 		if ( c == '%' ) {
 			snprintf ( hexbuf, sizeof ( hexbuf ), "%s", encoded );
 			decoded = strtoul ( hexbuf, &hexbuf_end, 16 );
 			skip = ( hexbuf_end - hexbuf );
 			encoded += skip;
 			if ( skip )
 				c = decoded;
 		}
 		if ( count < len )
 			out[count] = c;
 		count++;
 	}
 	return count;
 }

 /**
  * Decode URI field in-place
  *
  * @v uri		URI
  * @v field		URI field index
  */
 static void uri_decode_inplace ( struct uri *uri, unsigned int field ) {
 	const char *encoded = uri_field ( uri, field );
 	char *decoded = ( ( char * ) encoded );
 	size_t len;

 	/* Do nothing if field is not present */
 	if ( ! encoded )
 		return;

 	/* Decode field in place */
 	len = uri_decode ( encoded, decoded, strlen ( encoded ) );

 	/* Terminate decoded string */
 	decoded[len] = '\0';
 }

 /**
  * Check if character should be escaped within a URI field
  *
  * @v c			Character
  * @v field		URI field index
  * @ret escaped		Character should be escaped
  */
 static int uri_character_escaped ( char c, unsigned int field ) {

 	/* Non-printing characters and whitespace should always be
 	 * escaped, since they cannot sensibly be displayed as part of
 	 * a coherent URL string.  (This test also catches control
 	 * characters such as CR and LF, which could affect the
 	 * operation of line-based protocols such as HTTP.)
 	 *
 	 * We should also escape characters which would alter the
 	 * interpretation of the URL if not escaped, i.e. characters
 	 * which have significance to the URL parser.  We should not
 	 * blindly escape all such characters, because this would lead
 	 * to some very strange-looking URLs (e.g. if we were to
 	 * always escape '/' as "%2F" even within the URI path).
 	 *
 	 * We do not need to be perfect.  Our primary role is as a
 	 * consumer of URIs rather than a producer; the main situation
 	 * in which we produce a URI string is for display to a human
 	 * user, who can probably tolerate some variance from the
 	 * formal specification.  The only situation in which we
 	 * currently produce a URI string to be consumed by a computer
 	 * is when constructing an HTTP request URI, which contains
 	 * only the path and query fields.
 	 *
 	 * We can therefore sacrifice some correctness for the sake of
 	 * code size.  For example, colons within the URI host should
 	 * be escaped unless they form part of an IPv6 literal
 	 * address; doing this correctly would require the URI
 	 * formatter to be aware of whether or not the URI host
 	 * contained an IPv4 address, an IPv6 address, or a host name.
 	 * We choose to simplify and never escape colons within the
 	 * URI host field: in the event of a pathological hostname
 	 * containing colons, this could potentially produce a URI
 	 * string which could not be reparsed.
 	 *
 	 * After excluding non-printing characters, whitespace, and
 	 * '%', the full set of characters with significance to the
 	 * URL parser is "/#:@?".  We choose for each URI field which
 	 * of these require escaping in our use cases.
 	 *
 	 * For the scheme field (equivalently, if field is zero), we
 	 * escape anything that has significance not just for our URI
 	 * parser but for any other URI parsers (e.g. HTTP query
 	 * string parsers, which care about '=' and '&').
 	 */
 	static const char *escaped[URI_FIELDS] = {
 		/* Scheme or default: escape everything */
 		[URI_SCHEME]	= "/#:@?=&",
 		/* Opaque part: escape characters which would affect
 		 * the reparsing of the URI, allowing everything else
 		 * (e.g. ':', which will appear in iSCSI URIs).
 		 */
 		[URI_OPAQUE]	= "#",
 		/* User name: escape everything */
 		[URI_USER]	= "/#:@?",
 		/* Password: escape everything */
 		[URI_PASSWORD]	= "/#:@?",
 		/* Host name: escape everything except ':', which may
 		 * appear as part of an IPv6 literal address.
 		 */
 		[URI_HOST]	= "/#@?",
 		/* Port number: escape everything */
 		[URI_PORT]	= "/#:@?",
 		/* Path: escape everything except '/', which usually
 		 * appears within paths.
 		 */
 		[URI_PATH]	= "#:@?",
 		/* Query: escape everything except '/', which
 		 * sometimes appears within queries.
 		 */
 		[URI_QUERY]	= "#:@?",
 		/* Fragment: escape everything */
 		[URI_FRAGMENT]	= "/#:@?",
 	};

 	return ( /* Always escape non-printing characters and whitespace */
 		 ( ! isprint ( c ) ) || ( c == ' ' ) ||
 		 /* Always escape '%' */
 		 ( c == '%' ) ||
 		 /* Escape field-specific characters */
 		 strchr ( escaped[field], c ) );
 }

 /**
  * Encode URI field
  *
  * @v field		URI field index
  * @v raw		Raw data
  * @v raw_len		Length of raw data
  * @v buf		Buffer
  * @v len		Length of buffer
  * @ret len		Length of encoded string (excluding NUL)
  */
 size_t uri_encode ( unsigned int field, const void *raw, size_t raw_len,
 		    char *buf, ssize_t len ) {
 	const uint8_t *raw_bytes = ( ( const uint8_t * ) raw );
 	ssize_t remaining = len;
 	size_t used;
 	char c;

 	/* Ensure encoded string is NUL-terminated even if empty */
 	if ( len > 0 )
 		buf[0] = '\0';

 	/* Copy string, escaping as necessary */
 	while ( raw_len-- ) {
 		c = *(raw_bytes++);
 		if ( uri_character_escaped ( c, field ) ) {
 			used = ssnprintf ( buf, remaining, "%%%02X", c );
 		} else {
 			used = ssnprintf ( buf, remaining, "%c", c );
 		}
 		buf += used;
 		remaining -= used;
 	}

 	return ( len - remaining );
 }

 /**
  * Encode URI field string
  *
  * @v field		URI field index
  * @v string		String
  * @v buf		Buffer
  * @v len		Length of buffer
  * @ret len		Length of encoded string (excluding NUL)
  */
 size_t uri_encode_string ( unsigned int field, const char *string,
 			   char *buf, ssize_t len ) {

 	return uri_encode ( field, string, strlen ( string ), buf, len );
 }

 /**
  * Dump URI for debugging
  *
  * @v uri		URI
  */
 static void uri_dump ( const struct uri *uri ) {

 	if ( ! uri )
 		return;
 	if ( uri->scheme )
 		DBGC ( uri, " scheme \"%s\"", uri->scheme );
 	if ( uri->opaque )
 		DBGC ( uri, " opaque \"%s\"", uri->opaque );
 	if ( uri->user )
 		DBGC ( uri, " user \"%s\"", uri->user );
 	if ( uri->password )
 		DBGC ( uri, " password \"%s\"", uri->password );
 	if ( uri->host )
 		DBGC ( uri, " host \"%s\"", uri->host );
 	if ( uri->port )
 		DBGC ( uri, " port \"%s\"", uri->port );
 	if ( uri->path )
 		DBGC ( uri, " path \"%s\"", uri->path );
 	if ( uri->query )
 		DBGC ( uri, " query \"%s\"", uri->query );
 	if ( uri->fragment )
 		DBGC ( uri, " fragment \"%s\"", uri->fragment );
 	if ( uri->params )
 		DBGC ( uri, " params \"%s\"", uri->params->name );
 }

 /**
  * Free URI
  *
  * @v refcnt		Reference count
  */
 static void uri_free ( struct refcnt *refcnt ) {
 	struct uri *uri = container_of ( refcnt, struct uri, refcnt );

 	params_put ( uri->params );
 	free ( uri );
 }

 /**
  * Parse URI
  *
  * @v uri_string	URI as a string
  * @ret uri		URI
  *
  * Splits a URI into its component parts.  The return URI structure is
  * dynamically allocated and must eventually be freed by calling
  * uri_put().
  */
 struct uri * parse_uri ( const char *uri_string ) {
 	struct uri *uri;
 	struct parameters *params;
 	char *raw;
 	char *tmp;
 	char *path;
 	char *authority;
 	size_t raw_len;
 	unsigned int field;

 	/* Allocate space for URI struct and a copy of the string */
 	raw_len = ( strlen ( uri_string ) + 1 /* NUL */ );
 	uri = zalloc ( sizeof ( *uri ) + raw_len );
 	if ( ! uri )
 		return NULL;
 	ref_init ( &uri->refcnt, uri_free );
 	raw = ( ( ( void * ) uri ) + sizeof ( *uri ) );

 	/* Copy in the raw string */
 	memcpy ( raw, uri_string, raw_len );

 	/* Identify the parameter list, if present */
 	if ( ( tmp = strstr ( raw, "##params" ) ) ) {
 		*tmp = '\0';
 		tmp += 8 /* "##params" */;
 		params = find_parameters ( *tmp ? ( tmp + 1 ) : NULL );
 		if ( params ) {
 			uri->params = claim_parameters ( params );
 		} else {
 			/* Ignore non-existent submission blocks */
 		}
 	}

 	/* Chop off the fragment, if it exists */
 	if ( ( tmp = strchr ( raw, '#' ) ) ) {
 		*(tmp++) = '\0';
 		uri->fragment = tmp;
 	}

 	/* Identify absolute/relative URI */
 	if ( ( tmp = strchr ( raw, ':' ) ) ) {
 		/* Absolute URI: identify hierarchical/opaque */
 		uri->scheme = raw;
 		*(tmp++) = '\0';
 		if ( *tmp == '/' ) {
 			/* Absolute URI with hierarchical part */
 			path = tmp;
 		} else {
 			/* Absolute URI with opaque part */
 			uri->opaque = tmp;
 			path = NULL;
 		}
 	} else {
 		/* Relative URI */
 		path = raw;
 	}

 	/* If we don't have a path (i.e. we have an absolute URI with
 	 * an opaque portion, we're already finished processing
 	 */
 	if ( ! path )
 		goto done;

 	/* Chop off the query, if it exists */
 	if ( ( tmp = strchr ( path, '?' ) ) ) {
 		*(tmp++) = '\0';
 		uri->query = tmp;
 	}

 	/* If we have no path remaining, then we're already finished
 	 * processing.
 	 */
 	if ( ! path[0] )
 		goto done;

 	/* Identify net/absolute/relative path */
 	if ( uri->scheme && ( strncmp ( path, "//", 2 ) == 0 ) ) {
 		/* Net path.  If this is terminated by the first '/'
 		 * of an absolute path, then we have no space for a
 		 * terminator after the authority field, so shuffle
 		 * the authority down by one byte, overwriting one of
 		 * the two slashes.
 		 */
 		authority = ( path + 2 );
 		if ( ( tmp = strchr ( authority, '/' ) ) ) {
 			/* Shuffle down */
 			uri->path = tmp;
 			memmove ( ( authority - 1 ), authority,
 				  ( tmp - authority ) );
 			authority--;
 			*(--tmp) = '\0';
 		}
 	} else {
 		/* Absolute/relative path */
 		uri->path = path;
 		authority = NULL;
 	}

 	/* If we don't have an authority (i.e. we have a non-net
 	 * path), we're already finished processing
 	 */
 	if ( ! authority )
 		goto done;

 	/* Split authority into user[:password] and host[:port] portions */
 	if ( ( tmp = strchr ( authority, '@' ) ) ) {
 		/* Has user[:password] */
 		*(tmp++) = '\0';
 		uri->host = tmp;
 		uri->user = authority;
 		if ( ( tmp = strchr ( authority, ':' ) ) ) {
 			/* Has password */
 			*(tmp++) = '\0';
 			uri->password = tmp;
 		}
 	} else {
 		/* No user:password */
 		uri->host = authority;
 	}

 	/* Split host into host[:port] */
 	if ( ( tmp = strrchr ( uri->host, ':' ) ) &&
 	     ( uri->host[ strlen ( uri->host ) - 1 ] != ']' ) ) {
 		*(tmp++) = '\0';
 		uri->port = tmp;
 	}

  done:
 	/* Decode fields in-place */
 	for ( field = 0 ; field < URI_FIELDS ; field++ )
 		uri_decode_inplace ( uri, field );

 	DBGC ( uri, "URI parsed \"%s\" to", uri_string );
 	uri_dump ( uri );
 	DBGC ( uri, "\n" );

 	return uri;
 }

 /**
  * Get port from URI
  *
  * @v uri		URI, or NULL
  * @v default_port	Default port to use if none specified in URI
  * @ret port		Port
  */
 unsigned int uri_port ( const struct uri *uri, unsigned int default_port ) {

 	if ( ( ! uri ) || ( ! uri->port ) )
 		return default_port;

 	return ( strtoul ( uri->port, NULL, 0 ) );
 }

 /**
  * Format URI
  *
  * @v uri		URI
  * @v buf		Buffer to fill with URI string
  * @v size		Size of buffer
  * @ret len		Length of URI string
  */
 size_t format_uri ( const struct uri *uri, char *buf, size_t len ) {
 	static const char prefixes[URI_FIELDS] = {
 		[URI_PASSWORD] = ':',
 		[URI_PORT] = ':',
 		[URI_QUERY] = '?',
 		[URI_FRAGMENT] = '#',
 	};
 	char prefix;
 	size_t used = 0;
 	unsigned int field;

 	/* Ensure buffer is NUL-terminated */
 	if ( len )
 		buf[0] = '\0';

 	/* Special-case NULL URI */
 	if ( ! uri )
 		return 0;

 	/* Generate fields */
 	for ( field = 0 ; field < URI_FIELDS ; field++ ) {

 		/* Skip non-existent fields */
 		if ( ! uri_field ( uri, field ) )
 			continue;

 		/* Prefix this field, if applicable */
 		prefix = prefixes[field];
 		if ( ( field == URI_HOST ) && ( uri->user != NULL ) )
 			prefix = '@';
 		if ( prefix ) {
 			used += ssnprintf ( ( buf + used ), ( len - used ),
 					    "%c", prefix );
 		}

 		/* Encode this field */
 		used += uri_encode_string ( field, uri_field ( uri, field ),
 					    ( buf + used ), ( len - used ) );

 		/* Suffix this field, if applicable */
 		if ( field == URI_SCHEME ) {
 			used += ssnprintf ( ( buf + used ), ( len - used ),
 					    ":%s", ( uri->host ? "//" : "" ) );
 		}
 	}

 	if ( len ) {
 		DBGC ( uri, "URI formatted" );
 		uri_dump ( uri );
 		DBGC ( uri, " to \"%s%s\"\n", buf,
 		       ( ( used > len ) ? "<TRUNCATED>" : "" ) );
 	}

 	return used;
 }

 /**
  * Format URI
  *
  * @v uri		URI
  * @ret string		URI string, or NULL on failure
  *
  * The caller is responsible for eventually freeing the allocated
  * memory.
  */
 char * format_uri_alloc ( const struct uri *uri ) {
 	size_t len;
 	char *string;

 	len = ( format_uri ( uri, NULL, 0 ) + 1 /* NUL */ );
 	string = malloc ( len );
 	if ( string )
 		format_uri ( uri, string, len );
 	return string;
 }

 /**
  * Copy URI fields
  *
  * @v src		Source URI
  * @v dest		Destination URI, or NULL to calculate length
  * @ret len		Length of raw URI
  */
 static size_t uri_copy_fields ( const struct uri *src, struct uri *dest ) {
 	size_t len = sizeof ( *dest );
 	char *out = ( ( void * ) dest + len );
 	unsigned int field;
 	size_t field_len;

 	/* Copy existent fields */
 	for ( field = 0 ; field < URI_FIELDS ; field++ ) {

 		/* Skip non-existent fields */
 		if ( ! uri_field ( src, field ) )
 			continue;

 		/* Calculate field length */
 		field_len = ( strlen ( uri_field ( src, field ) )
 			      + 1 /* NUL */ );
 		len += field_len;

 		/* Copy field, if applicable */
 		if ( dest ) {
 			memcpy ( out, uri_field ( src, field ), field_len );
 			uri_field ( dest, field ) = out;
 			out += field_len;
 		}
 	}
 	return len;
 }

 /**
  * Duplicate URI
  *
  * @v uri		URI
  * @ret uri		Duplicate URI
  *
  * Creates a modifiable copy of a URI.
  */
 struct uri * uri_dup ( const struct uri *uri ) {
 	struct uri *dup;
 	size_t len;

 	/* Allocate new URI */
 	len = uri_copy_fields ( uri, NULL );
 	dup = zalloc ( len );
 	if ( ! dup )
 		return NULL;
 	ref_init ( &dup->refcnt, uri_free );

 	/* Copy fields */
 	uri_copy_fields ( uri, dup );

 	/* Copy parameters */
 	dup->params = params_get ( uri->params );

 	DBGC ( uri, "URI duplicated" );
 	uri_dump ( uri );
 	DBGC ( uri, "\n" );

 	return dup;
 }

 /**
  * Resolve base+relative path
  *
  * @v base_uri		Base path
  * @v relative_uri	Relative path
  * @ret resolved_uri	Resolved path, or NULL on failure
  *
  * Takes a base path (e.g. "/var/lib/tftpboot/vmlinuz" and a relative
  * path (e.g. "initrd.gz") and produces a new path
  * (e.g. "/var/lib/tftpboot/initrd.gz").  Note that any non-directory
  * portion of the base path will automatically be stripped; this
  * matches the semantics used when resolving the path component of
  * URIs.
  */
 char * resolve_path ( const char *base_path,
 		      const char *relative_path ) {
 	char *base_copy;
 	char *base_tmp;
 	char *resolved;

 	/* If relative path is absolute, just re-use it */
 	if ( relative_path[0] == '/' )
 		return strdup ( relative_path );

 	/* Create modifiable copy of path for dirname() */
 	base_copy = strdup ( base_path );
 	if ( ! base_copy )
 		return NULL;

 	/* Strip filename portion of base path */
 	base_tmp = dirname ( base_copy );

 	/* Process "./" and "../" elements */
 	while ( *relative_path == '.' ) {
 		relative_path++;
 		if ( *relative_path == 0 ) {
 			/* Do nothing */
 		} else if ( *relative_path == '/' ) {
 			relative_path++;
 		} else if ( *relative_path == '.' ) {
 			relative_path++;
 			if ( *relative_path == 0 ) {
 				base_tmp = dirname ( base_tmp );
 			} else if ( *relative_path == '/' ) {
 				base_tmp = dirname ( base_tmp );
 				relative_path++;
 			} else {
 				relative_path -= 2;
 				break;
 			}
 		} else {
 			relative_path--;
 			break;
 		}
 	}

 	/* Create and return new path */
 	if ( asprintf ( &resolved, "%s%s%s", base_tmp,
 			( ( base_tmp[ strlen ( base_tmp ) - 1 ] == '/' ) ?
 			  "" : "/" ), relative_path ) < 0 )
 		resolved = NULL;
 	free ( base_copy );
 	return resolved;
 }

 /**
  * Resolve base+relative URI
  *
  * @v base_uri		Base URI, or NULL
  * @v relative_uri	Relative URI
  * @ret resolved_uri	Resolved URI, or NULL on failure
  *
  * Takes a base URI (e.g. "http://ipxe.org/kernels/vmlinuz" and a
  * relative URI (e.g. "../initrds/initrd.gz") and produces a new URI
  * (e.g. "http://ipxe.org/initrds/initrd.gz").
  */
 struct uri * resolve_uri ( const struct uri *base_uri,
 			   struct uri *relative_uri ) {
 	struct uri tmp_uri;
 	char *tmp_path = NULL;
 	struct uri *new_uri;

 	/* If relative URI is absolute, just re-use it */
 	if ( uri_is_absolute ( relative_uri ) || ( ! base_uri ) )
 		return uri_get ( relative_uri );

 	/* Mangle URI */
 	memcpy ( &tmp_uri, base_uri, sizeof ( tmp_uri ) );
 	if ( relative_uri->path ) {
 		tmp_path = resolve_path ( ( base_uri->path ?
 					    base_uri->path : "/" ),
 					  relative_uri->path );
 		tmp_uri.path = tmp_path;
 		tmp_uri.query = relative_uri->query;
 		tmp_uri.fragment = relative_uri->fragment;
 		tmp_uri.params = relative_uri->params;
 	} else if ( relative_uri->query ) {
 		tmp_uri.query = relative_uri->query;
 		tmp_uri.fragment = relative_uri->fragment;
 		tmp_uri.params = relative_uri->params;
 	} else if ( relative_uri->fragment ) {
 		tmp_uri.fragment = relative_uri->fragment;
 		tmp_uri.params = relative_uri->params;
 	} else if ( relative_uri->params ) {
 		tmp_uri.params = relative_uri->params;
 	}

 	/* Create demangled URI */
 	new_uri = uri_dup ( &tmp_uri );
 	free ( tmp_path );
 	return new_uri;
 }

 /**
  * Construct TFTP URI from server address and filename
  *
  * @v sa_server		Server address
  * @v filename		Filename
  * @ret uri		URI, or NULL on failure
  */
 static struct uri * tftp_uri ( struct sockaddr *sa_server,
 			       const char *filename ) {
 	struct sockaddr_tcpip *st_server =
 		( ( struct sockaddr_tcpip * ) sa_server );
 	char buf[ 6 /* "65535" + NUL */ ];
 	char *path;
 	struct uri tmp;
 	struct uri *uri = NULL;

 	/* Initialise TFTP URI */
 	memset ( &tmp, 0, sizeof ( tmp ) );
 	tmp.scheme = "tftp";

 	/* Construct TFTP server address */
 	tmp.host = sock_ntoa ( sa_server );
 	if ( ! tmp.host )
 		goto err_host;

 	/* Construct TFTP server port, if applicable */
 	if ( st_server->st_port ) {
 		snprintf ( buf, sizeof ( buf ), "%d",
 			   ntohs ( st_server->st_port ) );
 		tmp.port = buf;
 	}

 	/* Construct TFTP path */
 	if ( asprintf ( &path, "/%s", filename ) < 0 )
 		goto err_path;
 	tmp.path = path;

 	/* Demangle URI */
 	uri = uri_dup ( &tmp );
 	if ( ! uri )
 		goto err_uri;

  err_uri:
 	free ( path );
  err_path:
  err_host:
 	return uri;
 }

 /**
  * Construct URI from server address and filename
  *
  * @v sa_server		Server address
  * @v filename		Filename
  * @ret uri		URI, or NULL on failure
  *
  * PXE TFTP filenames specified via the DHCP next-server field often
  * contain characters such as ':' or '#' which would confuse the
  * generic URI parser.  We provide a mechanism for directly
  * constructing a TFTP URI from the next-server and filename.
  */
 struct uri * pxe_uri ( struct sockaddr *sa_server, const char *filename ) {
 	struct uri *uri;

 	/* Fail if filename is empty */
 	if ( ! ( filename && filename[0] ) )
 		return NULL;

 	/* If filename is a hierarchical absolute URI, then use that
 	 * URI.  (We accept only hierarchical absolute URIs, since PXE
 	 * filenames sometimes start with DOS drive letters such as
 	 * "C:\", which get misinterpreted as opaque absolute URIs.)
 	 */
 	uri = parse_uri ( filename );
 	if ( uri && uri_is_absolute ( uri ) && ( ! uri->opaque ) )
 		return uri;
 	uri_put ( uri );

 	/* Otherwise, construct a TFTP URI directly */
 	return tftp_uri ( sa_server, filename );
 }
	/*
	* Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
	*
	* This program is free software; you can redistribute it and/or
	* modify it under the terms of the GNU General Public License as
	* published by the Free Software Foundation; either version 2 of the
	* License, or any later version.
	*
	* This program is distributed in the hope that it will be useful, but
	* WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
	* 02110-1301, USA.
	*
	* You can also choose to distribute this program under the terms of
	* the Unmodified Binary Distribution Licence (as given in the file
	* COPYING.UBDL), provided that you have satisfied its requirements.
	*/

	FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

	/** @file
	*
	* Uniform Resource Identifiers
	*
	*/

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <libgen.h>
	#include <ctype.h>
	#include <ipxe/vsprintf.h>
	#include <ipxe/params.h>
	#include <ipxe/tcpip.h>
	#include <ipxe/uri.h>

	/**
	* Decode URI field
	*
	* @v encoded Encoded field
	* @v buf Data buffer
	* @v len Length
	* @ret len Length of data
	*
	* URI decoding can never increase the length of a string; we can
	* therefore safely decode in place.
	*/
	size_t uri_decode ( const char encoded, void buf, size_t len ) {
	uint8_t *out = buf;
	unsigned int count = 0;
	char hexbuf[3];
	char *hexbuf_end;
	char c;
	char decoded;
	unsigned int skip;

	/* Copy string, decoding escaped characters as necessary */
	while ( ( c = *(encoded++) ) ) {
	if ( c == '%' ) {
	snprintf ( hexbuf, sizeof ( hexbuf ), "%s", encoded );
	decoded = strtoul ( hexbuf, &hexbuf_end, 16 );
	skip = ( hexbuf_end - hexbuf );
	encoded += skip;
	if ( skip )
	c = decoded;
	}
	if ( count < len )
	out[count] = c;
	count++;
	}
	return count;
	}

	/**
	* Decode URI field in-place
	*
	* @v uri URI
	* @v field URI field index
	*/
	static void uri_decode_inplace ( struct uri *uri, unsigned int field ) {
	const char *encoded = uri_field ( uri, field );
	char decoded = ( ( char ) encoded );
	size_t len;

	/* Do nothing if field is not present */
	if ( ! encoded )
	return;

	/* Decode field in place */
	len = uri_decode ( encoded, decoded, strlen ( encoded ) );

	/* Terminate decoded string */
	decoded[len] = '\0';
	}

	/**
	* Check if character should be escaped within a URI field
	*
	* @v c Character
	* @v field URI field index
	* @ret escaped Character should be escaped
	*/
	static int uri_character_escaped ( char c, unsigned int field ) {

	/* Non-printing characters and whitespace should always be
	* escaped, since they cannot sensibly be displayed as part of
	* a coherent URL string. (This test also catches control
	* characters such as CR and LF, which could affect the
	* operation of line-based protocols such as HTTP.)
	*
	* We should also escape characters which would alter the
	* interpretation of the URL if not escaped, i.e. characters
	* which have significance to the URL parser. We should not
	* blindly escape all such characters, because this would lead
	* to some very strange-looking URLs (e.g. if we were to
	* always escape '/' as "%2F" even within the URI path).
	*
	* We do not need to be perfect. Our primary role is as a
	* consumer of URIs rather than a producer; the main situation
	* in which we produce a URI string is for display to a human
	* user, who can probably tolerate some variance from the
	* formal specification. The only situation in which we
	* currently produce a URI string to be consumed by a computer
	* is when constructing an HTTP request URI, which contains
	* only the path and query fields.
	*
	* We can therefore sacrifice some correctness for the sake of
	* code size. For example, colons within the URI host should
	* be escaped unless they form part of an IPv6 literal
	* address; doing this correctly would require the URI
	* formatter to be aware of whether or not the URI host
	* contained an IPv4 address, an IPv6 address, or a host name.
	* We choose to simplify and never escape colons within the
	* URI host field: in the event of a pathological hostname
	* containing colons, this could potentially produce a URI
	* string which could not be reparsed.
	*
	* After excluding non-printing characters, whitespace, and
	* '%', the full set of characters with significance to the
	* URL parser is "/#:@?". We choose for each URI field which
	* of these require escaping in our use cases.
	*
	* For the scheme field (equivalently, if field is zero), we
	* escape anything that has significance not just for our URI
	* parser but for any other URI parsers (e.g. HTTP query
	* string parsers, which care about '=' and '&').
	*/
	static const char *escaped[URI_FIELDS] = {
	/* Scheme or default: escape everything */
	[URI_SCHEME] = "/#:@?=&",
	/* Opaque part: escape characters which would affect
	* the reparsing of the URI, allowing everything else
	* (e.g. ':', which will appear in iSCSI URIs).
	*/
	[URI_OPAQUE] = "#",
	/* User name: escape everything */
	[URI_USER] = "/#:@?",
	/* Password: escape everything */
	[URI_PASSWORD] = "/#:@?",
	/* Host name: escape everything except ':', which may
	* appear as part of an IPv6 literal address.
	*/
	[URI_HOST] = "/#@?",
	/* Port number: escape everything */
	[URI_PORT] = "/#:@?",
	/* Path: escape everything except '/', which usually
	* appears within paths.
	*/
	[URI_PATH] = "#:@?",
	/* Query: escape everything except '/', which
	* sometimes appears within queries.
	*/
	[URI_QUERY] = "#:@?",
	/* Fragment: escape everything */
	[URI_FRAGMENT] = "/#:@?",
	};

	return ( /* Always escape non-printing characters and whitespace */
	( ! isprint ( c ) ) \|\| ( c == ' ' ) \|\|
	/* Always escape '%' */
	( c == '%' ) \|\|
	/* Escape field-specific characters */
	strchr ( escaped[field], c ) );
	}

	/**
	* Encode URI field
	*
	* @v field URI field index
	* @v raw Raw data
	* @v raw_len Length of raw data
	* @v buf Buffer
	* @v len Length of buffer
	* @ret len Length of encoded string (excluding NUL)
	*/
	size_t uri_encode ( unsigned int field, const void *raw, size_t raw_len,
	char *buf, ssize_t len ) {
	const uint8_t raw_bytes = ( ( const uint8_t ) raw );
	ssize_t remaining = len;
	size_t used;
	char c;

	/* Ensure encoded string is NUL-terminated even if empty */
	if ( len > 0 )
	buf[0] = '\0';

	/* Copy string, escaping as necessary */
	while ( raw_len-- ) {
	c = *(raw_bytes++);
	if ( uri_character_escaped ( c, field ) ) {
	used = ssnprintf ( buf, remaining, "%%%02X", c );
	} else {
	used = ssnprintf ( buf, remaining, "%c", c );
	}
	buf += used;
	remaining -= used;
	}

	return ( len - remaining );
	}

	/**
	* Encode URI field string
	*
	* @v field URI field index
	* @v string String
	* @v buf Buffer
	* @v len Length of buffer
	* @ret len Length of encoded string (excluding NUL)
	*/
	size_t uri_encode_string ( unsigned int field, const char *string,
	char *buf, ssize_t len ) {

	return uri_encode ( field, string, strlen ( string ), buf, len );
	}

	/**
	* Dump URI for debugging
	*
	* @v uri URI
	*/
	static void uri_dump ( const struct uri *uri ) {

	if ( ! uri )
	return;
	if ( uri->scheme )
	DBGC ( uri, " scheme \"%s\"", uri->scheme );
	if ( uri->opaque )
	DBGC ( uri, " opaque \"%s\"", uri->opaque );
	if ( uri->user )
	DBGC ( uri, " user \"%s\"", uri->user );
	if ( uri->password )
	DBGC ( uri, " password \"%s\"", uri->password );
	if ( uri->host )
	DBGC ( uri, " host \"%s\"", uri->host );
	if ( uri->port )
	DBGC ( uri, " port \"%s\"", uri->port );
	if ( uri->path )
	DBGC ( uri, " path \"%s\"", uri->path );
	if ( uri->query )
	DBGC ( uri, " query \"%s\"", uri->query );
	if ( uri->fragment )
	DBGC ( uri, " fragment \"%s\"", uri->fragment );
	if ( uri->params )
	DBGC ( uri, " params \"%s\"", uri->params->name );
	}

	/**
	* Free URI
	*
	* @v refcnt Reference count
	*/
	static void uri_free ( struct refcnt *refcnt ) {
	struct uri *uri = container_of ( refcnt, struct uri, refcnt );

	params_put ( uri->params );
	free ( uri );
	}

	/**
	* Parse URI
	*
	* @v uri_string URI as a string
	* @ret uri URI
	*
	* Splits a URI into its component parts. The return URI structure is
	* dynamically allocated and must eventually be freed by calling
	* uri_put().
	*/
	struct uri * parse_uri ( const char *uri_string ) {
	struct uri *uri;
	struct parameters *params;
	char *raw;
	char *tmp;
	char *path;
	char *authority;
	size_t raw_len;
	unsigned int field;

	/* Allocate space for URI struct and a copy of the string */
	raw_len = ( strlen ( uri_string ) + 1 /* NUL */ );
	uri = zalloc ( sizeof ( *uri ) + raw_len );
	if ( ! uri )
	return NULL;
	ref_init ( &uri->refcnt, uri_free );
	raw = ( ( ( void * ) uri ) + sizeof ( *uri ) );

	/* Copy in the raw string */
	memcpy ( raw, uri_string, raw_len );

	/* Identify the parameter list, if present */
	if ( ( tmp = strstr ( raw, "##params" ) ) ) {
	*tmp = '\0';
	tmp += 8 /* "##params" */;
	params = find_parameters ( *tmp ? ( tmp + 1 ) : NULL );
	if ( params ) {
	uri->params = claim_parameters ( params );
	} else {
	/* Ignore non-existent submission blocks */
	}
	}

	/* Chop off the fragment, if it exists */
	if ( ( tmp = strchr ( raw, '#' ) ) ) {
	*(tmp++) = '\0';
	uri->fragment = tmp;
	}

	/* Identify absolute/relative URI */
	if ( ( tmp = strchr ( raw, ':' ) ) ) {
	/* Absolute URI: identify hierarchical/opaque */
	uri->scheme = raw;
	*(tmp++) = '\0';
	if ( *tmp == '/' ) {
	/* Absolute URI with hierarchical part */
	path = tmp;
	} else {
	/* Absolute URI with opaque part */
	uri->opaque = tmp;
	path = NULL;
	}
	} else {
	/* Relative URI */
	path = raw;
	}

	/* If we don't have a path (i.e. we have an absolute URI with
	* an opaque portion, we're already finished processing
	*/
	if ( ! path )
	goto done;

	/* Chop off the query, if it exists */
	if ( ( tmp = strchr ( path, '?' ) ) ) {
	*(tmp++) = '\0';
	uri->query = tmp;
	}

	/* If we have no path remaining, then we're already finished
	* processing.
	*/
	if ( ! path[0] )
	goto done;

	/* Identify net/absolute/relative path */
	if ( uri->scheme && ( strncmp ( path, "//", 2 ) == 0 ) ) {
	/* Net path. If this is terminated by the first '/'
	* of an absolute path, then we have no space for a
	* terminator after the authority field, so shuffle
	* the authority down by one byte, overwriting one of
	* the two slashes.
	*/
	authority = ( path + 2 );
	if ( ( tmp = strchr ( authority, '/' ) ) ) {
	/* Shuffle down */
	uri->path = tmp;
	memmove ( ( authority - 1 ), authority,
	( tmp - authority ) );
	authority--;
	*(--tmp) = '\0';
	}
	} else {
	/* Absolute/relative path */
	uri->path = path;
	authority = NULL;
	}

	/* If we don't have an authority (i.e. we have a non-net
	* path), we're already finished processing
	*/
	if ( ! authority )
	goto done;

	/* Split authority into user[:password] and host[:port] portions */
	if ( ( tmp = strchr ( authority, '@' ) ) ) {
	/* Has user[:password] */
	*(tmp++) = '\0';
	uri->host = tmp;
	uri->user = authority;
	if ( ( tmp = strchr ( authority, ':' ) ) ) {
	/* Has password */
	*(tmp++) = '\0';
	uri->password = tmp;
	}
	} else {
	/* No user:password */
	uri->host = authority;
	}

	/* Split host into host[:port] */
	if ( ( tmp = strrchr ( uri->host, ':' ) ) &&
	( uri->host[ strlen ( uri->host ) - 1 ] != ']' ) ) {
	*(tmp++) = '\0';
	uri->port = tmp;
	}

	done:
	/* Decode fields in-place */
	for ( field = 0 ; field < URI_FIELDS ; field++ )
	uri_decode_inplace ( uri, field );

	DBGC ( uri, "URI parsed \"%s\" to", uri_string );
	uri_dump ( uri );
	DBGC ( uri, "\n" );

	return uri;
	}

	/**
	* Get port from URI
	*
	* @v uri URI, or NULL
	* @v default_port Default port to use if none specified in URI
	* @ret port Port
	*/
	unsigned int uri_port ( const struct uri *uri, unsigned int default_port ) {

	if ( ( ! uri ) \|\| ( ! uri->port ) )
	return default_port;

	return ( strtoul ( uri->port, NULL, 0 ) );
	}

	/**
	* Format URI
	*
	* @v uri URI
	* @v buf Buffer to fill with URI string
	* @v size Size of buffer
	* @ret len Length of URI string
	*/
	size_t format_uri ( const struct uri uri, char buf, size_t len ) {
	static const char prefixes[URI_FIELDS] = {
	[URI_PASSWORD] = ':',
	[URI_PORT] = ':',
	[URI_QUERY] = '?',
	[URI_FRAGMENT] = '#',
	};
	char prefix;
	size_t used = 0;
	unsigned int field;

	/* Ensure buffer is NUL-terminated */
	if ( len )
	buf[0] = '\0';

	/* Special-case NULL URI */
	if ( ! uri )
	return 0;

	/* Generate fields */
	for ( field = 0 ; field < URI_FIELDS ; field++ ) {

	/* Skip non-existent fields */
	if ( ! uri_field ( uri, field ) )
	continue;

	/* Prefix this field, if applicable */
	prefix = prefixes[field];
	if ( ( field == URI_HOST ) && ( uri->user != NULL ) )
	prefix = '@';
	if ( prefix ) {
	used += ssnprintf ( ( buf + used ), ( len - used ),
	"%c", prefix );
	}

	/* Encode this field */
	used += uri_encode_string ( field, uri_field ( uri, field ),
	( buf + used ), ( len - used ) );

	/* Suffix this field, if applicable */
	if ( field == URI_SCHEME ) {
	used += ssnprintf ( ( buf + used ), ( len - used ),
	":%s", ( uri->host ? "//" : "" ) );
	}
	}

	if ( len ) {
	DBGC ( uri, "URI formatted" );
	uri_dump ( uri );
	DBGC ( uri, " to \"%s%s\"\n", buf,
	( ( used > len ) ? "<TRUNCATED>" : "" ) );
	}

	return used;
	}

	/**
	* Format URI
	*
	* @v uri URI
	* @ret string URI string, or NULL on failure
	*
	* The caller is responsible for eventually freeing the allocated
	* memory.
	*/
	char * format_uri_alloc ( const struct uri *uri ) {
	size_t len;
	char *string;

	len = ( format_uri ( uri, NULL, 0 ) + 1 /* NUL */ );
	string = malloc ( len );
	if ( string )
	format_uri ( uri, string, len );
	return string;
	}

	/**
	* Copy URI fields
	*
	* @v src Source URI
	* @v dest Destination URI, or NULL to calculate length
	* @ret len Length of raw URI
	*/
	static size_t uri_copy_fields ( const struct uri src, struct uri dest ) {
	size_t len = sizeof ( *dest );
	char out = ( ( void ) dest + len );
	unsigned int field;
	size_t field_len;

	/* Copy existent fields */
	for ( field = 0 ; field < URI_FIELDS ; field++ ) {

	/* Skip non-existent fields */
	if ( ! uri_field ( src, field ) )
	continue;

	/* Calculate field length */
	field_len = ( strlen ( uri_field ( src, field ) )
	+ 1 /* NUL */ );
	len += field_len;

	/* Copy field, if applicable */
	if ( dest ) {
	memcpy ( out, uri_field ( src, field ), field_len );
	uri_field ( dest, field ) = out;
	out += field_len;
	}
	}
	return len;
	}

	/**
	* Duplicate URI
	*
	* @v uri URI
	* @ret uri Duplicate URI
	*
	* Creates a modifiable copy of a URI.
	*/
	struct uri * uri_dup ( const struct uri *uri ) {
	struct uri *dup;
	size_t len;

	/* Allocate new URI */
	len = uri_copy_fields ( uri, NULL );
	dup = zalloc ( len );
	if ( ! dup )
	return NULL;
	ref_init ( &dup->refcnt, uri_free );

	/* Copy fields */
	uri_copy_fields ( uri, dup );

	/* Copy parameters */
	dup->params = params_get ( uri->params );

	DBGC ( uri, "URI duplicated" );
	uri_dump ( uri );
	DBGC ( uri, "\n" );

	return dup;
	}

	/**
	* Resolve base+relative path
	*
	* @v base_uri Base path
	* @v relative_uri Relative path
	* @ret resolved_uri Resolved path, or NULL on failure
	*
	* Takes a base path (e.g. "/var/lib/tftpboot/vmlinuz" and a relative
	* path (e.g. "initrd.gz") and produces a new path
	* (e.g. "/var/lib/tftpboot/initrd.gz"). Note that any non-directory
	* portion of the base path will automatically be stripped; this
	* matches the semantics used when resolving the path component of
	* URIs.
	*/
	char * resolve_path ( const char *base_path,
	const char *relative_path ) {
	char *base_copy;
	char *base_tmp;
	char *resolved;

	/* If relative path is absolute, just re-use it */
	if ( relative_path[0] == '/' )
	return strdup ( relative_path );

	/* Create modifiable copy of path for dirname() */
	base_copy = strdup ( base_path );
	if ( ! base_copy )
	return NULL;

	/* Strip filename portion of base path */
	base_tmp = dirname ( base_copy );

	/* Process "./" and "../" elements */
	while ( *relative_path == '.' ) {
	relative_path++;
	if ( *relative_path == 0 ) {
	/* Do nothing */
	} else if ( *relative_path == '/' ) {
	relative_path++;
	} else if ( *relative_path == '.' ) {
	relative_path++;
	if ( *relative_path == 0 ) {
	base_tmp = dirname ( base_tmp );
	} else if ( *relative_path == '/' ) {
	base_tmp = dirname ( base_tmp );
	relative_path++;
	} else {
	relative_path -= 2;
	break;
	}
	} else {
	relative_path--;
	break;
	}
	}

	/* Create and return new path */
	if ( asprintf ( &resolved, "%s%s%s", base_tmp,
	( ( base_tmp[ strlen ( base_tmp ) - 1 ] == '/' ) ?
	"" : "/" ), relative_path ) < 0 )
	resolved = NULL;
	free ( base_copy );
	return resolved;
	}

	/**
	* Resolve base+relative URI
	*
	* @v base_uri Base URI, or NULL
	* @v relative_uri Relative URI
	* @ret resolved_uri Resolved URI, or NULL on failure
	*
	* Takes a base URI (e.g. "http://ipxe.org/kernels/vmlinuz" and a
	* relative URI (e.g. "../initrds/initrd.gz") and produces a new URI
	* (e.g. "http://ipxe.org/initrds/initrd.gz").
	*/
	struct uri * resolve_uri ( const struct uri *base_uri,
	struct uri *relative_uri ) {
	struct uri tmp_uri;
	char *tmp_path = NULL;
	struct uri *new_uri;

	/* If relative URI is absolute, just re-use it */
	if ( uri_is_absolute ( relative_uri ) \|\| ( ! base_uri ) )
	return uri_get ( relative_uri );

	/* Mangle URI */
	memcpy ( &tmp_uri, base_uri, sizeof ( tmp_uri ) );
	if ( relative_uri->path ) {
	tmp_path = resolve_path ( ( base_uri->path ?
	base_uri->path : "/" ),
	relative_uri->path );
	tmp_uri.path = tmp_path;
	tmp_uri.query = relative_uri->query;
	tmp_uri.fragment = relative_uri->fragment;
	tmp_uri.params = relative_uri->params;
	} else if ( relative_uri->query ) {
	tmp_uri.query = relative_uri->query;
	tmp_uri.fragment = relative_uri->fragment;
	tmp_uri.params = relative_uri->params;
	} else if ( relative_uri->fragment ) {
	tmp_uri.fragment = relative_uri->fragment;
	tmp_uri.params = relative_uri->params;
	} else if ( relative_uri->params ) {
	tmp_uri.params = relative_uri->params;
	}

	/* Create demangled URI */
	new_uri = uri_dup ( &tmp_uri );
	free ( tmp_path );
	return new_uri;
	}

	/**
	* Construct TFTP URI from server address and filename
	*
	* @v sa_server Server address
	* @v filename Filename
	* @ret uri URI, or NULL on failure
	*/
	static struct uri * tftp_uri ( struct sockaddr *sa_server,
	const char *filename ) {
	struct sockaddr_tcpip *st_server =
	( ( struct sockaddr_tcpip * ) sa_server );
	char buf[ 6 /* "65535" + NUL */ ];
	char *path;
	struct uri tmp;
	struct uri *uri = NULL;

	/* Initialise TFTP URI */
	memset ( &tmp, 0, sizeof ( tmp ) );
	tmp.scheme = "tftp";

	/* Construct TFTP server address */
	tmp.host = sock_ntoa ( sa_server );
	if ( ! tmp.host )
	goto err_host;

	/* Construct TFTP server port, if applicable */
	if ( st_server->st_port ) {
	snprintf ( buf, sizeof ( buf ), "%d",
	ntohs ( st_server->st_port ) );
	tmp.port = buf;
	}

	/* Construct TFTP path */
	if ( asprintf ( &path, "/%s", filename ) < 0 )
	goto err_path;
	tmp.path = path;

	/* Demangle URI */
	uri = uri_dup ( &tmp );
	if ( ! uri )
	goto err_uri;

	err_uri:
	free ( path );
	err_path:
	err_host:
	return uri;
	}

	/**
	* Construct URI from server address and filename
	*
	* @v sa_server Server address
	* @v filename Filename
	* @ret uri URI, or NULL on failure
	*
	* PXE TFTP filenames specified via the DHCP next-server field often
	* contain characters such as ':' or '#' which would confuse the
	* generic URI parser. We provide a mechanism for directly
	* constructing a TFTP URI from the next-server and filename.
	*/
	struct uri * pxe_uri ( struct sockaddr sa_server, const char filename ) {
	struct uri *uri;

	/* Fail if filename is empty */
	if ( ! ( filename && filename[0] ) )
	return NULL;

	/* If filename is a hierarchical absolute URI, then use that
	* URI. (We accept only hierarchical absolute URIs, since PXE
	* filenames sometimes start with DOS drive letters such as
	* "C:\", which get misinterpreted as opaque absolute URIs.)
	*/
	uri = parse_uri ( filename );
	if ( uri && uri_is_absolute ( uri ) && ( ! uri->opaque ) )
	return uri;
	uri_put ( uri );

	/* Otherwise, construct a TFTP URI directly */
	return tftp_uri ( sa_server, filename );
	}