[uri] Retain original encodings for path, query, and fragment fields

iPXE decodes any percent-encoded characters during the URI parsing
stage, thereby allowing protocol implementations to consume the raw
field values directly without further decoding.

When reconstructing a URI string for use in an HTTP request line, the
percent-encoding is currently reapplied in a reversible way: we
guarantee that our reconstructed URI string could be decoded to give
the same raw field values.

This technically violates RFC3986, which states that "URIs that differ
in the replacement of a reserved character with its corresponding
percent-encoded octet are not equivalent".  Experiments show that
several HTTP server applications will attach meaning to the choice of
whether or not a particular character was percent-encoded, even when
the percent-encoding is unnecessary from the perspective of parsing
the URI into its component fields.

Fix by storing the originally encoded substrings for the path, query,
and fragment fields and using these original encoded versions when
reconstructing a URI string.  The path field is also stored as a
decoded string, for use by protocols such as TFTP that communicate
using raw strings rather than URI-encoded strings.  All other fields
(such as the username and password) continue to be stored only in
their decoded versions since nothing ever needs to know the originally
encoded versions of these fields.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
diff --git a/src/core/uri.c b/src/core/uri.c
index e9e512a..a0f79e9 100644
--- a/src/core/uri.c
+++ b/src/core/uri.c
@@ -79,12 +79,10 @@
 /**
  * Decode URI field in-place
  *
- * @v uri		URI
- * @v field		URI field index
+ * @v encoded		Encoded field, or NULL
  */
-static void uri_decode_inplace ( struct uri *uri, unsigned int field ) {
-	const char *encoded = uri_field ( uri, field );
-	char *decoded = ( ( char * ) encoded );
+static void uri_decode_inplace ( char *encoded ) {
+	char *decoded = encoded;
 	size_t len;
 
 	/* Do nothing if field is not present */
@@ -150,7 +148,7 @@
 	 * parser but for any other URI parsers (e.g. HTTP query
 	 * string parsers, which care about '=' and '&').
 	 */
-	static const char *escaped[URI_FIELDS] = {
+	static const char *escaped[URI_EPATH] = {
 		/* Scheme or default: escape everything */
 		[URI_SCHEME]	= "/#:@?=&",
 		/* Opaque part: escape characters which would affect
@@ -172,20 +170,21 @@
 		 * appears within paths.
 		 */
 		[URI_PATH]	= "#:@?",
-		/* Query: escape everything except '/', which
-		 * sometimes appears within queries.
-		 */
-		[URI_QUERY]	= "#:@?",
-		/* Fragment: escape everything */
-		[URI_FRAGMENT]	= "/#:@?",
 	};
 
-	return ( /* Always escape non-printing characters and whitespace */
-		 ( ! isprint ( c ) ) || ( c == ' ' ) ||
-		 /* Always escape '%' */
-		 ( c == '%' ) ||
-		 /* Escape field-specific characters */
-		 strchr ( escaped[field], c ) );
+	/* Always escape non-printing characters and whitespace */
+	if ( ( ! isprint ( c ) ) || ( c == ' ' ) )
+		return 1;
+
+	/* Escape nothing else in already-escaped fields */
+	if ( field >= URI_EPATH )
+		return 0;
+
+	/* Escape '%' and any field-specific characters */
+	if ( ( c == '%' ) || strchr ( escaped[field], c ) )
+		return 1;
+
+	return 0;
 }
 
 /**
@@ -262,10 +261,12 @@
 		DBGC ( uri, " port \"%s\"", uri->port );
 	if ( uri->path )
 		DBGC ( uri, " path \"%s\"", uri->path );
-	if ( uri->query )
-		DBGC ( uri, " query \"%s\"", uri->query );
-	if ( uri->fragment )
-		DBGC ( uri, " fragment \"%s\"", uri->fragment );
+	if ( uri->epath )
+		DBGC ( uri, " epath \"%s\"", uri->epath );
+	if ( uri->equery )
+		DBGC ( uri, " equery \"%s\"", uri->equery );
+	if ( uri->efragment )
+		DBGC ( uri, " efragment \"%s\"", uri->efragment );
 	if ( uri->params )
 		DBGC ( uri, " params \"%s\"", uri->params->name );
 }
@@ -298,17 +299,19 @@
 	char *raw;
 	char *tmp;
 	char *path;
+	char *epath;
 	char *authority;
 	size_t raw_len;
 	unsigned int field;
 
-	/* Allocate space for URI struct and a copy of the string */
+	/* Allocate space for URI struct and two copies of the string */
 	raw_len = ( strlen ( uri_string ) + 1 /* NUL */ );
-	uri = zalloc ( sizeof ( *uri ) + raw_len );
+	uri = zalloc ( sizeof ( *uri ) + ( 2 * raw_len ) );
 	if ( ! uri )
 		return NULL;
 	ref_init ( &uri->refcnt, uri_free );
 	raw = ( ( ( void * ) uri ) + sizeof ( *uri ) );
+	path = ( raw + raw_len );
 
 	/* Copy in the raw string */
 	memcpy ( raw, uri_string, raw_len );
@@ -328,7 +331,7 @@
 	/* Chop off the fragment, if it exists */
 	if ( ( tmp = strchr ( raw, '#' ) ) ) {
 		*(tmp++) = '\0';
-		uri->fragment = tmp;
+		uri->efragment = tmp;
 	}
 
 	/* Identify absolute/relative URI */
@@ -338,47 +341,47 @@
 		*(tmp++) = '\0';
 		if ( *tmp == '/' ) {
 			/* Absolute URI with hierarchical part */
-			path = tmp;
+			epath = tmp;
 		} else {
 			/* Absolute URI with opaque part */
 			uri->opaque = tmp;
-			path = NULL;
+			epath = NULL;
 		}
 	} else {
 		/* Relative URI */
-		path = raw;
+		epath = raw;
 	}
 
 	/* If we don't have a path (i.e. we have an absolute URI with
 	 * an opaque portion, we're already finished processing
 	 */
-	if ( ! path )
+	if ( ! epath )
 		goto done;
 
 	/* Chop off the query, if it exists */
-	if ( ( tmp = strchr ( path, '?' ) ) ) {
+	if ( ( tmp = strchr ( epath, '?' ) ) ) {
 		*(tmp++) = '\0';
-		uri->query = tmp;
+		uri->equery = tmp;
 	}
 
 	/* If we have no path remaining, then we're already finished
 	 * processing.
 	 */
-	if ( ! path[0] )
+	if ( ! epath[0] )
 		goto done;
 
 	/* Identify net/absolute/relative path */
-	if ( uri->scheme && ( strncmp ( path, "//", 2 ) == 0 ) ) {
+	if ( uri->scheme && ( strncmp ( epath, "//", 2 ) == 0 ) ) {
 		/* Net path.  If this is terminated by the first '/'
 		 * of an absolute path, then we have no space for a
 		 * terminator after the authority field, so shuffle
 		 * the authority down by one byte, overwriting one of
 		 * the two slashes.
 		 */
-		authority = ( path + 2 );
+		authority = ( epath + 2 );
 		if ( ( tmp = strchr ( authority, '/' ) ) ) {
 			/* Shuffle down */
-			uri->path = tmp;
+			uri->epath = tmp;
 			memmove ( ( authority - 1 ), authority,
 				  ( tmp - authority ) );
 			authority--;
@@ -386,10 +389,16 @@
 		}
 	} else {
 		/* Absolute/relative path */
-		uri->path = path;
+		uri->epath = epath;
 		authority = NULL;
 	}
 
+	/* Create copy of path for decoding */
+	if ( uri->epath ) {
+		strcpy ( path, uri->epath );
+		uri->path = path;
+	}
+
 	/* If we don't have an authority (i.e. we have a non-net
 	 * path), we're already finished processing
 	 */
@@ -421,8 +430,8 @@
 
  done:
 	/* Decode fields in-place */
-	for ( field = 0 ; field < URI_FIELDS ; field++ )
-		uri_decode_inplace ( uri, field );
+	for ( field = 0 ; field < URI_EPATH ; field++ )
+		uri_decode_inplace ( ( char * ) uri_field ( uri, field ) );
 
 	DBGC ( uri, "URI parsed \"%s\" to", uri_string );
 	uri_dump ( uri );
@@ -458,8 +467,8 @@
 	static const char prefixes[URI_FIELDS] = {
 		[URI_PASSWORD] = ':',
 		[URI_PORT] = ':',
-		[URI_QUERY] = '?',
-		[URI_FRAGMENT] = '#',
+		[URI_EQUERY] = '?',
+		[URI_EFRAGMENT] = '#',
 	};
 	char prefix;
 	size_t used = 0;
@@ -480,6 +489,10 @@
 		if ( ! uri_field ( uri, field ) )
 			continue;
 
+		/* Skip path field if encoded path is present */
+		if ( ( field == URI_PATH ) && uri->epath )
+			continue;
+
 		/* Prefix this field, if applicable */
 		prefix = prefixes[field];
 		if ( ( field == URI_HOST ) && ( uri->user != NULL ) )
@@ -676,6 +689,7 @@
 struct uri * resolve_uri ( const struct uri *base_uri,
 			   struct uri *relative_uri ) {
 	struct uri tmp_uri;
+	char *tmp_epath = NULL;
 	char *tmp_path = NULL;
 	struct uri *new_uri;
 
@@ -685,20 +699,27 @@
 
 	/* Mangle URI */
 	memcpy ( &tmp_uri, base_uri, sizeof ( tmp_uri ) );
-	if ( relative_uri->path ) {
-		tmp_path = resolve_path ( ( base_uri->path ?
-					    base_uri->path : "/" ),
-					  relative_uri->path );
+	if ( relative_uri->epath ) {
+		tmp_epath = resolve_path ( ( base_uri->epath ?
+					     base_uri->epath : "/" ),
+					   relative_uri->epath );
+		if ( ! tmp_epath )
+			goto err_epath;
+		tmp_path = strdup ( tmp_epath );
+		if ( ! tmp_path )
+			goto err_path;
+		uri_decode_inplace ( tmp_path );
+		tmp_uri.epath = tmp_epath;
 		tmp_uri.path = tmp_path;
-		tmp_uri.query = relative_uri->query;
-		tmp_uri.fragment = relative_uri->fragment;
+		tmp_uri.equery = relative_uri->equery;
+		tmp_uri.efragment = relative_uri->efragment;
 		tmp_uri.params = relative_uri->params;
-	} else if ( relative_uri->query ) {
-		tmp_uri.query = relative_uri->query;
-		tmp_uri.fragment = relative_uri->fragment;
+	} else if ( relative_uri->equery ) {
+		tmp_uri.equery = relative_uri->equery;
+		tmp_uri.efragment = relative_uri->efragment;
 		tmp_uri.params = relative_uri->params;
-	} else if ( relative_uri->fragment ) {
-		tmp_uri.fragment = relative_uri->fragment;
+	} else if ( relative_uri->efragment ) {
+		tmp_uri.efragment = relative_uri->efragment;
 		tmp_uri.params = relative_uri->params;
 	} else if ( relative_uri->params ) {
 		tmp_uri.params = relative_uri->params;
@@ -707,7 +728,14 @@
 	/* Create demangled URI */
 	new_uri = uri_dup ( &tmp_uri );
 	free ( tmp_path );
+	free ( tmp_epath );
 	return new_uri;
+
+	free ( tmp_path );
+ err_path:
+	free ( tmp_epath );
+ err_epath:
+	return NULL;
 }
 
 /**
@@ -746,6 +774,7 @@
 	if ( asprintf ( &path, "/%s", filename ) < 0 )
 		goto err_path;
 	tmp.path = path;
+	tmp.epath = path;
 
 	/* Demangle URI */
 	uri = uri_dup ( &tmp );
diff --git a/src/include/ipxe/uri.h b/src/include/ipxe/uri.h
index 3879a0e..e5b7c86 100644
--- a/src/include/ipxe/uri.h
+++ b/src/include/ipxe/uri.h
@@ -46,6 +46,20 @@
  *   scheme = "ftp", user = "joe", password = "secret",
  *   host = "insecure.org", port = "8081", path = "/hidden/path/to",
  *   query = "what=is", fragment = "this"
+ *
+ * The URI syntax includes a percent-encoding mechanism that can be
+ * used to represent characters that would otherwise not be possible,
+ * such as a '/' character within the password field.  These encodings
+ * are decoded during the URI parsing stage, thereby allowing protocol
+ * implementations to consume the raw field values directly without
+ * further decoding.
+ *
+ * Some protocols (such as HTTP) communicate using URI-encoded values.
+ * For these protocols, the original encoded substring must be
+ * retained verbatim since the choice of whether or not to encode a
+ * particular character may have significance to the receiving
+ * application.  We therefore retain the originally-encoded substrings
+ * for the path, query, and fragment fields.
  */
 struct uri {
 	/** Reference count */
@@ -62,12 +76,14 @@
 	const char *host;
 	/** Port number */
 	const char *port;
-	/** Path */
+	/** Path (after URI decoding) */
 	const char *path;
-	/** Query */
-	const char *query;
-	/** Fragment */
-	const char *fragment;
+	/** Path (with original URI encoding) */
+	const char *epath;
+	/** Query (with original URI encoding) */
+	const char *equery;
+	/** Fragment (with original URI encoding) */
+	const char *efragment;
 	/** Form parameters */
 	struct parameters *params;
 } __attribute__ (( packed ));
@@ -100,8 +116,9 @@
 	URI_HOST = URI_FIELD ( host ),
 	URI_PORT = URI_FIELD ( port ),
 	URI_PATH = URI_FIELD ( path ),
-	URI_QUERY = URI_FIELD ( query ),
-	URI_FRAGMENT = URI_FIELD ( fragment ),
+	URI_EPATH = URI_FIELD ( epath ),
+	URI_EQUERY = URI_FIELD ( equery ),
+	URI_EFRAGMENT = URI_FIELD ( efragment ),
 	URI_FIELDS
 };
 
diff --git a/src/net/tcp/httpcore.c b/src/net/tcp/httpcore.c
index 01bb496..fd94b5f 100644
--- a/src/net/tcp/httpcore.c
+++ b/src/net/tcp/httpcore.c
@@ -614,8 +614,8 @@
 
 	/* Calculate request URI length */
 	memset ( &request_uri, 0, sizeof ( request_uri ) );
-	request_uri.path = ( uri->path ? uri->path : "/" );
-	request_uri.query = uri->query;
+	request_uri.epath = ( uri->epath ? uri->epath : "/" );
+	request_uri.equery = uri->equery;
 	request_uri_len =
 		( format_uri ( &request_uri, NULL, 0 ) + 1 /* NUL */);
 
diff --git a/src/tests/uri_test.c b/src/tests/uri_test.c
index 92c2f90..929ab36 100644
--- a/src/tests/uri_test.c
+++ b/src/tests/uri_test.c
@@ -149,8 +149,10 @@
 	okx ( uristrcmp ( uri->host, expected->host ) == 0, file, line );
 	okx ( uristrcmp ( uri->port, expected->port ) == 0, file, line );
 	okx ( uristrcmp ( uri->path, expected->path ) == 0, file, line );
-	okx ( uristrcmp ( uri->query, expected->query ) == 0, file, line );
-	okx ( uristrcmp ( uri->fragment, expected->fragment ) == 0, file, line);
+	okx ( uristrcmp ( uri->epath, expected->epath ) == 0, file, line );
+	okx ( uristrcmp ( uri->equery, expected->equery ) == 0, file, line );
+	okx ( uristrcmp ( uri->efragment, expected->efragment ) == 0,
+	      file, line);
 	okx ( uri->params == expected->params, file, line );
 }
 #define uri_ok( uri, expected ) uri_okx ( uri, expected, __FILE__, __LINE__ )
@@ -490,25 +492,33 @@
 /** Basic HTTP URI */
 static struct uri_test uri_boot_ipxe_org = {
 	"http://boot.ipxe.org/demo/boot.php",
-	{ .scheme = "http", .host = "boot.ipxe.org", .path = "/demo/boot.php" }
+	{ .scheme = "http", .host = "boot.ipxe.org",
+	  .path = "/demo/boot.php", .epath = "/demo/boot.php" },
 };
 
 /** Basic opaque URI */
 static struct uri_test uri_mailto = {
 	"mailto:ipxe-devel@lists.ipxe.org",
-	{ .scheme = "mailto", .opaque = "ipxe-devel@lists.ipxe.org" }
+	{ .scheme = "mailto", .opaque = "ipxe-devel@lists.ipxe.org" },
+};
+
+/** Basic host-only URI */
+static struct uri_test uri_host = {
+	"http://boot.ipxe.org",
+	{ .scheme = "http", .host = "boot.ipxe.org" },
 };
 
 /** Basic path-only URI */
 static struct uri_test uri_path = {
 	"/var/lib/tftpboot/pxelinux.0",
-	{ .path = "/var/lib/tftpboot/pxelinux.0" },
+	{ .path = "/var/lib/tftpboot/pxelinux.0",
+	  .epath ="/var/lib/tftpboot/pxelinux.0" },
 };
 
 /** Path-only URI with escaped characters */
 static struct uri_test uri_path_escaped = {
 	"/hello%20world%3F",
-	{ .path = "/hello world?" },
+	{ .path = "/hello world?", .epath = "/hello%20world%3F" },
 };
 
 /** HTTP URI with all the trimmings */
@@ -521,8 +531,9 @@
 		.host = "example.com",
 		.port = "3001",
 		.path = "/~foo/cgi-bin/foo.pl",
-		.query = "a=b&c=d",
-		.fragment = "bit",
+		.epath = "/~foo/cgi-bin/foo.pl",
+		.equery = "a=b&c=d",
+		.efragment = "bit",
 	},
 };
 
@@ -533,8 +544,9 @@
 		.scheme = "https",
 		.host = "test.ipxe.org",
 		.path = "/wtf?\n",
-		.query = "kind#of/uri is",
-		.fragment = "this?",
+		.epath = "/wtf%3F%0A",
+		.equery = "kind%23of/uri%20is",
+		.efragment = "this%3F",
 	},
 };
 
@@ -550,8 +562,9 @@
 		.scheme = "https",
 		.host = "test.ipxe.org",
 		.path = "/wtf?\n",
-		.query = "kind#of/uri is",
-		.fragment = "this?",
+		.epath = "/wt%66%3f\n",
+		.equery = "kind%23of/uri is",
+		.efragment = "this?",
 	},
 };
 
@@ -562,6 +575,7 @@
 		.scheme = "http",
 		.host = "[2001:ba8:0:1d4::6950:5845]",
 		.path = "/",
+		.epath = "/",
 	},
 };
 
@@ -573,6 +587,7 @@
 		.host = "[2001:ba8:0:1d4::6950:5845]",
 		.port = "8001",
 		.path = "/boot",
+		.epath = "/boot",
 	},
 };
 
@@ -583,6 +598,7 @@
 		.scheme = "http",
 		.host = "[fe80::69ff:fe50:5845%net0]",
 		.path = "/ipxe",
+		.epath = "/ipxe",
 	},
 };
 
@@ -598,6 +614,7 @@
 		.scheme = "http",
 		.host = "[fe80::69ff:fe50:5845%net0]",
 		.path = "/ipxe",
+		.epath = "/ipxe",
 	},
 };
 
@@ -625,6 +642,7 @@
 	{
 		.scheme = "file",
 		.path = "/boot/script.ipxe",
+		.epath = "/boot/script.ipxe",
 	},
 };
 
@@ -635,6 +653,7 @@
 		.scheme = "file",
 		.host = "hpilo",
 		.path = "/boot/script.ipxe",
+		.epath = "/boot/script.ipxe",
 	},
 };
 
@@ -736,6 +755,7 @@
 		.scheme = "http",
 		.host = "not.a.tftp",
 		.path = "/uri",
+		.epath = "/uri",
 	},
 	"http://not.a.tftp/uri",
 };
@@ -754,6 +774,7 @@
 		.scheme = "tftp",
 		.host = "192.168.0.2",
 		.path = "//absolute/path",
+		.epath = "//absolute/path",
 	},
 	"tftp://192.168.0.2//absolute/path",
 };
@@ -772,6 +793,7 @@
 		.scheme = "tftp",
 		.host = "192.168.0.3",
 		.path = "/relative/path",
+		.epath = "/relative/path",
 	},
 	"tftp://192.168.0.3/relative/path",
 };
@@ -790,8 +812,9 @@
 		.scheme = "tftp",
 		.host = "10.0.0.6",
 		.path = "/C:\\tftpboot\\icky#path",
+		.epath = "/C:\\tftpboot\\icky#path",
 	},
-	"tftp://10.0.0.6/C%3A\\tftpboot\\icky%23path",
+	"tftp://10.0.0.6/C:\\tftpboot\\icky#path",
 };
 
 /** PXE URI with custom port */
@@ -810,6 +833,7 @@
 		.host = "192.168.0.1",
 		.port = "4069",
 		.path = "//another/path",
+		.epath = "//another/path",
 	},
 	"tftp://192.168.0.1:4069//another/path",
 };
@@ -873,6 +897,7 @@
 		.scheme = "http",
 		.host = "boot.ipxe.org",
 		.path = "/demo/boot.php",
+		.epath = "/demo/boot.php",
 	},
 	NULL,
 	uri_params_list,
@@ -902,6 +927,7 @@
 		.host = "192.168.100.4",
 		.port = "3001",
 		.path = "/register",
+		.epath = "/register",
 	},
 	"foo",
 	uri_named_params_list,
@@ -917,6 +943,7 @@
 	uri_parse_format_dup_ok ( &uri_empty );
 	uri_parse_format_dup_ok ( &uri_boot_ipxe_org );
 	uri_parse_format_dup_ok ( &uri_mailto );
+	uri_parse_format_dup_ok ( &uri_host );
 	uri_parse_format_dup_ok ( &uri_path );
 	uri_parse_format_dup_ok ( &uri_path_escaped );
 	uri_parse_format_dup_ok ( &uri_http_all );
diff --git a/src/usr/imgmgmt.c b/src/usr/imgmgmt.c
index f8d1491..b7fc829 100644
--- a/src/usr/imgmgmt.c
+++ b/src/usr/imgmgmt.c
@@ -58,8 +58,8 @@
 	memcpy ( &uri_redacted, uri, sizeof ( uri_redacted ) );
 	uri_redacted.user = NULL;
 	uri_redacted.password = NULL;
-	uri_redacted.query = NULL;
-	uri_redacted.fragment = NULL;
+	uri_redacted.equery = NULL;
+	uri_redacted.efragment = NULL;
 	uri_string_redacted = format_uri_alloc ( &uri_redacted );
 	if ( ! uri_string_redacted ) {
 		rc = -ENOMEM;