[xen] Support scatter-gather to allow for jumbo frames

The use of jumbo frames for the Xen netfront virtual NIC requires the
use of scatter-gather ("feature-sg"), with the receive descriptor ring
becoming a list of page-sized buffers and the backend using as many
page buffers as required for each packet.

Since iPXE's abstraction of an I/O buffer does not include any sort of
scatter-gather list, this requires an extra allocation and copy on the
receive datapath for any packet that spans more than a single page.

This support is required in order to successfully boot an AWS EC2
virtual machine (with non-enhanced networking) via iSCSI if jumbo
frames are enabled, since the netback driver used in EC2 seems not to
allow "feature-sg" to be renegotiated once the Linux kernel driver
takes over.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
diff --git a/src/drivers/net/netfront.c b/src/drivers/net/netfront.c
index be21085..1203e58 100644
--- a/src/drivers/net/netfront.c
+++ b/src/drivers/net/netfront.c
@@ -56,7 +56,7 @@
 	__einfo_uniqify ( EINFO_EIO, -NETIF_RSP_DROPPED,		\
 			  "Packet dropped" )
 #define EIO_NETIF_RSP( status )						\
-	EUNIQ ( EINFO_EIO, -(status),					\
+	EUNIQ ( EINFO_EIO, ( -(status) & 0x1f ),			\
 		EIO_NETIF_RSP_ERROR, EIO_NETIF_RSP_DROPPED )
 
 /******************************************************************************
@@ -326,6 +326,7 @@
 				  struct netfront_ring *ring ) {
 	struct xen_device *xendev = netfront->xendev;
 	struct xen_hypervisor *xen = xendev->xen;
+	physaddr_t addr;
 	unsigned int i;
 	int rc;
 
@@ -345,11 +346,11 @@
 	}
 
 	/* Grant access to shared ring */
+	addr = virt_to_phys ( ring->sring.raw );
 	if ( ( rc = xengrant_permit_access ( xen, ring->ref, xendev->backend_id,
-					     0, ring->sring.raw ) ) != 0 ) {
+					     0, addr ) ) != 0 ) {
 		DBGC ( netfront, "NETFRONT %s could not permit access to "
-		       "%#08lx: %s\n", xendev->key,
-		       virt_to_phys ( ring->sring.raw ), strerror ( rc ) );
+		       "%#08lx: %s\n", xendev->key, addr, strerror ( rc ) );
 		goto err_permit_access;
 	}
 
@@ -358,10 +359,8 @@
 					 ring->ref ) ) != 0 )
 		goto err_write_num;
 
-	DBGC ( netfront, "NETFRONT %s %s=\"%d\" [%08lx,%08lx)\n",
-	       xendev->key, ring->ref_key, ring->ref,
-	       virt_to_phys ( ring->sring.raw ),
-	       ( virt_to_phys ( ring->sring.raw ) + PAGE_SIZE ) );
+	DBGC ( netfront, "NETFRONT %s %s=\"%d\" [%08lx,%08lx)\n", xendev->key,
+	       ring->ref_key, ring->ref, addr, ( addr + PAGE_SIZE ) );
 	return 0;
 
 	netfront_rm ( netfront, ring->ref_key );
@@ -378,7 +377,8 @@
  *
  * @v netfront		Netfront device
  * @v ring		Descriptor ring
- * @v iobuf		I/O buffer
+ * @v addr		Physical address
+ * @v iobuf		Associated I/O buffer, or NULL
  * @v id		Buffer ID to fill in
  * @v ref		Grant reference to fill in
  * @ret rc		Return status code
@@ -387,8 +387,9 @@
  * ring.
  */
 static int netfront_push ( struct netfront_nic *netfront,
-			   struct netfront_ring *ring, struct io_buffer *iobuf,
-			   uint16_t *id, grant_ref_t *ref ) {
+			   struct netfront_ring *ring, physaddr_t addr,
+			   struct io_buffer *iobuf, uint16_t *id,
+			   grant_ref_t *ref ) {
 	struct xen_device *xendev = netfront->xendev;
 	struct xen_hypervisor *xen = xendev->xen;
 	unsigned int next_id;
@@ -402,19 +403,15 @@
 	next_id = ring->ids[ ring->id_prod & ( ring->count - 1 ) ];
 	next_ref = ring->refs[next_id];
 
-	/* Grant access to I/O buffer page.  I/O buffers are naturally
-	 * aligned, so we never need to worry about crossing a page
-	 * boundary.
-	 */
+	/* Grant access to page containing address */
 	if ( ( rc = xengrant_permit_access ( xen, next_ref, xendev->backend_id,
-					     0, iobuf->data ) ) != 0 ) {
+					     0, addr ) ) != 0 ) {
 		DBGC ( netfront, "NETFRONT %s could not permit access to "
-		       "%#08lx: %s\n", xendev->key,
-		       virt_to_phys ( iobuf->data ), strerror ( rc ) );
+		       "%#08lx: %s\n", xendev->key, addr, strerror ( rc ) );
 		return rc;
 	}
 
-	/* Store I/O buffer */
+	/* Store associated I/O buffer, if any */
 	assert ( ring->iobufs[next_id] == NULL );
 	ring->iobufs[next_id] = iobuf;
 
@@ -434,7 +431,7 @@
  * @v netfront		Netfront device
  * @v ring		Descriptor ring
  * @v id		Buffer ID
- * @ret iobuf		I/O buffer
+ * @ret iobuf		Associated I/O buffer, if any
  */
 static struct io_buffer * netfront_pull ( struct netfront_nic *netfront,
 					  struct netfront_ring *ring,
@@ -451,7 +448,6 @@
 
 	/* Retrieve I/O buffer */
 	iobuf = ring->iobufs[id];
-	assert ( iobuf != NULL );
 	ring->iobufs[id] = NULL;
 
 	/* Free buffer ID */
@@ -494,6 +490,22 @@
 	ring->sring.raw = NULL;
 }
 
+/**
+ * Discard partially received I/O buffers
+ *
+ * @v netfront		Netfront device
+ */
+static void netfront_discard ( struct netfront_nic *netfront ) {
+	struct io_buffer *iobuf;
+	struct io_buffer *tmp;
+
+	/* Discard all buffers in the list */
+	list_for_each_entry_safe ( iobuf, tmp, &netfront->rx_partial, list ) {
+		list_del ( &iobuf->list );
+		free_iob ( iobuf );
+	}
+}
+
 /******************************************************************************
  *
  * Network device interface
@@ -512,6 +524,7 @@
 	struct io_buffer *iobuf;
 	struct netif_rx_request *request;
 	unsigned int refilled = 0;
+	physaddr_t addr;
 	int notify;
 	int rc;
 
@@ -524,24 +537,24 @@
 			/* Wait for next refill */
 			break;
 		}
+		addr = virt_to_phys ( iobuf->data );
 
 		/* Add to descriptor ring */
 		request = RING_GET_REQUEST ( &netfront->rx_fring,
 					     netfront->rx_fring.req_prod_pvt );
-		if ( ( rc = netfront_push ( netfront, &netfront->rx,
+		if ( ( rc = netfront_push ( netfront, &netfront->rx, addr,
 					    iobuf, &request->id,
 					    &request->gref ) ) != 0 ) {
 			netdev_rx_err ( netdev, iobuf, rc );
 			break;
 		}
 		DBGC2 ( netfront, "NETFRONT %s RX id %d ref %d is %#08lx+%zx\n",
-			xendev->key, request->id, request->gref,
-			virt_to_phys ( iobuf->data ), iob_tailroom ( iobuf ) );
+			xendev->key, request->id, request->gref, addr,
+			iob_tailroom ( iobuf ) );
 
 		/* Move to next descriptor */
 		netfront->rx_fring.req_prod_pvt++;
 		refilled++;
-
 	}
 
 	/* Push new descriptors and notify backend if applicable */
@@ -593,6 +606,10 @@
 	if ( ( rc = netfront_write_flag ( netfront, "request-rx-copy" ) ) != 0 )
 		goto err_request_rx_copy;
 
+	/* Inform backend that we can support scatter-gather */
+	if ( ( rc = netfront_write_flag ( netfront, "feature-sg" ) ) != 0 )
+		goto err_feature_sg;
+
 	/* Disable checksum offload, since we will always do the work anyway */
 	if ( ( rc = netfront_write_flag ( netfront,
 					  "feature-no-csum-offload" ) ) != 0 )
@@ -632,6 +649,8 @@
  err_feature_rx_notify:
 	netfront_rm ( netfront, "feature-no-csum-offload" );
  err_feature_no_csum_offload:
+	netfront_rm ( netfront, "feature-sg" );
+ err_feature_sg:
 	netfront_rm ( netfront, "request-rx-copy" );
  err_request_rx_copy:
 	netfront_destroy_event ( netfront );
@@ -675,11 +694,15 @@
 	/* Delete flags */
 	netfront_rm ( netfront, "feature-rx-notify" );
 	netfront_rm ( netfront, "feature-no-csum-offload" );
+	netfront_rm ( netfront, "feature-sg" );
 	netfront_rm ( netfront, "request-rx-copy" );
 
 	/* Destroy event channel */
 	netfront_destroy_event ( netfront );
 
+	/* Discard any partially received I/O buffers */
+	netfront_discard ( netfront );
+
 	/* Destroy receive descriptor ring, freeing any outstanding
 	 * I/O buffers.
 	 */
@@ -703,34 +726,66 @@
 	struct netfront_nic *netfront = netdev->priv;
 	struct xen_device *xendev = netfront->xendev;
 	struct netif_tx_request *request;
+	physaddr_t addr;
+	size_t len;
+	size_t remaining;
+	size_t frag_len;
+	unsigned int offset;
+	unsigned int count;
+	unsigned int more;
 	int notify;
 	int rc;
 
+	/* Calculate number of page buffers required */
+	addr = virt_to_phys ( iobuf->data );
+	len = iob_len ( iobuf );
+	offset = ( addr & ( PAGE_SIZE - 1 ) );
+	count = ( ( offset + len + PAGE_SIZE - 1 ) / PAGE_SIZE );
+
 	/* Check that we have space in the ring */
-	if ( netfront_ring_is_full ( &netfront->tx ) ) {
+	if ( netfront_ring_space ( &netfront->tx ) < count ) {
 		DBGC ( netfront, "NETFRONT %s out of transmit descriptors\n",
 		       xendev->key );
 		return -ENOBUFS;
 	}
 
 	/* Add to descriptor ring */
-	request = RING_GET_REQUEST ( &netfront->tx_fring,
-				     netfront->tx_fring.req_prod_pvt );
-	if ( ( rc = netfront_push ( netfront, &netfront->tx, iobuf,
-				    &request->id, &request->gref ) ) != 0 ) {
-		return rc;
+	remaining = len;
+	while ( remaining ) {
+
+		/* Calculate length of this fragment */
+		frag_len = ( PAGE_SIZE - offset );
+		if ( frag_len >= remaining ) {
+			frag_len = remaining;
+			more = 0;
+		} else {
+			more = NETTXF_more_data;
+		}
+
+		/* Populate request */
+		request = RING_GET_REQUEST ( &netfront->tx_fring,
+					     netfront->tx_fring.req_prod_pvt );
+		if ( ( rc = netfront_push ( netfront, &netfront->tx, addr,
+					    ( more ? NULL : iobuf ),
+					    &request->id,
+					    &request->gref ) ) != 0 ) {
+			return rc;
+		}
+		request->flags = ( NETTXF_data_validated | more );
+		request->offset = offset;
+		request->size = ( ( remaining == len ) ? len : frag_len );
+		DBGC2 ( netfront, "NETFRONT %s TX id %d ref %d is "
+			"%#08lx+%zx%s\n", xendev->key, request->id,
+			request->gref, addr, frag_len, ( more ? "..." : "" ) );
+
+		/* Move to next descriptor */
+		netfront->tx_fring.req_prod_pvt++;
+		addr += frag_len;
+		remaining -= frag_len;
+		offset = 0;
 	}
-	request->offset = ( virt_to_phys ( iobuf->data ) & ( PAGE_SIZE - 1 ) );
-	request->flags = NETTXF_data_validated;
-	request->size = iob_len ( iobuf );
-	DBGC2 ( netfront, "NETFRONT %s TX id %d ref %d is %#08lx+%zx\n",
-		xendev->key, request->id, request->gref,
-		virt_to_phys ( iobuf->data ), iob_len ( iobuf ) );
 
-	/* Consume descriptor */
-	netfront->tx_fring.req_prod_pvt++;
-
-	/* Push new descriptor and notify backend if applicable */
+	/* Push new descriptors and notify backend if applicable */
 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY ( &netfront->tx_fring, notify );
 	if ( notify )
 		netfront_send_event ( netfront );
@@ -748,7 +803,7 @@
 	struct xen_device *xendev = netfront->xendev;
 	struct netif_tx_response *response;
 	struct io_buffer *iobuf;
-	unsigned int status;
+	int status;
 	int rc;
 
 	/* Consume any unconsumed responses */
@@ -761,10 +816,11 @@
 		/* Retrieve from descriptor ring */
 		iobuf = netfront_pull ( netfront, &netfront->tx, response->id );
 		status = response->status;
-		if ( status == NETIF_RSP_OKAY ) {
+		if ( status >= NETIF_RSP_OKAY ) {
 			DBGC2 ( netfront, "NETFRONT %s TX id %d complete\n",
 				xendev->key, response->id );
-			netdev_tx_complete ( netdev, iobuf );
+			if ( iobuf )
+				netdev_tx_complete ( netdev, iobuf );
 		} else {
 			rc = -EIO_NETIF_RSP ( status );
 			DBGC2 ( netfront, "NETFRONT %s TX id %d error %d: %s\n",
@@ -786,6 +842,7 @@
 	struct netif_rx_response *response;
 	struct io_buffer *iobuf;
 	int status;
+	int more;
 	size_t len;
 	int rc;
 
@@ -799,21 +856,45 @@
 		/* Retrieve from descriptor ring */
 		iobuf = netfront_pull ( netfront, &netfront->rx, response->id );
 		status = response->status;
-		if ( status >= 0 ) {
-			len = status;
-			iob_reserve ( iobuf, response->offset );
-			iob_put ( iobuf, len );
-			DBGC2 ( netfront, "NETFRONT %s RX id %d complete "
-				"%#08lx+%zx\n", xendev->key, response->id,
-				virt_to_phys ( iobuf->data ), len );
-			netdev_rx ( netdev, iobuf );
-		} else {
+		more = ( response->flags & NETRXF_more_data );
+
+		/* Report errors */
+		if ( status < 0 ) {
 			rc = -EIO_NETIF_RSP ( status );
 			DBGC2 ( netfront, "NETFRONT %s RX id %d error %d: %s\n",
 				xendev->key, response->id, status,
 				strerror ( rc ) );
+			netfront_discard ( netfront );
 			netdev_rx_err ( netdev, iobuf, rc );
+			continue;
 		}
+
+		/* Add to partial receive list */
+		len = status;
+		iob_reserve ( iobuf, response->offset );
+		iob_put ( iobuf, len );
+		DBGC2 ( netfront, "NETFRONT %s RX id %d complete "
+			"%#08lx+%zx%s\n", xendev->key, response->id,
+			virt_to_phys ( iobuf->data ), len,
+			( more ? "..." : "" ) );
+		list_add_tail ( &iobuf->list, &netfront->rx_partial );
+
+		/* Wait until complete packet has been received */
+		if ( more )
+			continue;
+
+		/* Reassemble complete packet */
+		iobuf = iob_concatenate ( &netfront->rx_partial );
+		if ( ! iobuf ) {
+			DBGC2 ( netfront, "NETFRONT %s RX reassembly failed\n",
+				xendev->key );
+			netfront_discard ( netfront );
+			netdev_rx_err ( netdev, NULL, -ENOMEM );
+			continue;
+		}
+
+		/* Hand off to network stack */
+		netdev_rx ( netdev, iobuf );
 	}
 }
 
@@ -871,6 +952,7 @@
 	netdev->dev = &xendev->dev;
 	netfront = netdev->priv;
 	netfront->xendev = xendev;
+	INIT_LIST_HEAD ( &netfront->rx_partial );
 	DBGC ( netfront, "NETFRONT %s backend=\"%s\" in domain %ld\n",
 	       xendev->key, xendev->backend, xendev->backend_id );
 
diff --git a/src/drivers/net/netfront.h b/src/drivers/net/netfront.h
index c95ed26..dca3ff1 100644
--- a/src/drivers/net/netfront.h
+++ b/src/drivers/net/netfront.h
@@ -65,7 +65,7 @@
 	size_t count;
 	/** I/O buffers, indexed by buffer ID */
 	struct io_buffer **iobufs;
-	/** I/O buffer grant references, indexed by buffer ID */
+	/** Grant references, indexed by buffer ID */
 	grant_ref_t *refs;
 
 	/** Buffer ID ring */
@@ -117,6 +117,18 @@
 }
 
 /**
+ * Calculate descriptor ring remaining space
+ *
+ * @v ring		Descriptor ring
+ * @ret space		Number of unused entries
+ */
+static inline __attribute__ (( always_inline )) unsigned int
+netfront_ring_space ( struct netfront_ring *ring ) {
+
+	return ( ring->count - netfront_ring_fill ( ring ) );
+}
+
+/**
  * Check whether or not descriptor ring is full
  *
  * @v ring		Descriptor ring
@@ -164,6 +176,8 @@
 	struct io_buffer *rx_iobufs[NETFRONT_NUM_RX_DESC];
 	/** Receive I/O buffer IDs */
 	uint8_t rx_ids[NETFRONT_NUM_RX_DESC];
+	/** Partial receive I/O buffer list */
+	struct list_head rx_partial;
 
 	/** Event channel */
 	struct evtchn_send event;
diff --git a/src/include/ipxe/xengrant.h b/src/include/ipxe/xengrant.h
index 451a3ce..fcb7a71 100644
--- a/src/include/ipxe/xengrant.h
+++ b/src/include/ipxe/xengrant.h
@@ -166,16 +166,17 @@
  * @v ref		Grant reference
  * @v domid		Domain ID
  * @v subflags		Additional flags
- * @v page		Page start
+ * @v addr		Physical address within page
  * @ret rc		Return status code
  */
 static inline __attribute__ (( always_inline )) int
 xengrant_permit_access ( struct xen_hypervisor *xen, grant_ref_t ref,
-			 domid_t domid, unsigned int subflags, void *page ) {
+			 domid_t domid, unsigned int subflags,
+			 physaddr_t addr ) {
 	struct grant_entry_header *hdr = xengrant_header ( xen, ref );
 	struct grant_entry_v1 *v1 = xengrant_v1 ( hdr );
 	union grant_entry_v2 *v2 = xengrant_v2 ( hdr );
-	unsigned long frame = ( virt_to_phys ( page ) / PAGE_SIZE );
+	unsigned long frame = ( addr / PAGE_SIZE );
 
 	/* Fail (for test purposes) if applicable */
 	if ( ( XENGRANT_FAIL_RATE > 0 ) &&