[xhci] Allow for permanent failure of the command mechanism

Some xHCI controllers (observed with the Thunderbolt ports on a
ThinkPad X1 Extreme Gen3 and a ThinkPad P53) seem to suffer a
catastrophic failure at the point that ExitBootServices() is called if
the IOMMU is enabled.  The symptoms appear to be consistent with
another UEFI driver (e.g. the IOMMU driver, or the Thunderbolt driver)
having torn down the DMA mappings, leaving the xHCI controller unable
to write to host memory.  The observable effect is that all commands
fail with a timeout, and attempts to abort command execution similarly
fail since the xHCI controller is unable to report the abort
completion.

Check for failure to abort a command, and respond by performing a full
device reset (as recommended by the xHCI specification) and by marking
the device as permanently failed.

Reported-by: Andreas Hammarskjöld <junior@2PintSoftware.com>
Signed-off-by: Michael Brown <mcb30@ipxe.org>
diff --git a/src/drivers/usb/xhci.c b/src/drivers/usb/xhci.c
index cc48af0..3d98b1e 100644
--- a/src/drivers/usb/xhci.c
+++ b/src/drivers/usb/xhci.c
@@ -1165,6 +1165,31 @@
 	return -ETIMEDOUT;
 }
 
+/**
+ * Mark xHCI device as permanently failed and reset the device
+ *
+ * @v xhci		xHCI device
+ * @ret rc		Return status code (non-zero if the reset fails)
+ */
+static int xhci_fail ( struct xhci_device *xhci ) {
+	size_t len;
+	int rc;
+
+	/* Mark as failed first: flag remains set even if reset fails */
+	xhci->failed = 1;
+
+	/* Reset device; skip the DCBAA cleanup below if this fails */
+	if ( ( rc = xhci_reset ( xhci ) ) != 0 )
+		return rc;
+
+	/* Zero all (slots + 1) DCBAA entries since DCBAAP has been cleared */
+	assert ( xhci->dcbaa.context != NULL );
+	len = ( ( xhci->slots + 1 ) * sizeof ( xhci->dcbaa.context[0] ) );
+	memset ( xhci->dcbaa.context, 0, len );
+
+	return 0;
+}
+
 /******************************************************************************
  *
  * Transfer request blocks
@@ -1720,6 +1745,10 @@
 	unsigned int consumed;
 	unsigned int type;
 
+	/* Do nothing if device has permanently failed */
+	if ( xhci->failed )
+		return;
+
 	/* Poll for events */
 	profile_start ( &xhci_event_profiler );
 	for ( consumed = 0 ; ; consumed++ ) {
@@ -1778,6 +1807,7 @@
  */
 static void xhci_abort ( struct xhci_device *xhci ) {
 	physaddr_t crp;
+	uint32_t crcr;
 
 	/* Abort the command */
 	DBGC2 ( xhci, "XHCI %s aborting command\n", xhci->name );
@@ -1786,8 +1816,18 @@
 	/* Allow time for command to abort */
 	mdelay ( XHCI_COMMAND_ABORT_DELAY_MS );
 
-	/* Sanity check */
-	assert ( ( readl ( xhci->op + XHCI_OP_CRCR ) & XHCI_CRCR_CRR ) == 0 );
+	/* Check for failure to abort */
+	crcr = readl ( xhci->op + XHCI_OP_CRCR );
+	if ( crcr & XHCI_CRCR_CRR ) {
+
+		/* Device has failed to abort a command and is almost
+		 * certainly beyond repair.  Reset device, abandoning
+		 * all state, and mark device as failed to avoid
+		 * delays on any future command attempts.
+		 */
+		DBGC ( xhci, "XHCI %s failed to abort command\n", xhci->name );
+		xhci_fail ( xhci );
+	}
 
 	/* Consume (and ignore) any final command status */
 	xhci_event_poll ( xhci );
@@ -1813,6 +1853,12 @@
 	unsigned int i;
 	int rc;
 
+	/* Immediately fail all commands if command mechanism has failed */
+	if ( xhci->failed ) {
+		rc = -EPIPE;
+		goto err_failed;
+	}
+
 	/* Sanity check */
 	if ( xhci->pending ) {
 		DBGC ( xhci, "XHCI %s command ring busy\n", xhci->name );
@@ -1863,6 +1909,7 @@
  err_enqueue:
 	xhci->pending = NULL;
  err_pending:
+ err_failed:
 	return rc;
 }
 
diff --git a/src/drivers/usb/xhci.h b/src/drivers/usb/xhci.h
index 6e02d70..a3c8888 100644
--- a/src/drivers/usb/xhci.h
+++ b/src/drivers/usb/xhci.h
@@ -1115,6 +1115,8 @@
 	struct xhci_event_ring event;
 	/** Current command (if any) */
 	union xhci_trb *pending;
+	/** Command mechanism has permanently failed */
+	int failed;
 
 	/** Device slots, indexed by slot ID */
 	struct xhci_slot **slot;