4 files changed, 243 insertions, 16 deletions
diff --git a/drivers/usb/host/xhci-hcd.c b/drivers/usb/host/xhci-hcd.c
index 5839453d342b..0d5a8564ed17 100644
--- a/drivers/usb/host/xhci-hcd.c
+++ b/drivers/usb/host/xhci-hcd.c
@@ -246,8 +246,14 @@ static void xhci_work(struct xhci_hcd *xhci)
 	/* Flush posted writes */
 	xhci_readl(xhci, &xhci->ir_set->irq_pending);
 
-	/* FIXME this should be a delayed service routine that clears the EHB */
-	xhci_handle_event(xhci);
+	if (xhci->xhc_state & XHCI_STATE_DYING)
+		xhci_dbg(xhci, "xHCI dying, ignoring interrupt. "
+				"Shouldn't IRQs be disabled?\n");
+	else
+		/* FIXME this should be a delayed service routine
+		 * that clears the EHB.
+		 */
+		xhci_handle_event(xhci);
 
 	/* Clear the event handler busy flag (RW1C); the event ring should be empty. */
 	temp_64 = xhci_read_64(xhci, &xhci->ir_set->erst_dequeue);
@@ -320,7 +326,7 @@ void xhci_event_ring_work(unsigned long arg)
 	spin_lock_irqsave(&xhci->lock, flags);
 	temp = xhci_readl(xhci, &xhci->op_regs->status);
 	xhci_dbg(xhci, "op reg status = 0x%x\n", temp);
-	if (temp == 0xffffffff) {
+	if (temp == 0xffffffff || (xhci->xhc_state & XHCI_STATE_DYING)) {
 		xhci_dbg(xhci, "HW died, polling stopped.\n");
 		spin_unlock_irqrestore(&xhci->lock, flags);
 		return;
@@ -710,16 +716,22 @@ int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags)
 		 * atomic context to this function, which may allocate memory.
 		 */
 		spin_lock_irqsave(&xhci->lock, flags);
+		if (xhci->xhc_state & XHCI_STATE_DYING)
+			goto dying;
 		ret = xhci_queue_ctrl_tx(xhci, GFP_ATOMIC, urb,
 				slot_id, ep_index);
 		spin_unlock_irqrestore(&xhci->lock, flags);
 	} else if (usb_endpoint_xfer_bulk(&urb->ep->desc)) {
 		spin_lock_irqsave(&xhci->lock, flags);
+		if (xhci->xhc_state & XHCI_STATE_DYING)
+			goto dying;
 		ret = xhci_queue_bulk_tx(xhci, GFP_ATOMIC, urb,
 				slot_id, ep_index);
 		spin_unlock_irqrestore(&xhci->lock, flags);
 	} else if (usb_endpoint_xfer_int(&urb->ep->desc)) {
 		spin_lock_irqsave(&xhci->lock, flags);
+		if (xhci->xhc_state & XHCI_STATE_DYING)
+			goto dying;
 		ret = xhci_queue_intr_tx(xhci, GFP_ATOMIC, urb,
 				slot_id, ep_index);
 		spin_unlock_irqrestore(&xhci->lock, flags);
@@ -728,6 +740,12 @@ int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags)
 	}
 exit:
 	return ret;
+dying:
+	xhci_dbg(xhci, "Ep 0x%x: URB %p submitted for "
+			"non-responsive xHCI host.\n",
+			urb->ep->desc.bEndpointAddress, urb);
+	spin_unlock_irqrestore(&xhci->lock, flags);
+	return -ESHUTDOWN;
 }
 
 /*
@@ -789,6 +807,17 @@ int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 		kfree(td);
 		return ret;
 	}
+	if (xhci->xhc_state & XHCI_STATE_DYING) {
+		xhci_dbg(xhci, "Ep 0x%x: URB %p to be canceled on "
+				"non-responsive xHCI host.\n",
+				urb->ep->desc.bEndpointAddress, urb);
+		/* Let the stop endpoint command watchdog timer (which set this
+		 * state) finish cleaning up the endpoint TD lists.  We must
+		 * have caught it in the middle of dropping a lock and giving
+		 * back an URB.
+		 */
+		goto done;
+	}
 
 	xhci_dbg(xhci, "Cancel URB %p\n", urb);
 	xhci_dbg(xhci, "Event ring:\n");
@@ -806,6 +835,10 @@ int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 	 */
 	if (!(ep->ep_state & EP_HALT_PENDING)) {
 		ep->ep_state |= EP_HALT_PENDING;
+		ep->stop_cmds_pending++;
+		ep->stop_cmd_timer.expires = jiffies +
+			XHCI_STOP_EP_CMD_TIMEOUT * HZ;
+		add_timer(&ep->stop_cmd_timer);
 		xhci_queue_stop_endpoint(xhci, urb->dev->slot_id, ep_index);
 		xhci_ring_cmd_db(xhci);
 	}
@@ -1410,16 +1443,27 @@ void xhci_endpoint_reset(struct usb_hcd *hcd,
 void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
 {
 	struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+	struct xhci_virt_device *virt_dev;
 	unsigned long flags;
 	u32 state;
+	int i;
 
 	if (udev->slot_id == 0)
 		return;
+	virt_dev = xhci->devs[udev->slot_id];
+	if (!virt_dev)
+		return;
+
+	/* Stop any wayward timer functions (which may grab the lock) */
+	for (i = 0; i < 31; ++i) {
+		virt_dev->eps[i].ep_state &= ~EP_HALT_PENDING;
+		del_timer_sync(&virt_dev->eps[i].stop_cmd_timer);
+	}
 
 	spin_lock_irqsave(&xhci->lock, flags);
 	/* Don't disable the slot if the host controller is dead. */
 	state = xhci_readl(xhci, &xhci->op_regs->status);
-	if (state == 0xffffffff) {
+	if (state == 0xffffffff || (xhci->xhc_state & XHCI_STATE_DYING)) {
 		xhci_free_virt_device(xhci, udev->slot_id);
 		spin_unlock_irqrestore(&xhci->lock, flags);
 		return;
diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index b8fd270a8b0d..fd05247b7bb1 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -248,6 +248,15 @@ struct xhci_ep_ctx *xhci_get_ep_ctx(struct xhci_hcd *xhci,
 		(ctx->bytes + (ep_index * CTX_SIZE(xhci->hcc_params)));
 }
 
+static void xhci_init_endpoint_timer(struct xhci_hcd *xhci,
+		struct xhci_virt_ep *ep)
+{
+	init_timer(&ep->stop_cmd_timer);
+	ep->stop_cmd_timer.data = (unsigned long) ep;
+	ep->stop_cmd_timer.function = xhci_stop_endpoint_command_watchdog;
+	ep->xhci = xhci;
+}
+
 /* All the xhci_tds in the ring's TD list should be freed at this point */
 void xhci_free_virt_device(struct xhci_hcd *xhci, int slot_id)
 {
@@ -309,9 +318,11 @@ int xhci_alloc_virt_device(struct xhci_hcd *xhci, int slot_id,
 	xhci_dbg(xhci, "Slot %d input ctx = 0x%llx (dma)\n", slot_id,
 			(unsigned long long)dev->in_ctx->dma);
 
-	/* Initialize the cancellation list for each endpoint */
-	for (i = 0; i < 31; i++)
+	/* Initialize the cancellation list and watchdog timers for each ep */
+	for (i = 0; i < 31; i++) {
+		xhci_init_endpoint_timer(xhci, &dev->eps[i]);
 		INIT_LIST_HEAD(&dev->eps[i].cancelled_td_list);
+	}
 
 	/* Allocate endpoint 0 ring */
 	dev->eps[0].ring = xhci_ring_alloc(xhci, 1, true, flags);
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 184e8b6f30b2..9541e88df68f 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -475,6 +475,35 @@ void xhci_queue_new_dequeue_state(struct xhci_hcd *xhci,
 	ep->ep_state |= SET_DEQ_PENDING;
 }
 
+static inline void xhci_stop_watchdog_timer_in_irq(struct xhci_hcd *xhci,
+		struct xhci_virt_ep *ep)
+{
+	ep->ep_state &= ~EP_HALT_PENDING;
+	/* Can't del_timer_sync in interrupt, so we attempt to cancel.  If the
+	 * timer is running on another CPU, we don't decrement stop_cmds_pending
+	 * (since we didn't successfully stop the watchdog timer).
+	 */
+	if (del_timer(&ep->stop_cmd_timer))
+		ep->stop_cmds_pending--;
+}
+
+/* Must be called with xhci->lock held in interrupt context */
+static void xhci_giveback_urb_in_irq(struct xhci_hcd *xhci,
+		struct xhci_td *cur_td, int status, char *adjective)
+{
+	struct usb_hcd *hcd = xhci_to_hcd(xhci);
+
+	cur_td->urb->hcpriv = NULL;
+	usb_hcd_unlink_urb_from_ep(hcd, cur_td->urb);
+	xhci_dbg(xhci, "Giveback %s URB %p\n", adjective, cur_td->urb);
+
+	spin_unlock(&xhci->lock);
+	usb_hcd_giveback_urb(hcd, cur_td->urb, status);
+	kfree(cur_td);
+	spin_lock(&xhci->lock);
+	xhci_dbg(xhci, "%s URB given back\n", adjective);
+}
+
 /*
  * When we get a command completion for a Stop Endpoint Command, we need to
  * unlink any cancelled TDs from the ring.  There are two ways to do that:
@@ -508,7 +537,7 @@ static void handle_stopped_endpoint(struct xhci_hcd *xhci,
 	ep_ring = ep->ring;
 
 	if (list_empty(&ep->cancelled_td_list)) {
-		ep->ep_state &= ~EP_HALT_PENDING;
+		xhci_stop_watchdog_timer_in_irq(xhci, ep);
 		ring_ep_doorbell(xhci, slot_id, ep_index);
 		return;
 	}
@@ -540,7 +569,7 @@ static void handle_stopped_endpoint(struct xhci_hcd *xhci,
 		list_del(&cur_td->td_list);
 	}
 	last_unlinked_td = cur_td;
-	ep->ep_state &= ~EP_HALT_PENDING;
+	xhci_stop_watchdog_timer_in_irq(xhci, ep);
 
 	/* If necessary, queue a Set Transfer Ring Dequeue Pointer command */
 	if (deq_state.new_deq_ptr && deq_state.new_deq_seg) {
@@ -568,23 +597,136 @@ static void handle_stopped_endpoint(struct xhci_hcd *xhci,
 		hcd_stat_update(xhci->tp_stat, cur_td->urb->actual_length,
 				ktime_sub(stop_time, cur_td->start_time));
 #endif
-		cur_td->urb->hcpriv = NULL;
-		usb_hcd_unlink_urb_from_ep(xhci_to_hcd(xhci), cur_td->urb);
-
-		xhci_dbg(xhci, "Giveback cancelled URB %p\n", cur_td->urb);
-		spin_unlock(&xhci->lock);
 		/* Doesn't matter what we pass for status, since the core will
 		 * just overwrite it (because the URB has been unlinked).
 		 */
-		usb_hcd_giveback_urb(xhci_to_hcd(xhci), cur_td->urb, 0);
-		kfree(cur_td);
+		xhci_giveback_urb_in_irq(xhci, cur_td, 0, "cancelled");
 
-		spin_lock(&xhci->lock);
+		/* Stop processing the cancelled list if the watchdog timer is
+		 * running.
+		 */
+		if (xhci->xhc_state & XHCI_STATE_DYING)
+			return;
 	} while (cur_td != last_unlinked_td);
 
 	/* Return to the event handler with xhci->lock re-acquired */
 }
 
+/* Watchdog timer function for when a stop endpoint command fails to complete.
+ * In this case, we assume the host controller is broken or dying or dead.  The
+ * host may still be completing some other events, so we have to be careful to
+ * let the event ring handler and the URB dequeueing/enqueueing functions know
+ * through xhci->state.
+ *
+ * The timer may also fire if the host takes a very long time to respond to the
+ * command, and the stop endpoint command completion handler cannot delete the
+ * timer before the timer function is called.  Another endpoint cancellation may
+ * sneak in before the timer function can grab the lock, and that may queue
+ * another stop endpoint command and add the timer back.  So we cannot use a
+ * simple flag to say whether there is a pending stop endpoint command for a
+ * particular endpoint.
+ *
+ * Instead we use a combination of that flag and a counter for the number of
+ * pending stop endpoint commands.  If the timer is the tail end of the last
+ * stop endpoint command, and the endpoint's command is still pending, we assume
+ * the host is dying.
+ */
+void xhci_stop_endpoint_command_watchdog(unsigned long arg)
+{
+	struct xhci_hcd *xhci;
+	struct xhci_virt_ep *ep;
+	struct xhci_virt_ep *temp_ep;
+	struct xhci_ring *ring;
+	struct xhci_td *cur_td;
+	int ret, i, j;
+
+	ep = (struct xhci_virt_ep *) arg;
+	xhci = ep->xhci;
+
+	spin_lock(&xhci->lock);
+
+	ep->stop_cmds_pending--;
+	if (xhci->xhc_state & XHCI_STATE_DYING) {
+		xhci_dbg(xhci, "Stop EP timer ran, but another timer marked "
+				"xHCI as DYING, exiting.\n");
+		spin_unlock(&xhci->lock);
+		return;
+	}
+	if (!(ep->stop_cmds_pending == 0 && (ep->ep_state & EP_HALT_PENDING))) {
+		xhci_dbg(xhci, "Stop EP timer ran, but no command pending, "
+				"exiting.\n");
+		spin_unlock(&xhci->lock);
+		return;
+	}
+
+	xhci_warn(xhci, "xHCI host not responding to stop endpoint command.\n");
+	xhci_warn(xhci, "Assuming host is dying, halting host.\n");
+	/* Oops, HC is dead or dying or at least not responding to the stop
+	 * endpoint command.
+	 */
+	xhci->xhc_state |= XHCI_STATE_DYING;
+	/* Disable interrupts from the host controller and start halting it */
+	xhci_quiesce(xhci);
+	spin_unlock(&xhci->lock);
+
+	ret = xhci_halt(xhci);
+
+	spin_lock(&xhci->lock);
+	if (ret < 0) {
+		/* This is bad; the host is not responding to commands and it's
+		 * not allowing itself to be halted.  At least interrupts are
+		 * disabled, so we can set HC_STATE_HALT and notify the
+		 * USB core.  But if we call usb_hc_died(), it will attempt to
+		 * disconnect all device drivers under this host.  Those
+		 * disconnect() methods will wait for all URBs to be unlinked,
+		 * so we must complete them.
+		 */
+		xhci_warn(xhci, "Non-responsive xHCI host is not halting.\n");
+		xhci_warn(xhci, "Completing active URBs anyway.\n");
+		/* We could turn all TDs on the rings to no-ops.  This won't
+		 * help if the host has cached part of the ring, and is slow if
+		 * we want to preserve the cycle bit.  Skip it and hope the host
+		 * doesn't touch the memory.
+		 */
+	}
+	for (i = 0; i < MAX_HC_SLOTS; i++) {
+		if (!xhci->devs[i])
+			continue;
+		for (j = 0; j < 31; j++) {
+			temp_ep = &xhci->devs[i]->eps[j];
+			ring = temp_ep->ring;
+			if (!ring)
+				continue;
+			xhci_dbg(xhci, "Killing URBs for slot ID %u, "
+					"ep index %u\n", i, j);
+			while (!list_empty(&ring->td_list)) {
+				cur_td = list_first_entry(&ring->td_list,
+						struct xhci_td,
+						td_list);
+				list_del(&cur_td->td_list);
+				if (!list_empty(&cur_td->cancelled_td_list))
+					list_del(&cur_td->cancelled_td_list);
+				xhci_giveback_urb_in_irq(xhci, cur_td,
+						-ESHUTDOWN, "killed");
+			}
+			while (!list_empty(&temp_ep->cancelled_td_list)) {
+				cur_td = list_first_entry(
+						&temp_ep->cancelled_td_list,
+						struct xhci_td,
+						cancelled_td_list);
+				list_del(&cur_td->cancelled_td_list);
+				xhci_giveback_urb_in_irq(xhci, cur_td,
+						-ESHUTDOWN, "killed");
+			}
+		}
+	}
+	spin_unlock(&xhci->lock);
+	xhci_to_hcd(xhci)->state = HC_STATE_HALT;
+	xhci_dbg(xhci, "Calling usb_hc_died()\n");
+	usb_hc_died(xhci_to_hcd(xhci));
+	xhci_dbg(xhci, "xHCI host controller is dead.\n");
+}
+
 /*
  * When we get a completion for a Set Transfer Ring Dequeue Pointer command,
  * we need to clear the set deq pending flag in the endpoint ring state, so that
@@ -1333,6 +1475,14 @@ void xhci_handle_event(struct xhci_hcd *xhci)
 	default:
 		xhci->error_bitmask |= 1 << 3;
 	}
+	/* Any of the above functions may drop and re-acquire the lock, so check
+	 * to make sure a watchdog timer didn't mark the host as non-responsive.
+	 */
+	if (xhci->xhc_state & XHCI_STATE_DYING) {
+		xhci_dbg(xhci, "xHCI host dying, returning from "
+				"event handler.\n");
+		return;
+	}
 
 	if (update_ptrs) {
 		/* Update SW and HC event ring dequeue pointer */
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index af3c5638526c..c92f84154fb5 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -659,6 +659,10 @@ struct xhci_virt_ep {
 	/* The TRB that was last reported in a stopped endpoint ring */
 	union xhci_trb		*stopped_trb;
 	struct xhci_td		*stopped_td;
+	/* Watchdog timer for stop endpoint command to cancel URBs */
+	struct timer_list	stop_cmd_timer;
+	int			stop_cmds_pending;
+	struct xhci_hcd		*xhci;
 };
 
 struct xhci_virt_device {
@@ -1022,6 +1026,8 @@ struct xhci_scratchpad {
 #define	ERST_ENTRIES	1
 /* Poll every 60 seconds */
 #define	POLL_TIMEOUT	60
+/* Stop endpoint command timeout (secs) for URB cancellation watchdog timer */
+#define XHCI_STOP_EP_CMD_TIMEOUT	5
 /* XXX: Make these module parameters */
 
 
@@ -1083,6 +1089,21 @@ struct xhci_hcd {
 	struct timer_list	event_ring_timer;
 	int			zombie;
 #endif
+	/* Host controller watchdog timer structures */
+	unsigned int		xhc_state;
+/* Host controller is dying - not responding to commands. "I'm not dead yet!"
+ *
+ * xHC interrupts have been disabled and a watchdog timer will (or has already)
+ * halt the xHCI host, and complete all URBs with an -ESHUTDOWN code.  Any code
+ * that sees this status (other than the timer that set it) should stop touching
+ * hardware immediately.  Interrupt handlers should return immediately when
+ * they see this status (any time they drop and re-acquire xhci->lock).
+ * xhci_urb_dequeue() should call usb_hcd_check_unlink_urb() and return without
+ * putting the TD on the canceled list, etc.
+ *
+ * There are no reports of xHCI host controllers that display this issue.
+ */
+#define XHCI_STATE_DYING	(1 << 0)
 	/* Statistics */
 	int			noops_submitted;
 	int			noops_handled;
@@ -1279,6 +1300,7 @@ void xhci_cleanup_stalled_ring(struct xhci_hcd *xhci,
 void xhci_queue_config_ep_quirk(struct xhci_hcd *xhci,
 		unsigned int slot_id, unsigned int ep_index,
 		struct xhci_dequeue_state *deq_state);
+void xhci_stop_endpoint_command_watchdog(unsigned long arg);
 
 /* xHCI roothub code */
 int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex,