summaryrefslogtreecommitdiffstats
path: root/drivers/scsi/libsas/sas_scsi_host.c
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2011-11-28 11:29:20 -0800
committerJames Bottomley <JBottomley@Parallels.com>2012-02-19 14:06:08 -0600
commit9095a64a9aead653df320e3a6fc70835c15d46e4 (patch)
tree63ad804f78668bf28d90ea2f8c9640ce92dacdea /drivers/scsi/libsas/sas_scsi_host.c
parenta3a142524aa4b1539a64a55087bf12ffa4b1f94e (diff)
downloadtalos-op-linux-9095a64a9aead653df320e3a6fc70835c15d46e4.tar.gz
talos-op-linux-9095a64a9aead653df320e3a6fc70835c15d46e4.zip
[SCSI] libsas: fix timeout vs completion race
Until we have told the lldd to forget a task a timed out operation can return from the hardware at any time. Since completion frees the task we need to make sure that no tasks run their normal completion handler once eh has decided to manage the task. Similar to ata_scsi_cmd_error_handler() freeze completions to let eh judge the outcome of the race. Task collector mode is problematic because it presents a situation where a task can be timed out and aborted before the lldd has even seen it. For this case we need to guarantee that a task that an lldd has been told to forget does not get queued after the lldd says "never seen it". With sas_scsi_timed_out we achieve this with the ->task_queue_flush mutex, rather than adding more time. Signed-off-by: Dan Williams <dan.j.williams@intel.com> Signed-off-by: James Bottomley <JBottomley@Parallels.com>
Diffstat (limited to 'drivers/scsi/libsas/sas_scsi_host.c')
-rw-r--r--drivers/scsi/libsas/sas_scsi_host.c104
1 files changed, 51 insertions, 53 deletions
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 50db8f971a06..0e3fdba7b510 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -119,9 +119,19 @@ static void sas_end_task(struct scsi_cmnd *sc, struct sas_task *task)
static void sas_scsi_task_done(struct sas_task *task)
{
struct scsi_cmnd *sc = task->uldd_task;
+ struct domain_device *dev = task->dev;
+ struct sas_ha_struct *ha = dev->port->ha;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->done_lock, flags);
+ if (test_bit(SAS_HA_FROZEN, &ha->state))
+ task = NULL;
+ else
+ ASSIGN_SAS_TASK(sc, NULL);
+ spin_unlock_irqrestore(&dev->done_lock, flags);
- if (unlikely(task->task_state_flags & SAS_TASK_STATE_ABORTED)) {
- /* Aborted tasks will be completed by the error handler */
+ if (unlikely(!task)) {
+ /* task will be completed by the error handler */
SAS_DPRINTK("task done but aborted\n");
return;
}
@@ -133,7 +143,6 @@ static void sas_scsi_task_done(struct sas_task *task)
return;
}
- ASSIGN_SAS_TASK(sc, NULL);
sas_end_task(sc, task);
sc->scsi_done(sc);
}
@@ -298,6 +307,7 @@ enum task_disposition {
TASK_IS_DONE,
TASK_IS_ABORTED,
TASK_IS_AT_LU,
+ TASK_IS_NOT_AT_HA,
TASK_IS_NOT_AT_LU,
TASK_ABORT_FAILED,
};
@@ -314,19 +324,18 @@ static enum task_disposition sas_scsi_find_task(struct sas_task *task)
struct scsi_core *core = &ha->core;
struct sas_task *t, *n;
+ mutex_lock(&core->task_queue_flush);
spin_lock_irqsave(&core->task_queue_lock, flags);
- list_for_each_entry_safe(t, n, &core->task_queue, list) {
+ list_for_each_entry_safe(t, n, &core->task_queue, list)
if (task == t) {
list_del_init(&t->list);
- spin_unlock_irqrestore(&core->task_queue_lock,
- flags);
- SAS_DPRINTK("%s: task 0x%p aborted from "
- "task_queue\n",
- __func__, task);
- return TASK_IS_ABORTED;
+ break;
}
- }
spin_unlock_irqrestore(&core->task_queue_lock, flags);
+ mutex_unlock(&core->task_queue_flush);
+
+ if (task == t)
+ return TASK_IS_NOT_AT_HA;
}
for (i = 0; i < 5; i++) {
@@ -499,8 +508,7 @@ try_bus_reset:
}
static int sas_eh_handle_sas_errors(struct Scsi_Host *shost,
- struct list_head *work_q,
- struct list_head *done_q)
+ struct list_head *work_q)
{
struct scsi_cmnd *cmd, *n;
enum task_disposition res = TASK_IS_DONE;
@@ -511,7 +519,16 @@ static int sas_eh_handle_sas_errors(struct Scsi_Host *shost,
Again:
list_for_each_entry_safe(cmd, n, work_q, eh_entry) {
- struct sas_task *task = TO_SAS_TASK(cmd);
+ struct domain_device *dev = cmd_to_domain_dev(cmd);
+ struct sas_task *task;
+
+ spin_lock_irqsave(&dev->done_lock, flags);
+ /* by this point the lldd has either observed
+ * SAS_HA_FROZEN and is leaving the task alone, or has
+ * won the race with eh and decided to complete it
+ */
+ task = TO_SAS_TASK(cmd);
+ spin_unlock_irqrestore(&dev->done_lock, flags);
if (!task)
continue;
@@ -534,6 +551,14 @@ Again:
cmd->eh_eflags = 0;
switch (res) {
+ case TASK_IS_NOT_AT_HA:
+ SAS_DPRINTK("%s: task 0x%p is not at ha: %s\n",
+ __func__, task,
+ cmd->retries ? "retry" : "aborted");
+ if (cmd->retries)
+ cmd->retries--;
+ sas_eh_finish_cmd(cmd);
+ continue;
case TASK_IS_DONE:
SAS_DPRINTK("%s: task 0x%p is done\n", __func__,
task);
@@ -635,7 +660,8 @@ void sas_scsi_recover_host(struct Scsi_Host *shost)
* Deal with commands that still have SAS tasks (i.e. they didn't
* complete via the normal sas_task completion mechanism)
*/
- if (sas_eh_handle_sas_errors(shost, &eh_work_q, &ha->eh_done_q))
+ set_bit(SAS_HA_FROZEN, &ha->state);
+ if (sas_eh_handle_sas_errors(shost, &eh_work_q))
goto out;
/*
@@ -649,6 +675,10 @@ void sas_scsi_recover_host(struct Scsi_Host *shost)
scsi_eh_ready_devs(shost, &eh_work_q, &ha->eh_done_q);
out:
+ clear_bit(SAS_HA_FROZEN, &ha->state);
+ if (ha->lldd_max_execute_num > 1)
+ wake_up_process(ha->core.queue_thread);
+
/* now link into libata eh --- if we have any ata devices */
sas_ata_strategy_handler(shost);
@@ -660,43 +690,7 @@ out:
enum blk_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *cmd)
{
- struct sas_task *task = TO_SAS_TASK(cmd);
- unsigned long flags;
- enum blk_eh_timer_return rtn;
-
- if (sas_ata_timed_out(cmd, task, &rtn))
- return rtn;
-
- if (!task) {
- cmd->request->timeout /= 2;
- SAS_DPRINTK("command 0x%p, task 0x%p, gone: %s\n",
- cmd, task, (cmd->request->timeout ?
- "BLK_EH_RESET_TIMER" : "BLK_EH_NOT_HANDLED"));
- if (!cmd->request->timeout)
- return BLK_EH_NOT_HANDLED;
- return BLK_EH_RESET_TIMER;
- }
-
- spin_lock_irqsave(&task->task_state_lock, flags);
- BUG_ON(task->task_state_flags & SAS_TASK_STATE_ABORTED);
- if (task->task_state_flags & SAS_TASK_STATE_DONE) {
- spin_unlock_irqrestore(&task->task_state_lock, flags);
- SAS_DPRINTK("command 0x%p, task 0x%p, timed out: "
- "BLK_EH_HANDLED\n", cmd, task);
- return BLK_EH_HANDLED;
- }
- if (!(task->task_state_flags & SAS_TASK_AT_INITIATOR)) {
- spin_unlock_irqrestore(&task->task_state_lock, flags);
- SAS_DPRINTK("command 0x%p, task 0x%p, not at initiator: "
- "BLK_EH_RESET_TIMER\n",
- cmd, task);
- return BLK_EH_RESET_TIMER;
- }
- task->task_state_flags |= SAS_TASK_STATE_ABORTED;
- spin_unlock_irqrestore(&task->task_state_lock, flags);
-
- SAS_DPRINTK("command 0x%p, task 0x%p, timed out: BLK_EH_NOT_HANDLED\n",
- cmd, task);
+ scmd_printk(KERN_DEBUG, cmd, "command %p timed out\n", cmd);
return BLK_EH_NOT_HANDLED;
}
@@ -861,9 +855,11 @@ static void sas_queue(struct sas_ha_struct *sas_ha)
int res;
struct sas_internal *i = to_sas_internal(core->shost->transportt);
+ mutex_lock(&core->task_queue_flush);
spin_lock_irqsave(&core->task_queue_lock, flags);
while (!kthread_should_stop() &&
- !list_empty(&core->task_queue)) {
+ !list_empty(&core->task_queue) &&
+ !test_bit(SAS_HA_FROZEN, &sas_ha->state)) {
can_queue = sas_ha->lldd_queue_size - core->task_queue_size;
if (can_queue >= 0) {
@@ -899,6 +895,7 @@ static void sas_queue(struct sas_ha_struct *sas_ha)
}
}
spin_unlock_irqrestore(&core->task_queue_lock, flags);
+ mutex_unlock(&core->task_queue_flush);
}
/**
@@ -925,6 +922,7 @@ int sas_init_queue(struct sas_ha_struct *sas_ha)
struct scsi_core *core = &sas_ha->core;
spin_lock_init(&core->task_queue_lock);
+ mutex_init(&core->task_queue_flush);
core->task_queue_size = 0;
INIT_LIST_HEAD(&core->task_queue);
OpenPOWER on IntegriCloud