summaryrefslogtreecommitdiffstats
path: root/drivers/nvme/host/multipath.c
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@lst.de>2018-05-14 08:48:54 +0200
committerChristoph Hellwig <hch@lst.de>2018-07-27 19:12:08 +0200
commit0d0b660f214dc4905db7b6bc998bad0c16dfb1ba (patch)
tree4c3bca154a6e7a2ae00adcfd8d2a94bd6ce64e12 /drivers/nvme/host/multipath.c
parent8decf5d5b9f3f72b802a017b0b035f7db0592acf (diff)
downloadtalos-op-linux-0d0b660f214dc4905db7b6bc998bad0c16dfb1ba.tar.gz
talos-op-linux-0d0b660f214dc4905db7b6bc998bad0c16dfb1ba.zip
nvme: add ANA support
Add support for Asynchronous Namespace Access as specified in NVMe 1.3 TP 4004. With ANA each namespace attached to a controller belongs to an ANA group that describes the characteristics of accessing the namespaces through this controller. In the optimized and non-optimized states namespaces can be accessed regularly, although in a multi-pathing environment we should always prefer to access a namespace through a controller where an optimized relationship exists. Namespaces in Inaccessible, Permanent-Loss or Change state for a given controller should not be accessed. The states are updated through reading the ANA log page, which is read once during controller initialization, whenever the ANA change notice AEN is received, or when one of the ANA specific status codes that signal a state change is received on a command. The ANA state is kept in the nvme_ns structure, which makes the checks in the fast path very simple. Updating the ANA state when reading the log page is also very simple, the only downside is that finding the initial ANA state when scanning for namespaces is a bit cumbersome. The gendisk for a ns_head is only registered once a live path for it exists. Without that the kernel would hang during partition scanning. Includes fixes and improvements from Hannes Reinecke. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Keith Busch <keith.busch@intel.com> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Diffstat (limited to 'drivers/nvme/host/multipath.c')
-rw-r--r--drivers/nvme/host/multipath.c342
1 files changed, 327 insertions, 15 deletions
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 348aa405b641..c643872f8dac 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 Christoph Hellwig.
+ * Copyright (c) 2017-2018 Christoph Hellwig.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -20,6 +20,11 @@ module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
+inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+ return multipath && (ctrl->subsys->cmic & (1 << 3));
+}
+
/*
* If multipathing is enabled we need to always use the subsystem instance
* number for numbering our devices to avoid conflicts between subsystems that
@@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
void nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
+ u16 status = nvme_req(req)->status;
unsigned long flags;
spin_lock_irqsave(&ns->head->requeue_lock, flags);
@@ -52,7 +58,34 @@ void nvme_failover_req(struct request *req)
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
blk_mq_end_request(req, 0);
- nvme_reset_ctrl(ns->ctrl);
+ switch (status & 0x7ff) {
+ case NVME_SC_ANA_TRANSITION:
+ case NVME_SC_ANA_INACCESSIBLE:
+ case NVME_SC_ANA_PERSISTENT_LOSS:
+ /*
+ * If we got back an ANA error we know the controller is alive,
+ * but not ready to serve this namespaces. The spec suggests
+ * we should update our general state here, but due to the fact
+ * that the admin and I/O queues are not serialized that is
+ * fundamentally racy. So instead just clear the current path,
+ * mark the the path as pending and kick of a re-read of the ANA
+ * log page ASAP.
+ */
+ nvme_mpath_clear_current_path(ns);
+ if (ns->ctrl->ana_log_buf) {
+ set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+ queue_work(nvme_wq, &ns->ctrl->ana_work);
+ }
+ break;
+ default:
+ /*
+ * Reset the controller for any non-ANA error as we don't know
+ * what caused the error.
+ */
+ nvme_reset_ctrl(ns->ctrl);
+ break;
+ }
+
kblockd_schedule_work(&ns->head->requeue_work);
}
@@ -68,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
up_read(&ctrl->namespaces_rwsem);
}
+static const char *nvme_ana_state_names[] = {
+ [0] = "invalid state",
+ [NVME_ANA_OPTIMIZED] = "optimized",
+ [NVME_ANA_NONOPTIMIZED] = "non-optimized",
+ [NVME_ANA_INACCESSIBLE] = "inaccessible",
+ [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss",
+ [NVME_ANA_CHANGE] = "change",
+};
+
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
- struct nvme_ns *ns;
+ struct nvme_ns *ns, *fallback = NULL;
list_for_each_entry_rcu(ns, &head->list, siblings) {
- if (ns->ctrl->state == NVME_CTRL_LIVE) {
+ if (ns->ctrl->state != NVME_CTRL_LIVE ||
+ test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+ continue;
+ switch (ns->ana_state) {
+ case NVME_ANA_OPTIMIZED:
rcu_assign_pointer(head->current_path, ns);
return ns;
+ case NVME_ANA_NONOPTIMIZED:
+ fallback = ns;
+ break;
+ default:
+ break;
}
}
- return NULL;
+ if (fallback)
+ rcu_assign_pointer(head->current_path, fallback);
+ return fallback;
+}
+
+static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
+{
+ return ns->ctrl->state == NVME_CTRL_LIVE &&
+ ns->ana_state == NVME_ANA_OPTIMIZED;
}
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
- if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
+ if (unlikely(!ns || !nvme_path_is_optimized(ns)))
ns = __nvme_find_path(head);
return ns;
}
@@ -135,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
srcu_idx = srcu_read_lock(&head->srcu);
ns = srcu_dereference(head->current_path, &head->srcu);
- if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
+ if (likely(ns && nvme_path_is_optimized(ns)))
found = ns->queue->poll_fn(q, qc);
srcu_read_unlock(&head->srcu, srcu_idx);
return found;
@@ -169,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
struct request_queue *q;
bool vwc = false;
+ mutex_init(&head->lock);
bio_list_init(&head->requeue_list);
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
@@ -213,29 +273,232 @@ out:
return -ENOMEM;
}
-void nvme_mpath_add_disk(struct nvme_ns_head *head)
+static void nvme_mpath_set_live(struct nvme_ns *ns)
{
+ struct nvme_ns_head *head = ns->head;
+
+ lockdep_assert_held(&ns->head->lock);
+
if (!head->disk)
return;
- mutex_lock(&head->subsys->lock);
if (!(head->disk->flags & GENHD_FL_UP)) {
device_add_disk(&head->subsys->dev, head->disk);
if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
&nvme_ns_id_attr_group))
- pr_warn("%s: failed to create sysfs group for identification\n",
- head->disk->disk_name);
+ dev_warn(&head->subsys->dev,
+ "failed to create id group.\n");
+ }
+
+ kblockd_schedule_work(&ns->head->requeue_work);
+}
+
+static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
+ int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
+ void *))
+{
+ void *base = ctrl->ana_log_buf;
+ size_t offset = sizeof(struct nvme_ana_rsp_hdr);
+ int error, i;
+
+ lockdep_assert_held(&ctrl->ana_lock);
+
+ for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
+ struct nvme_ana_group_desc *desc = base + offset;
+ u32 nr_nsids = le32_to_cpu(desc->nnsids);
+ size_t nsid_buf_size = nr_nsids * sizeof(__le32);
+
+ if (WARN_ON_ONCE(desc->grpid == 0))
+ return -EINVAL;
+ if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
+ return -EINVAL;
+ if (WARN_ON_ONCE(desc->state == 0))
+ return -EINVAL;
+ if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
+ return -EINVAL;
+
+ offset += sizeof(*desc);
+ if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
+ return -EINVAL;
+
+ error = cb(ctrl, desc, data);
+ if (error)
+ return error;
+
+ offset += nsid_buf_size;
+ if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline bool nvme_state_is_live(enum nvme_ana_state state)
+{
+ return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
+}
+
+static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
+ struct nvme_ns *ns)
+{
+ enum nvme_ana_state old;
+
+ mutex_lock(&ns->head->lock);
+ old = ns->ana_state;
+ ns->ana_grpid = le32_to_cpu(desc->grpid);
+ ns->ana_state = desc->state;
+ clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
+
+ if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
+ nvme_mpath_set_live(ns);
+ mutex_unlock(&ns->head->lock);
+}
+
+static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
+ struct nvme_ana_group_desc *desc, void *data)
+{
+ u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
+ unsigned *nr_change_groups = data;
+ struct nvme_ns *ns;
+
+ dev_info(ctrl->device, "ANA group %d: %s.\n",
+ le32_to_cpu(desc->grpid),
+ nvme_ana_state_names[desc->state]);
+
+ if (desc->state == NVME_ANA_CHANGE)
+ (*nr_change_groups)++;
+
+ if (!nr_nsids)
+ return 0;
+
+ down_write(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
+ continue;
+ nvme_update_ns_ana_state(desc, ns);
+ if (++n == nr_nsids)
+ break;
+ }
+ up_write(&ctrl->namespaces_rwsem);
+ WARN_ON_ONCE(n < nr_nsids);
+ return 0;
+}
+
+static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
+{
+ u32 nr_change_groups = 0;
+ int error;
+
+ mutex_lock(&ctrl->ana_lock);
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
+ groups_only ? NVME_ANA_LOG_RGO : 0,
+ ctrl->ana_log_buf, ctrl->ana_log_size, 0);
+ if (error) {
+ dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
+ goto out_unlock;
+ }
+
+ error = nvme_parse_ana_log(ctrl, &nr_change_groups,
+ nvme_update_ana_state);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * In theory we should have an ANATT timer per group as they might enter
+ * the change state at different times. But that is a lot of overhead
+ * just to protect against a target that keeps entering new changes
+ * states while never finishing previous ones. But we'll still
+ * eventually time out once all groups are in change state, so this
+ * isn't a big deal.
+ *
+ * We also double the ANATT value to provide some slack for transports
+ * or AEN processing overhead.
+ */
+ if (nr_change_groups)
+ mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
+ else
+ del_timer_sync(&ctrl->anatt_timer);
+out_unlock:
+ mutex_unlock(&ctrl->ana_lock);
+ return error;
+}
+
+static void nvme_ana_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
+
+ nvme_read_ana_log(ctrl, false);
+}
+
+static void nvme_anatt_timeout(struct timer_list *t)
+{
+ struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
+
+ dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
+ nvme_reset_ctrl(ctrl);
+}
+
+void nvme_mpath_stop(struct nvme_ctrl *ctrl)
+{
+ if (!nvme_ctrl_use_ana(ctrl))
+ return;
+ del_timer_sync(&ctrl->anatt_timer);
+ cancel_work_sync(&ctrl->ana_work);
+}
+
+static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
+}
+DEVICE_ATTR_RO(ana_grpid);
+
+static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
+}
+DEVICE_ATTR_RO(ana_state);
+
+static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
+ struct nvme_ana_group_desc *desc, void *data)
+{
+ struct nvme_ns *ns = data;
+
+ if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
+ nvme_update_ns_ana_state(desc, ns);
+ return -ENXIO; /* just break out of the loop */
+ }
+
+ return 0;
+}
+
+void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+ if (nvme_ctrl_use_ana(ns->ctrl)) {
+ mutex_lock(&ns->ctrl->ana_lock);
+ ns->ana_grpid = le32_to_cpu(id->anagrpid);
+ nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
+ mutex_unlock(&ns->ctrl->ana_lock);
+ } else {
+ mutex_lock(&ns->head->lock);
+ ns->ana_state = NVME_ANA_OPTIMIZED;
+ nvme_mpath_set_live(ns);
+ mutex_unlock(&ns->head->lock);
}
- mutex_unlock(&head->subsys->lock);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
- sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
- &nvme_ns_id_attr_group);
- del_gendisk(head->disk);
+ if (head->disk->flags & GENHD_FL_UP) {
+ sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
+ &nvme_ns_id_attr_group);
+ del_gendisk(head->disk);
+ }
blk_set_queue_dying(head->disk->queue);
/* make sure all pending bios are cleaned up */
kblockd_schedule_work(&head->requeue_work);
@@ -243,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
blk_cleanup_queue(head->disk->queue);
put_disk(head->disk);
}
+
+int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+ int error;
+
+ if (!nvme_ctrl_use_ana(ctrl))
+ return 0;
+
+ ctrl->anacap = id->anacap;
+ ctrl->anatt = id->anatt;
+ ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
+ ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
+
+ mutex_init(&ctrl->ana_lock);
+ timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
+ ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
+ ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
+ if (!(ctrl->anacap & (1 << 6)))
+ ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
+
+ if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
+ dev_err(ctrl->device,
+ "ANA log page size (%zd) larger than MDTS (%d).\n",
+ ctrl->ana_log_size,
+ ctrl->max_hw_sectors << SECTOR_SHIFT);
+ dev_err(ctrl->device, "disabling ANA support.\n");
+ return 0;
+ }
+
+ INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+ ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
+ if (!ctrl->ana_log_buf)
+ goto out;
+
+ error = nvme_read_ana_log(ctrl, true);
+ if (error)
+ goto out_free_ana_log_buf;
+ return 0;
+out_free_ana_log_buf:
+ kfree(ctrl->ana_log_buf);
+out:
+ return -ENOMEM;
+}
+
+void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
+{
+ kfree(ctrl->ana_log_buf);
+}
+
OpenPOWER on IntegriCloud