summaryrefslogtreecommitdiffstats
path: root/drivers/nvme/host/rdma.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/rdma.c')
-rw-r--r--drivers/nvme/host/rdma.c215
1 files changed, 103 insertions, 112 deletions
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 8d2875b4c56d..5a8388177959 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -43,10 +43,6 @@
#define NVME_RDMA_MAX_INLINE_SEGMENTS 1
-#define NVME_RDMA_MAX_PAGES_PER_MR 512
-
-#define NVME_RDMA_DEF_RECONNECT_DELAY 20
-
/*
* We handle AEN commands ourselves and don't even let the
* block layer know about them.
@@ -58,7 +54,6 @@
struct nvme_rdma_device {
struct ib_device *dev;
struct ib_pd *pd;
- struct ib_mr *mr;
struct kref ref;
struct list_head entry;
};
@@ -77,7 +72,6 @@ struct nvme_rdma_request {
u32 num_sge;
int nents;
bool inline_data;
- bool need_inval;
struct ib_reg_wr reg_wr;
struct ib_cqe reg_cqe;
struct nvme_rdma_queue *queue;
@@ -87,6 +81,8 @@ struct nvme_rdma_request {
enum nvme_rdma_queue_flags {
NVME_RDMA_Q_CONNECTED = (1 << 0),
+ NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1),
+ NVME_RDMA_Q_DELETING = (1 << 2),
};
struct nvme_rdma_queue {
@@ -286,7 +282,7 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq)
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
int ret = 0;
- if (!req->need_inval)
+ if (!req->mr->need_inval)
goto out;
ib_dereg_mr(req->mr);
@@ -296,9 +292,10 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq)
if (IS_ERR(req->mr)) {
ret = PTR_ERR(req->mr);
req->mr = NULL;
+ goto out;
}
- req->need_inval = false;
+ req->mr->need_inval = false;
out:
return ret;
@@ -410,10 +407,7 @@ static void nvme_rdma_free_dev(struct kref *ref)
list_del(&ndev->entry);
mutex_unlock(&device_list_mutex);
- if (!register_always)
- ib_dereg_mr(ndev->mr);
ib_dealloc_pd(ndev->pd);
-
kfree(ndev);
}
@@ -446,24 +440,16 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
ndev->dev = cm_id->device;
kref_init(&ndev->ref);
- ndev->pd = ib_alloc_pd(ndev->dev);
+ ndev->pd = ib_alloc_pd(ndev->dev,
+ register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
if (IS_ERR(ndev->pd))
goto out_free_dev;
- if (!register_always) {
- ndev->mr = ib_get_dma_mr(ndev->pd,
- IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE);
- if (IS_ERR(ndev->mr))
- goto out_free_pd;
- }
-
if (!(ndev->dev->attrs.device_cap_flags &
IB_DEVICE_MEM_MGT_EXTENSIONS)) {
dev_err(&ndev->dev->dev,
"Memory registrations not supported.\n");
- goto out_free_mr;
+ goto out_free_pd;
}
list_add(&ndev->entry, &device_list);
@@ -471,9 +457,6 @@ out_unlock:
mutex_unlock(&device_list_mutex);
return ndev;
-out_free_mr:
- if (!register_always)
- ib_dereg_mr(ndev->mr);
out_free_pd:
ib_dealloc_pd(ndev->pd);
out_free_dev:
@@ -485,9 +468,14 @@ out_err:
static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
{
- struct nvme_rdma_device *dev = queue->device;
- struct ib_device *ibdev = dev->dev;
+ struct nvme_rdma_device *dev;
+ struct ib_device *ibdev;
+
+ if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags))
+ return;
+ dev = queue->device;
+ ibdev = dev->dev;
rdma_destroy_qp(queue->cm_id);
ib_free_cq(queue->ib_cq);
@@ -538,6 +526,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue,
ret = -ENOMEM;
goto out_destroy_qp;
}
+ set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags);
return 0;
@@ -590,11 +579,13 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
goto out_destroy_cm_id;
}
+ clear_bit(NVME_RDMA_Q_DELETING, &queue->flags);
set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags);
return 0;
out_destroy_cm_id:
+ nvme_rdma_destroy_queue_ib(queue);
rdma_destroy_id(queue->cm_id);
return ret;
}
@@ -613,7 +604,7 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue)
{
- if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
+ if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags))
return;
nvme_rdma_stop_queue(queue);
nvme_rdma_free_queue(queue);
@@ -645,7 +636,8 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
int i, ret;
for (i = 1; i < ctrl->queue_count; i++) {
- ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.sqsize);
+ ret = nvme_rdma_init_queue(ctrl, i,
+ ctrl->ctrl.opts->queue_size);
if (ret) {
dev_info(ctrl->ctrl.device,
"failed to initialize i/o queue: %d\n", ret);
@@ -656,7 +648,7 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
return 0;
out_free_queues:
- for (; i >= 1; i--)
+ for (i--; i >= 1; i--)
nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
return ret;
@@ -765,8 +757,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl = container_of(work,
struct nvme_rdma_ctrl, err_work);
+ int i;
nvme_stop_keep_alive(&ctrl->ctrl);
+
+ for (i = 0; i < ctrl->queue_count; i++)
+ clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags);
+
if (ctrl->queue_count > 1)
nvme_stop_queues(&ctrl->ctrl);
blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
@@ -849,7 +846,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
if (!blk_rq_bytes(rq))
return;
- if (req->need_inval) {
+ if (req->mr->need_inval) {
res = nvme_rdma_inv_rkey(queue, req);
if (res < 0) {
dev_err(ctrl->ctrl.device,
@@ -903,7 +900,7 @@ static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
- put_unaligned_le32(queue->device->mr->rkey, sg->key);
+ put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
return 0;
}
@@ -935,7 +932,7 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE;
- req->need_inval = true;
+ req->mr->need_inval = true;
sg->addr = cpu_to_le64(req->mr->iova);
put_unaligned_le24(req->mr->length, sg->length);
@@ -958,7 +955,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
req->num_sge = 1;
req->inline_data = false;
- req->need_inval = false;
+ req->mr->need_inval = false;
c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -988,7 +985,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
nvme_rdma_queue_idx(queue))
return nvme_rdma_map_sg_inline(queue, req, c);
- if (!register_always)
+ if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY)
return nvme_rdma_map_sg_single(queue, req, c);
}
@@ -1145,7 +1142,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
wc->ex.invalidate_rkey == req->mr->rkey)
- req->need_inval = false;
+ req->mr->need_inval = false;
blk_mq_complete_request(rq, status);
@@ -1278,8 +1275,22 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
- priv.hrqsize = cpu_to_le16(queue->queue_size);
- priv.hsqsize = cpu_to_le16(queue->queue_size);
+ /*
+ * set the admin queue depth to the minimum size
+ * specified by the Fabrics standard.
+ */
+ if (priv.qid == 0) {
+ priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH);
+ priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1);
+ } else {
+ /*
+ * current interpretation of the fabrics spec
+ * is at minimum you make hrqsize sqsize+1, or a
+ * 1's based representation of sqsize.
+ */
+ priv.hrqsize = cpu_to_le16(queue->queue_size);
+ priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
+ }
ret = rdma_connect(queue->cm_id, &param);
if (ret) {
@@ -1295,58 +1306,6 @@ out_destroy_queue_ib:
return ret;
}
-/**
- * nvme_rdma_device_unplug() - Handle RDMA device unplug
- * @queue: Queue that owns the cm_id that caught the event
- *
- * DEVICE_REMOVAL event notifies us that the RDMA device is about
- * to unplug so we should take care of destroying our RDMA resources.
- * This event will be generated for each allocated cm_id.
- *
- * In our case, the RDMA resources are managed per controller and not
- * only per queue. So the way we handle this is we trigger an implicit
- * controller deletion upon the first DEVICE_REMOVAL event we see, and
- * hold the event inflight until the controller deletion is completed.
- *
- * One exception that we need to handle is the destruction of the cm_id
- * that caught the event. Since we hold the callout until the controller
- * deletion is completed, we'll deadlock if the controller deletion will
- * call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership
- * of destroying this queue before-hand, destroy the queue resources,
- * then queue the controller deletion which won't destroy this queue and
- * we destroy the cm_id implicitely by returning a non-zero rc to the callout.
- */
-static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue)
-{
- struct nvme_rdma_ctrl *ctrl = queue->ctrl;
- int ret;
-
- /* Own the controller deletion */
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
- return 0;
-
- dev_warn(ctrl->ctrl.device,
- "Got rdma device removal event, deleting ctrl\n");
-
- /* Get rid of reconnect work if its running */
- cancel_delayed_work_sync(&ctrl->reconnect_work);
-
- /* Disable the queue so ctrl delete won't free it */
- if (test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) {
- /* Free this queue ourselves */
- nvme_rdma_stop_queue(queue);
- nvme_rdma_destroy_queue_ib(queue);
-
- /* Return non-zero so the cm_id will destroy implicitly */
- ret = 1;
- }
-
- /* Queue controller deletion */
- queue_work(nvme_rdma_wq, &ctrl->delete_work);
- flush_work(&ctrl->delete_work);
- return ret;
-}
-
static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *ev)
{
@@ -1388,8 +1347,8 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
nvme_rdma_error_recovery(queue->ctrl);
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
- /* return 1 means impliciy CM ID destroy */
- return nvme_rdma_device_unplug(queue);
+ /* device removal is handled via the ib_client API */
+ break;
default:
dev_err(queue->ctrl->ctrl.device,
"Unexpected RDMA CM event (%d)\n", ev->event);
@@ -1461,7 +1420,7 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH)
flush = true;
ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
- req->need_inval ? &req->reg_wr.wr : NULL, flush);
+ req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
if (ret) {
nvme_rdma_unmap_data(queue, rq);
goto err;
@@ -1521,7 +1480,6 @@ static void nvme_rdma_complete_rq(struct request *rq)
static struct blk_mq_ops nvme_rdma_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
- .map_queue = blk_mq_map_queue,
.init_request = nvme_rdma_init_request,
.exit_request = nvme_rdma_exit_request,
.reinit_request = nvme_rdma_reinit_request,
@@ -1533,7 +1491,6 @@ static struct blk_mq_ops nvme_rdma_mq_ops = {
static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
- .map_queue = blk_mq_map_queue,
.init_request = nvme_rdma_init_admin_request,
.exit_request = nvme_rdma_exit_admin_request,
.reinit_request = nvme_rdma_reinit_request,
@@ -1690,15 +1647,19 @@ static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
- int ret;
+ int ret = 0;
+ /*
+ * Keep a reference until all work is flushed since
+ * __nvme_rdma_del_ctrl can free the ctrl mem
+ */
+ if (!kref_get_unless_zero(&ctrl->ctrl.kref))
+ return -EBUSY;
ret = __nvme_rdma_del_ctrl(ctrl);
- if (ret)
- return ret;
-
- flush_work(&ctrl->delete_work);
-
- return 0;
+ if (!ret)
+ flush_work(&ctrl->delete_work);
+ nvme_put_ctrl(&ctrl->ctrl);
+ return ret;
}
static void nvme_rdma_remove_ctrl_work(struct work_struct *work)
@@ -1816,7 +1777,7 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
ctrl->tag_set.ops = &nvme_rdma_mq_ops;
- ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize;
+ ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
ctrl->tag_set.reserved_tags = 1; /* fabric connect */
ctrl->tag_set.numa_node = NUMA_NO_NODE;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
@@ -1914,7 +1875,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
spin_lock_init(&ctrl->lock);
ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
- ctrl->ctrl.sqsize = opts->queue_size;
+ ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
ret = -ENOMEM;
@@ -1995,27 +1956,57 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.create_ctrl = nvme_rdma_create_ctrl,
};
+static void nvme_rdma_add_one(struct ib_device *ib_device)
+{
+}
+
+static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
+{
+ struct nvme_rdma_ctrl *ctrl;
+
+ /* Delete all controllers using this device */
+ mutex_lock(&nvme_rdma_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
+ if (ctrl->device->dev != ib_device)
+ continue;
+ dev_info(ctrl->ctrl.device,
+ "Removing ctrl: NQN \"%s\", addr %pISp\n",
+ ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+ __nvme_rdma_del_ctrl(ctrl);
+ }
+ mutex_unlock(&nvme_rdma_ctrl_mutex);
+
+ flush_workqueue(nvme_rdma_wq);
+}
+
+static struct ib_client nvme_rdma_ib_client = {
+ .name = "nvme_rdma",
+ .add = nvme_rdma_add_one,
+ .remove = nvme_rdma_remove_one
+};
+
static int __init nvme_rdma_init_module(void)
{
+ int ret;
+
nvme_rdma_wq = create_workqueue("nvme_rdma_wq");
if (!nvme_rdma_wq)
return -ENOMEM;
+ ret = ib_register_client(&nvme_rdma_ib_client);
+ if (ret) {
+ destroy_workqueue(nvme_rdma_wq);
+ return ret;
+ }
+
nvmf_register_transport(&nvme_rdma_transport);
return 0;
}
static void __exit nvme_rdma_cleanup_module(void)
{
- struct nvme_rdma_ctrl *ctrl;
-
nvmf_unregister_transport(&nvme_rdma_transport);
-
- mutex_lock(&nvme_rdma_ctrl_mutex);
- list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
- __nvme_rdma_del_ctrl(ctrl);
- mutex_unlock(&nvme_rdma_ctrl_mutex);
-
+ ib_unregister_client(&nvme_rdma_ib_client);
destroy_workqueue(nvme_rdma_wq);
}
OpenPOWER on IntegriCloud