diff options
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/core.c | 49 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 86 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.h | 14 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 150 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 4 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 3 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 44 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 111 |
8 files changed, 228 insertions, 233 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index effb1309682e..46df030b2c3f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1808,6 +1808,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, u32 max_segments = (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + max_segments = min_not_zero(max_segments, ctrl->max_segments); blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } @@ -2208,7 +2209,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) * Verify that the subsystem actually supports multiple * controllers, else bail out. */ - if (!ctrl->opts->discovery_nqn && + if (!(ctrl->opts && ctrl->opts->discovery_nqn) && nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { dev_err(ctrl->device, "ignoring ctrl due to duplicate subnqn (%s).\n", @@ -3197,40 +3198,28 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) nvme_remove_invalid_namespaces(ctrl, nn); } -static bool nvme_scan_changed_ns_log(struct nvme_ctrl *ctrl) +static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) { size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32); __le32 *log; - int error, i; - bool ret = false; + int error; log = kzalloc(log_size, GFP_KERNEL); if (!log) - return false; + return; + /* + * We need to read the log to clear the AEN, but we don't want to rely + * on it for the changed namespace information as userspace could have + * raced with us in reading the log page, which could cause us to miss + * updates. + */ error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); - if (error) { + if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); - goto out_free_log; - } - - if (log[0] == cpu_to_le32(0xffffffff)) - goto out_free_log; - - for (i = 0; i < NVME_MAX_CHANGED_NAMESPACES; i++) { - u32 nsid = le32_to_cpu(log[i]); - if (nsid == 0) - break; - dev_info(ctrl->device, "rescanning namespace %d.\n", nsid); - nvme_validate_ns(ctrl, nsid); - } - ret = true; - -out_free_log: kfree(log); - return ret; } static void nvme_scan_work(struct work_struct *work) @@ -3246,9 +3235,8 @@ static void nvme_scan_work(struct work_struct *work) WARN_ON_ONCE(!ctrl->tagset); if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { - if (nvme_scan_changed_ns_log(ctrl)) - goto out_sort_namespaces; dev_info(ctrl->device, "rescanning namespaces.\n"); + nvme_clear_changed_ns_log(ctrl); } if (nvme_identify_ctrl(ctrl, &id)) @@ -3263,7 +3251,6 @@ static void nvme_scan_work(struct work_struct *work) nvme_scan_ns_sequential(ctrl, nn); out_free_id: kfree(id); -out_sort_namespaces: down_write(&ctrl->namespaces_rwsem); list_sort(NULL, &ctrl->namespaces, ns_cmp); up_write(&ctrl->namespaces_rwsem); @@ -3641,16 +3628,6 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); -int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) -{ - if (!ctrl->ops->reinit_request) - return 0; - - return blk_mq_tagset_iter(set, set->driver_data, - ctrl->ops->reinit_request); -} -EXPORT_SYMBOL_GPL(nvme_reinit_tagset); - int __init nvme_core_init(void) { int result = -ENOMEM; diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index fa32c1216409..903eb4545e26 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -536,67 +536,55 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( return NULL; } -blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live, bool is_connected) +/* + * For something we're not in a state to send to the device the default action + * is to busy it and retry it after the controller state is recovered. However, + * anything marked for failfast or nvme multipath is immediately failed. + * + * Note: commands used to initialize the controller will be marked for failfast. + * Note: nvme cli/ioctl commands are marked for failfast. + */ +blk_status_t nvmf_fail_nonready_command(struct request *rq) { - struct nvme_command *cmd = nvme_req(rq)->cmd; + if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + return BLK_STS_RESOURCE; + nvme_req(rq)->status = NVME_SC_ABORT_REQ; + return BLK_STS_IOERR; +} +EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); - if (likely(ctrl->state == NVME_CTRL_LIVE && is_connected)) - return BLK_STS_OK; +bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + struct nvme_request *req = nvme_req(rq); + + /* + * If we are in some state of setup or teardown only allow + * internally generated commands. + */ + if (!blk_rq_is_passthrough(rq) || (req->flags & NVME_REQ_USERCMD)) + return false; + /* + * Only allow commands on a live queue, except for the connect command, + * which is require to set the queue live in the appropinquate states. + */ switch (ctrl->state) { case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: - case NVME_CTRL_DELETING: - /* - * This is the case of starting a new or deleting an association - * but connectivity was lost before it was fully created or torn - * down. We need to error the commands used to initialize the - * controller so the reconnect can go into a retry attempt. The - * commands should all be marked REQ_FAILFAST_DRIVER, which will - * hit the reject path below. Anything else will be queued while - * the state settles. - */ - if (!is_connected) - break; - - /* - * If queue is live, allow only commands that are internally - * generated pass through. These are commands on the admin - * queue to initialize the controller. This will reject any - * ioctl admin cmds received while initializing. - */ - if (queue_live && !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) - return BLK_STS_OK; - - /* - * If the queue is not live, allow only a connect command. This - * will reject any ioctl admin cmd as well as initialization - * commands if the controller reverted the queue to non-live. - */ - if (!queue_live && blk_rq_is_passthrough(rq) && - cmd->common.opcode == nvme_fabrics_command && - cmd->fabrics.fctype == nvme_fabrics_type_connect) - return BLK_STS_OK; + if (req->cmd->common.opcode == nvme_fabrics_command && + req->cmd->fabrics.fctype == nvme_fabrics_type_connect) + return true; break; default: break; + case NVME_CTRL_DEAD: + return false; } - /* - * Any other new io is something we're not in a state to send to the - * device. Default action is to busy it and retry it after the - * controller state is recovered. However, anything marked for failfast - * or nvme multipath is immediately failed. Note: commands used to - * initialize the controller will be marked for failfast. - * Note: nvme cli/ioctl commands are marked for failfast. - */ - if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) - return BLK_STS_RESOURCE; - nvme_req(rq)->status = NVME_SC_ABORT_REQ; - return BLK_STS_IOERR; + return queue_live; } -EXPORT_SYMBOL_GPL(nvmf_check_if_ready); +EXPORT_SYMBOL_GPL(__nvmf_check_ready); static const match_table_t opt_tokens = { { NVMF_OPT_TRANSPORT, "transport=%s" }, diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 7491a0bbf711..e1818a27aa2d 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -162,7 +162,17 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); -blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, - struct request *rq, bool queue_live, bool is_connected); +blk_status_t nvmf_fail_nonready_command(struct request *rq); +bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live); + +static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + if (likely(ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_ADMIN_ONLY)) + return true; + return __nvmf_check_ready(ctrl, rq, queue_live); +} #endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0bad65803271..41d45a1b5c62 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -142,6 +142,7 @@ struct nvme_fc_ctrl { struct nvme_fc_rport *rport; u32 cnum; + bool ioq_live; bool assoc_active; u64 association_id; @@ -1470,21 +1471,6 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); -static int -nvme_fc_reinit_request(void *data, struct request *rq) -{ - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; - - memset(cmdiu, 0, sizeof(*cmdiu)); - cmdiu->scsi_id = NVME_CMD_SCSI_ID; - cmdiu->fc_id = NVME_CMD_FC_ID; - cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32)); - memset(&op->rsp_iu, 0, sizeof(op->rsp_iu)); - - return 0; -} - static void __nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) @@ -1893,6 +1879,7 @@ nvme_fc_free_queue(struct nvme_fc_queue *queue) */ queue->connection_id = 0; + atomic_set(&queue->csn, 1); } static void @@ -2279,14 +2266,13 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; struct nvme_command *sqe = &cmdiu->sqe; enum nvmefc_fcp_datadir io_dir; + bool queue_ready = test_bit(NVME_FC_Q_LIVE, &queue->flags); u32 data_len; blk_status_t ret; - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, - test_bit(NVME_FC_Q_LIVE, &queue->flags), - ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE); - if (unlikely(ret)) - return ret; + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || + !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvmf_fail_nonready_command(rq); ret = nvme_setup_cmd(ns, rq, sqe); if (ret) @@ -2463,6 +2449,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queues; + ctrl->ioq_live = true; + return 0; out_delete_hw_queues: @@ -2480,7 +2468,7 @@ out_free_tag_set: } static int -nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) +nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; unsigned int nr_io_queues; @@ -2500,12 +2488,6 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) if (ctrl->ctrl.queue_count == 1) return 0; - nvme_fc_init_io_queues(ctrl); - - ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); - if (ret) - goto out_free_io_queues; - ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) goto out_free_io_queues; @@ -2603,8 +2585,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0); - ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, NVME_AQ_DEPTH); if (ret) @@ -2615,8 +2595,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queue; - if (ctrl->ctrl.state != NVME_CTRL_NEW) - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (ret) @@ -2689,10 +2668,10 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) */ if (ctrl->ctrl.queue_count > 1) { - if (ctrl->ctrl.state == NVME_CTRL_NEW) + if (!ctrl->ioq_live) ret = nvme_fc_create_io_queues(ctrl); else - ret = nvme_fc_reinit_io_queues(ctrl); + ret = nvme_fc_recreate_io_queues(ctrl); if (ret) goto out_term_aen_ops; } @@ -2776,8 +2755,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * use blk_mq_tagset_busy_itr() and the transport routine to * terminate the exchanges. */ - if (ctrl->ctrl.state != NVME_CTRL_NEW) - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2812,6 +2790,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) /* re-enable the admin_q so anything new can fast fail */ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + /* resume the io queues so that things will fast fail */ + nvme_start_queues(&ctrl->ctrl); + nvme_fc_ctlr_inactive_on_rport(ctrl); } @@ -2826,9 +2807,6 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) * waiting for io to terminate */ nvme_fc_delete_association(ctrl); - - /* resume the io queues so that things will fast fail */ - nvme_start_queues(nctrl); } static void @@ -2917,7 +2895,6 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_delete_ctrl, .get_address = nvmf_get_address, - .reinit_request = nvme_fc_reinit_request, }; static void @@ -2934,7 +2911,7 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) nvme_fc_reconnect_or_delete(ctrl, ret); else dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reconnect complete\n", + "NVME-FC{%d}: controller connect complete\n", ctrl->cnum); } @@ -2982,7 +2959,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, { struct nvme_fc_ctrl *ctrl; unsigned long flags; - int ret, idx, retry; + int ret, idx; if (!(rport->remoteport.port_role & (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) { @@ -3009,11 +2986,13 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, } ctrl->ctrl.opts = opts; + ctrl->ctrl.nr_reconnects = 0; INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; ctrl->dev = lport->dev; ctrl->cnum = idx; + ctrl->ioq_live = false; ctrl->assoc_active = false; init_waitqueue_head(&ctrl->ioabort_wait); @@ -3032,6 +3011,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; + ctrl->ctrl.cntlid = 0xffff; ret = -ENOMEM; ctrl->queues = kcalloc(ctrl->ctrl.queue_count, @@ -3039,6 +3019,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, if (!ctrl->queues) goto out_free_ida; + nvme_fc_init_queue(ctrl, 0); + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; @@ -3081,62 +3063,24 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); spin_unlock_irqrestore(&rport->lock, flags); - /* - * It's possible that transactions used to create the association - * may fail. Examples: CreateAssociation LS or CreateIOConnection - * LS gets dropped/corrupted/fails; or a frame gets dropped or a - * command times out for one of the actions to init the controller - * (Connect, Get/Set_Property, Set_Features, etc). Many of these - * transport errors (frame drop, LS failure) inherently must kill - * the association. The transport is coded so that any command used - * to create the association (prior to a LIVE state transition - * while NEW or CONNECTING) will fail if it completes in error or - * times out. - * - * As such: as the connect request was mostly likely due to a - * udev event that discovered the remote port, meaning there is - * not an admin or script there to restart if the connect - * request fails, retry the initial connection creation up to - * three times before giving up and declaring failure. - */ - for (retry = 0; retry < 3; retry++) { - ret = nvme_fc_create_association(ctrl); - if (!ret) - break; - } - - if (ret) { - nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); - cancel_work_sync(&ctrl->ctrl.reset_work); - cancel_delayed_work_sync(&ctrl->connect_work); - - /* couldn't schedule retry - fail out */ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) || + !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: Connect retry failed\n", ctrl->cnum); - - ctrl->ctrl.opts = NULL; + "NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum); + goto fail_ctrl; + } - /* initiate nvme ctrl ref counting teardown */ - nvme_uninit_ctrl(&ctrl->ctrl); + nvme_get_ctrl(&ctrl->ctrl); - /* Remove core ctrl ref. */ + if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { nvme_put_ctrl(&ctrl->ctrl); - - /* as we're past the point where we transition to the ref - * counting teardown path, if we return a bad pointer here, - * the calling routine, thinking it's prior to the - * transition, will do an rport put. Since the teardown - * path also does a rport put, we do an extra get here to - * so proper order/teardown happens. - */ - nvme_fc_rport_get(rport); - - if (ret > 0) - ret = -EIO; - return ERR_PTR(ret); + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: failed to schedule initial connect\n", + ctrl->cnum); + goto fail_ctrl; } - nvme_get_ctrl(&ctrl->ctrl); + flush_delayed_work(&ctrl->connect_work); dev_info(ctrl->ctrl.device, "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", @@ -3144,6 +3088,30 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, return &ctrl->ctrl; +fail_ctrl: + nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); + cancel_work_sync(&ctrl->ctrl.reset_work); + cancel_delayed_work_sync(&ctrl->connect_work); + + ctrl->ctrl.opts = NULL; + + /* initiate nvme ctrl ref counting teardown */ + nvme_uninit_ctrl(&ctrl->ctrl); + + /* Remove core ctrl ref. */ + nvme_put_ctrl(&ctrl->ctrl); + + /* as we're past the point where we transition to the ref + * counting teardown path, if we return a bad pointer here, + * the calling routine, thinking it's prior to the + * transition, will do an rport put. Since the teardown + * path also does a rport put, we do an extra get here to + * so proper order/teardown happens. + */ + nvme_fc_rport_get(rport); + + return ERR_PTR(-EIO); + out_cleanup_admin_q: blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_admin_tag_set: diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d7b664ae5923..1ffd3e8b13a1 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -12,6 +12,7 @@ */ #include <linux/moduleparam.h> +#include <trace/events/block.h> #include "nvme.h" static bool multipath = true; @@ -111,6 +112,9 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, if (likely(ns)) { bio->bi_disk = ns->disk; bio->bi_opf |= REQ_NVME_MPATH; + trace_block_bio_remap(bio->bi_disk->queue, bio, + disk_devt(ns->head->disk), + bio->bi_iter.bi_sector); ret = direct_make_request(bio); } else if (!list_empty_careful(&head->list)) { dev_warn_ratelimited(dev, "no path available - requeuing I/O\n"); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 34df07d44f80..0c4a33df3b2f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -170,6 +170,7 @@ struct nvme_ctrl { u64 cap; u32 page_size; u32 max_hw_sectors; + u32 max_segments; u16 oncs; u16 oacs; u16 nssa; @@ -321,7 +322,6 @@ struct nvme_ctrl_ops { void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); - int (*reinit_request)(void *data, struct request *rq); void (*stop_ctrl)(struct nvme_ctrl *ctrl); }; @@ -416,7 +416,6 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); -int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fc33804662e7..ba943f211687 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -38,6 +38,13 @@ #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +/* + * These can be higher, but we need to ensure that any command doesn't + * require an sg allocation that needs more than a page of data. + */ +#define NVME_MAX_KB_SZ 4096 +#define NVME_MAX_SEGS 127 + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -100,6 +107,8 @@ struct nvme_dev { struct nvme_ctrl ctrl; struct completion ioq_wait; + mempool_t *iod_mempool; + /* shadow doorbell buffer support: */ u32 *dbbuf_dbs; dma_addr_t dbbuf_dbs_dma_addr; @@ -477,10 +486,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) iod->use_sgl = nvme_pci_use_sgls(dev, rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { - size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg, - iod->use_sgl); - - iod->sg = kmalloc(alloc_size, GFP_ATOMIC); + iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); if (!iod->sg) return BLK_STS_RESOURCE; } else { @@ -526,7 +532,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) } if (iod->sg != iod->inline_sg) - kfree(iod->sg); + mempool_free(iod->sg, dev->iod_mempool); } #ifdef CONFIG_BLK_DEV_INTEGRITY @@ -2280,6 +2286,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); free_opal_dev(dev->ctrl.opal_dev); + mempool_destroy(dev->iod_mempool); kfree(dev); } @@ -2289,6 +2296,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); + nvme_kill_queues(&dev->ctrl); if (!queue_work(nvme_wq, &dev->remove_work)) nvme_put_ctrl(&dev->ctrl); } @@ -2333,6 +2341,13 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + /* + * Limit the max command size to prevent iod->sg allocations going + * over a single page. + */ + dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; + dev->ctrl.max_segments = NVME_MAX_SEGS; + result = nvme_init_identify(&dev->ctrl); if (result) goto out; @@ -2405,7 +2420,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); - nvme_kill_queues(&dev->ctrl); if (pci_get_drvdata(pdev)) device_release_driver(&pdev->dev); nvme_put_ctrl(&dev->ctrl); @@ -2509,6 +2523,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) int node, result = -ENOMEM; struct nvme_dev *dev; unsigned long quirks = id->driver_data; + size_t alloc_size; node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) @@ -2546,6 +2561,23 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; + /* + * Double check that our mempool alloc size will cover the biggest + * command we support. + */ + alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ, + NVME_MAX_SEGS, true); + WARN_ON_ONCE(alloc_size > PAGE_SIZE); + + dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, + mempool_kfree, + (void *) alloc_size, + GFP_KERNEL, node); + if (!dev->iod_mempool) { + result = -ENOMEM; + goto release_pools; + } + dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); nvme_get_ctrl(&dev->ctrl); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2aba03876d84..9544625c0b7d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -560,12 +560,6 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) return; - if (nvme_rdma_queue_idx(queue) == 0) { - nvme_rdma_free_qe(queue->device->dev, - &queue->ctrl->async_event_sqe, - sizeof(struct nvme_command), DMA_TO_DEVICE); - } - nvme_rdma_destroy_queue_ib(queue); rdma_destroy_id(queue->cm_id); } @@ -698,7 +692,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set = &ctrl->tag_set; memset(set, 0, sizeof(*set)); set->ops = &nvme_rdma_mq_ops; - set->queue_depth = nctrl->opts->queue_size; + set->queue_depth = nctrl->sqsize + 1; set->reserved_tags = 1; /* fabric connect */ set->numa_node = NUMA_NO_NODE; set->flags = BLK_MQ_F_SHOULD_MERGE; @@ -734,11 +728,12 @@ out: static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - nvme_rdma_stop_queue(&ctrl->queues[0]); if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); } + nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); nvme_rdma_free_queue(&ctrl->queues[0]); } @@ -755,11 +750,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); + error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + if (error) + goto out_free_queue; + if (new) { ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); if (IS_ERR(ctrl->ctrl.admin_tagset)) { error = PTR_ERR(ctrl->ctrl.admin_tagset); - goto out_free_queue; + goto out_free_async_qe; } ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); @@ -795,12 +795,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) goto out_stop_queue; - error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, - &ctrl->async_event_sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); - if (error) - goto out_stop_queue; - return 0; out_stop_queue: @@ -811,6 +805,9 @@ out_cleanup_queue: out_free_tagset: if (new) nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); +out_free_async_qe: + nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; @@ -819,7 +816,6 @@ out_free_queue: static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { - nvme_rdma_stop_io_queues(ctrl); if (remove) { blk_cleanup_queue(ctrl->ctrl.connect_q); nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); @@ -888,9 +884,9 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) list_del(&ctrl->list); mutex_unlock(&nvme_rdma_ctrl_mutex); - kfree(ctrl->queues); nvmf_free_options(nctrl->opts); free_ctrl: + kfree(ctrl->queues); kfree(ctrl); } @@ -949,6 +945,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) return; destroy_admin: + nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_rdma_destroy_admin_queue(ctrl, false); requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", @@ -965,12 +962,14 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, false); } blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl, false); @@ -1189,21 +1188,38 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (unlikely(count <= 0)) { - sg_free_table_chained(&req->sg_table, true); - return -EIO; + ret = -EIO; + goto out_free_table; } if (count == 1) { if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && blk_rq_payload_bytes(rq) <= - nvme_rdma_inline_data_size(queue)) - return nvme_rdma_map_sg_inline(queue, req, c); + nvme_rdma_inline_data_size(queue)) { + ret = nvme_rdma_map_sg_inline(queue, req, c); + goto out; + } - if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) - return nvme_rdma_map_sg_single(queue, req, c); + if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + ret = nvme_rdma_map_sg_single(queue, req, c); + goto out; + } } - return nvme_rdma_map_sg_fr(queue, req, c, count); + ret = nvme_rdma_map_sg_fr(queue, req, c, count); +out: + if (unlikely(ret)) + goto out_unmap_sg; + + return 0; + +out_unmap_sg: + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, + req->nents, rq_data_dir(rq) == + WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); +out_free_table: + sg_free_table_chained(&req->sg_table, true); + return ret; } static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1613,15 +1629,14 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_rdma_qe *sqe = &req->sqe; struct nvme_command *c = sqe->data; struct ib_device *dev; + bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); blk_status_t ret; int err; WARN_ON_ONCE(rq->tag < 0); - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, - test_bit(NVME_RDMA_Q_LIVE, &queue->flags), true); - if (unlikely(ret)) - return ret; + if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvmf_fail_nonready_command(rq); dev = queue->device->dev; ib_dma_sync_single_for_cpu(dev, sqe->dma, @@ -1720,6 +1735,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, shutdown); @@ -1731,6 +1747,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); @@ -1916,11 +1933,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, goto out_free_ctrl; } - ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, - 0 /* no quirks, we're perfect! */); - if (ret) - goto out_free_ctrl; - INIT_DELAYED_WORK(&ctrl->reconnect_work, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); @@ -1934,14 +1946,19 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), GFP_KERNEL); if (!ctrl->queues) - goto out_uninit_ctrl; + goto out_free_ctrl; + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_kfree_queues; changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); WARN_ON_ONCE(!changed); ret = nvme_rdma_configure_admin_queue(ctrl, true); if (ret) - goto out_kfree_queues; + goto out_uninit_ctrl; /* sanity check icdoff */ if (ctrl->ctrl.icdoff) { @@ -1958,20 +1975,19 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, goto out_remove_admin_queue; } - if (opts->queue_size > ctrl->ctrl.maxcmd) { - /* warn if maxcmd is lower than queue_size */ - dev_warn(ctrl->ctrl.device, - "queue_size %zu > ctrl maxcmd %u, clamping down\n", - opts->queue_size, ctrl->ctrl.maxcmd); - opts->queue_size = ctrl->ctrl.maxcmd; - } - + /* only warn if argument is too large here, will clamp later */ if (opts->queue_size > ctrl->ctrl.sqsize + 1) { - /* warn if sqsize is lower than queue_size */ dev_warn(ctrl->ctrl.device, "queue_size %zu > ctrl sqsize %u, clamping down\n", opts->queue_size, ctrl->ctrl.sqsize + 1); - opts->queue_size = ctrl->ctrl.sqsize + 1; + } + + /* warn if maxcmd is lower than sqsize+1 */ + if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { + dev_warn(ctrl->ctrl.device, + "sqsize %u > ctrl maxcmd %u, clamping down\n", + ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); + ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; } if (opts->nr_io_queues) { @@ -1997,15 +2013,16 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, return &ctrl->ctrl; out_remove_admin_queue: + nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_rdma_destroy_admin_queue(ctrl, true); -out_kfree_queues: - kfree(ctrl->queues); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); +out_kfree_queues: + kfree(ctrl->queues); out_free_ctrl: kfree(ctrl); return ERR_PTR(ret); |