diff options
Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r-- | drivers/nvme/host/core.c | 502 |
1 files changed, 366 insertions, 136 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c258a1ce4b28..ada59df642d2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -22,12 +22,12 @@ #include <linux/pm_qos.h> #include <asm/unaligned.h> -#define CREATE_TRACE_POINTS -#include "trace.h" - #include "nvme.h" #include "fabrics.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + #define NVME_MINORS (1U << MINORBITS) unsigned int admin_timeout = 60; @@ -66,8 +66,8 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); * nvme_reset_wq - hosts nvme reset works * nvme_delete_wq - hosts nvme delete works * - * nvme_wq will host works such are scan, aen handling, fw activation, - * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq + * nvme_wq will host works such as scan, aen handling, fw activation, + * keep-alive, periodic reconnects etc. nvme_reset_wq * runs reset works which also flush works hosted on nvme_wq for * serialization purposes. nvme_delete_wq host controller deletion * works which flush reset works for serialization. 
@@ -81,7 +81,6 @@ EXPORT_SYMBOL_GPL(nvme_reset_wq); struct workqueue_struct *nvme_delete_wq; EXPORT_SYMBOL_GPL(nvme_delete_wq); -static DEFINE_IDA(nvme_subsystems_ida); static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); @@ -103,10 +102,13 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) */ if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; - revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); /* Forcibly unquiesce queues to avoid blocking dispatch */ blk_mq_unquiesce_queue(ns->queue); + /* + * Revalidate after unblocking dispatchers that may be holding bd_mutex + */ + revalidate_disk(ns->disk); } static void nvme_queue_scan(struct nvme_ctrl *ctrl) @@ -114,10 +116,26 @@ static void nvme_queue_scan(struct nvme_ctrl *ctrl) /* * Only new queue scan work when admin and IO queues are both alive */ - if (ctrl->state == NVME_CTRL_LIVE) + if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset) queue_work(nvme_wq, &ctrl->scan_work); } +/* + * Use this function to proceed with scheduling reset_work for a controller + * that had previously been set to the resetting state. This is intended for + * code paths that can't be interrupted by other reset attempts. A hot removal + * may prevent this from succeeding. 
+ */ +int nvme_try_sched_reset(struct nvme_ctrl *ctrl) +{ + if (ctrl->state != NVME_CTRL_RESETTING) + return -EBUSY; + if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_try_sched_reset); + int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) @@ -135,8 +153,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) ret = nvme_reset_ctrl(ctrl); if (!ret) { flush_work(&ctrl->reset_work); - if (ctrl->state != NVME_CTRL_LIVE && - ctrl->state != NVME_CTRL_ADMIN_ONLY) + if (ctrl->state != NVME_CTRL_LIVE) ret = -ENETRESET; } @@ -197,14 +214,16 @@ static inline bool nvme_ns_has_pi(struct nvme_ns *ns) return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); } -static blk_status_t nvme_error_status(struct request *req) +static blk_status_t nvme_error_status(u16 status) { - switch (nvme_req(req)->status & 0x7ff) { + switch (status & 0x7ff) { case NVME_SC_SUCCESS: return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: return BLK_STS_NOSPC; case NVME_SC_LBA_RANGE: + case NVME_SC_CMD_INTERRUPTED: + case NVME_SC_NS_NOT_READY: return BLK_STS_TARGET; case NVME_SC_BAD_ATTRIBUTES: case NVME_SC_ONCS_NOT_SUPPORTED: @@ -226,6 +245,8 @@ static blk_status_t nvme_error_status(struct request *req) return BLK_STS_PROTECTION; case NVME_SC_RESERVATION_CONFLICT: return BLK_STS_NEXUS; + case NVME_SC_HOST_PATH_ERROR: + return BLK_STS_TRANSPORT; default: return BLK_STS_IOERR; } @@ -260,10 +281,12 @@ static void nvme_retry_req(struct request *req) void nvme_complete_rq(struct request *req) { - blk_status_t status = nvme_error_status(req); + blk_status_t status = nvme_error_status(nvme_req(req)->status); trace_nvme_complete_rq(req); + nvme_cleanup_cmd(req); + if (nvme_req(req)->ctrl->kas) nvme_req(req)->ctrl->comp_seen = true; @@ -279,6 +302,8 @@ void nvme_complete_rq(struct request *req) return; } } + + nvme_trace_bio_complete(req, status); blk_mq_end_request(req, status); } 
EXPORT_SYMBOL_GPL(nvme_complete_rq); @@ -288,8 +313,12 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); - nvme_req(req)->status = NVME_SC_ABORT_REQ; - blk_mq_complete_request_sync(req); + /* don't abort one completed request */ + if (blk_mq_request_completed(req)) + return true; + + nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(req); return true; } EXPORT_SYMBOL_GPL(nvme_cancel_request); @@ -305,15 +334,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, old_state = ctrl->state; switch (new_state) { - case NVME_CTRL_ADMIN_ONLY: - switch (old_state) { - case NVME_CTRL_CONNECTING: - changed = true; - /* FALLTHRU */ - default: - break; - } - break; case NVME_CTRL_LIVE: switch (old_state) { case NVME_CTRL_NEW: @@ -329,7 +349,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: - case NVME_CTRL_ADMIN_ONLY: changed = true; /* FALLTHRU */ default: @@ -349,7 +368,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_DELETING: switch (old_state) { case NVME_CTRL_LIVE: - case NVME_CTRL_ADMIN_ONLY: case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; @@ -371,8 +389,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } - if (changed) + if (changed) { ctrl->state = new_state; + wake_up_all(&ctrl->state_wq); + } spin_unlock_irqrestore(&ctrl->lock, flags); if (changed && ctrl->state == NVME_CTRL_LIVE) @@ -381,6 +401,39 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); +/* + * Returns true for sink states that can't ever transition back to live. 
+ */ +static bool nvme_state_terminal(struct nvme_ctrl *ctrl) +{ + switch (ctrl->state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + return false; + case NVME_CTRL_DELETING: + case NVME_CTRL_DEAD: + return true; + default: + WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state); + return true; + } +} + +/* + * Waits for the controller state to be resetting, or returns false if it is + * not possible to ever transition to that state. + */ +bool nvme_wait_reset(struct nvme_ctrl *ctrl) +{ + wait_event(ctrl->state_wq, + nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) || + nvme_state_terminal(ctrl)); + return ctrl->state == NVME_CTRL_RESETTING; +} +EXPORT_SYMBOL_GPL(nvme_wait_reset); + static void nvme_free_ns_head(struct kref *ref) { struct nvme_ns_head *head = @@ -562,8 +615,14 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_dsm_range *range; struct bio *bio; - range = kmalloc_array(segments, sizeof(*range), - GFP_ATOMIC | __GFP_NOWARN); + /* + * Some devices do not consider the DSM 'Number of Ranges' field when + * determining how much data to DMA. Always allocate memory for maximum + * number of segments to prevent device reading beyond end of buffer. 
+ */ + static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES; + + range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN); if (!range) { /* * If we fail allocation our range, fallback to the controller @@ -577,7 +636,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, } __rq_for_each_bio(bio, req) { - u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); + u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; if (n < segments) { @@ -603,7 +662,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, req->special_vec.bv_page = virt_to_page(range); req->special_vec.bv_offset = offset_in_page(range); - req->special_vec.bv_len = sizeof(*range) * segments; + req->special_vec.bv_len = alloc_size; req->rq_flags |= RQF_SPECIAL_PAYLOAD; return BLK_STS_OK; @@ -618,7 +677,7 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); cmnd->write_zeroes.slba = - cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->write_zeroes.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); cmnd->write_zeroes.control = 0; @@ -642,7 +701,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) @@ -659,8 +718,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) return BLK_STS_NOTSUPP; control |= NVME_RW_PRINFO_PRACT; - } else if (req_op(req) == REQ_OP_WRITE) { - t10_pi_prepare(req, ns->pi_type); } switch (ns->pi_type) { @@ -683,13 +740,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, void nvme_cleanup_cmd(struct request *req) { - if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && - nvme_req(req)->status == 0) { - struct nvme_ns *ns = req->rq_disk->private_data; - - t10_pi_complete(req, ns->pi_type, - blk_rq_bytes(req) >> ns->lba_shift); - } if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { struct nvme_ns *ns = req->rq_disk->private_data; struct page *page = req->special_vec.bv_page; @@ -849,7 +899,7 @@ out: static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u32 *result, unsigned timeout) + u32 meta_seed, u64 *result, unsigned timeout) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -890,7 +940,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, else ret = nvme_req(req)->status; if (result) - *result = le32_to_cpu(nvme_req(req)->result.u32); + *result = le64_to_cpu(nvme_req(req)->result.u64); if (meta && !ret && !write) { if (copy_to_user(meta_buffer, meta, meta_len)) ret = -EFAULT; @@ -926,7 +976,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) startka = true; spin_unlock_irqrestore(&ctrl->lock, flags); if (startka) - 
schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); } static int nvme_keep_alive(struct nvme_ctrl *ctrl) @@ -956,7 +1006,7 @@ static void nvme_keep_alive_work(struct work_struct *work) dev_dbg(ctrl->device, "reschedule traffic based keep-alive timer\n"); ctrl->comp_seen = false; - schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); return; } @@ -973,7 +1023,7 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) if (unlikely(ctrl->kato == 0)) return; - schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); } void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) @@ -1088,10 +1138,9 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n NVME_IDENTIFY_DATA_SIZE); } -static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, - unsigned nsid) +static int nvme_identify_ns(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns **id) { - struct nvme_id_ns *id; struct nvme_command c = { }; int error; @@ -1100,18 +1149,17 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, c.identify.nsid = cpu_to_le32(nsid); c.identify.cns = NVME_ID_CNS_NS; - id = kmalloc(sizeof(*id), GFP_KERNEL); - if (!id) - return NULL; + *id = kmalloc(sizeof(**id), GFP_KERNEL); + if (!*id) + return -ENOMEM; - error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); - kfree(id); - return NULL; + kfree(*id); } - return id; + return error; } static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, @@ -1180,7 +1228,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) EXPORT_SYMBOL_GPL(nvme_set_queue_count); #define NVME_AEN_SUPPORTED \ - (NVME_AEN_CFG_NS_ATTR | 
NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE) + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \ + NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE) static void nvme_enable_aen(struct nvme_ctrl *ctrl) { @@ -1195,6 +1244,8 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) if (status) dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", supported_aens); + + queue_work(nvme_wq, &ctrl->async_event_work); } static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) @@ -1304,8 +1355,6 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl) if (ns->disk && nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); up_read(&ctrl->namespaces_rwsem); - - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); } static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) @@ -1321,6 +1370,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); mutex_unlock(&ctrl->subsys->lock); + nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); mutex_unlock(&ctrl->scan_lock); } if (effects & NVME_CMD_EFFECTS_CCC) @@ -1336,6 +1386,54 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_command c; unsigned timeout = 0; u32 effects; + u64 result; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + 
effects = nvme_passthru_start(ctrl, ns, cmd.opcode); + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, + (void __user *)(uintptr_t)cmd.metadata, + cmd.metadata_len, 0, &result, timeout); + nvme_passthru_end(ctrl, effects); + + if (status >= 0) { + if (put_user(result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd64 __user *ucmd) +{ + struct nvme_passthru_cmd64 cmd; + struct nvme_command c; + unsigned timeout = 0; + u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1406,6 +1504,41 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) srcu_read_unlock(&head->srcu, idx); } +static bool is_ctrl_ioctl(unsigned int cmd) +{ + if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) + return true; + if (is_sed_ioctl(cmd)) + return true; + return false; +} + +static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp, + struct nvme_ns_head *head, + int srcu_idx) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + nvme_get_ctrl(ns->ctrl); + nvme_put_ns_from_disk(head, srcu_idx); + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + ret = nvme_user_cmd(ctrl, NULL, argp); + break; + case NVME_IOCTL_ADMIN64_CMD: + ret = nvme_user_cmd64(ctrl, NULL, argp); + break; + default: + ret = sed_ioctl(ctrl->opal_dev, cmd, argp); + break; + } + nvme_put_ctrl(ctrl); + return ret; +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1423,20 +1556,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, * seperately and drop the ns SRCU reference early. This avoids a * deadlock when deleting namespaces using the passthrough interface. 
*/ - if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) { - struct nvme_ctrl *ctrl = ns->ctrl; - - nvme_get_ctrl(ns->ctrl); - nvme_put_ns_from_disk(head, srcu_idx); - - if (cmd == NVME_IOCTL_ADMIN_CMD) - ret = nvme_user_cmd(ctrl, NULL, argp); - else - ret = sed_ioctl(ctrl->opal_dev, cmd, argp); - - nvme_put_ctrl(ctrl); - return ret; - } + if (is_ctrl_ioctl(cmd)) + return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); switch (cmd) { case NVME_IOCTL_ID: @@ -1449,6 +1570,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, case NVME_IOCTL_SUBMIT_IO: ret = nvme_submit_io(ns, argp); break; + case NVME_IOCTL_IO64_CMD: + ret = nvme_user_cmd64(ns->ctrl, ns, argp); + break; default: if (ns->ndev) ret = nvme_nvm_ioctl(ns, cmd, arg); @@ -1533,7 +1657,7 @@ static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) static void nvme_set_chunk_size(struct nvme_ns *ns) { - u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9)); + u32 chunk_size = nvme_lba_to_sect(ns, ns->noiob); blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); } @@ -1570,8 +1694,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) { - u32 max_sectors; - unsigned short bs = 1 << ns->lba_shift; + u64 max_blocks; if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) || (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) @@ -1587,16 +1710,19 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) * nvme_init_identify() if available. 
*/ if (ns->ctrl->max_hw_sectors == UINT_MAX) - max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9; + max_blocks = (u64)USHRT_MAX + 1; else - max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9; + max_blocks = ns->ctrl->max_hw_sectors + 1; - blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors); + blk_queue_max_write_zeroes_sectors(disk->queue, + nvme_lba_to_sect(ns, max_blocks)); } -static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, +static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { + int ret = 0; + memset(ids, 0, sizeof(*ids)); if (ctrl->vs >= NVME_VS(1, 1, 0)) @@ -1607,10 +1733,14 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, /* Don't treat error as fatal we potentially * already have a NGUID or EUI-64 */ - if (nvme_identify_ns_descs(ctrl, nsid, ids)) + ret = nvme_identify_ns_descs(ctrl, nsid, ids); + if (ret) dev_warn(ctrl->device, - "%s: Identify Descriptors failed\n", __func__); + "Identify Descriptors failed (%d)\n", ret); + if (ret > 0) + ret = 0; } + return ret; } static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) @@ -1630,7 +1760,7 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { - sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9); + sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze)); unsigned short bs = 1 << ns->lba_shift; u32 atomic_bs, phys_bs, io_opt; @@ -1738,25 +1868,37 @@ static int nvme_revalidate_disk(struct gendisk *disk) return -ENODEV; } - id = nvme_identify_ns(ctrl, ns->head->ns_id); - if (!id) - return -ENODEV; + ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id); + if (ret) + goto out; if (id->ncap == 0) { ret = -ENODEV; - goto out; + goto free_id; } __nvme_revalidate_disk(disk, id); - nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + ret 
= nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + if (ret) + goto free_id; + if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { dev_err(ctrl->device, "identifiers changed for nsid %d\n", ns->head->ns_id); ret = -ENODEV; } -out: +free_id: kfree(id); +out: + /* + * Only fail the function if we got a fatal error back from the + * device, otherwise ignore the error and just move on. + */ + if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR))) + ret = 0; + else if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -1952,7 +2094,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) * bits', but doing so may cause the device to complete commands to the * admin queue ... and we don't know what memory that might be pointing at! */ -int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +int nvme_disable_ctrl(struct nvme_ctrl *ctrl) { int ret; @@ -1966,20 +2108,27 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); - return nvme_wait_ready(ctrl, cap, false); + return nvme_wait_ready(ctrl, ctrl->cap, false); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); -int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { /* * Default to a 4K page size, with the intention to update this * path in the future to accomodate architectures with differing * kernel and IO page sizes. 
*/ - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + unsigned dev_page_min, page_shift = 12; int ret; + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); + if (ret) { + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); + return ret; + } + dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; + if (page_shift < dev_page_min) { dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", @@ -1998,7 +2147,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - return nvme_wait_ready(ctrl, cap, true); + return nvme_wait_ready(ctrl, ctrl->cap, true); } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); @@ -2257,6 +2406,16 @@ static const struct nvme_core_quirk_entry core_quirks[] = { .vid = 0x1179, .mn = "THNSF5256GPUK TOSHIBA", .quirks = NVME_QUIRK_NO_APST, + }, + { + /* + * This LiteON CL1-3D*-Q11 firmware version has a race + * condition associated with actions related to suspend to idle + * LiteON has resolved the problem in future firmware + */ + .vid = 0x14a4, + .fr = "22301111", + .quirks = NVME_QUIRK_SIMPLE_SUSPEND, } }; @@ -2322,7 +2481,8 @@ static void nvme_release_subsystem(struct device *dev) struct nvme_subsystem *subsys = container_of(dev, struct nvme_subsystem, dev); - ida_simple_remove(&nvme_subsystems_ida, subsys->instance); + if (subsys->instance >= 0) + ida_simple_remove(&nvme_instance_ida, subsys->instance); kfree(subsys); } @@ -2351,6 +2511,17 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) lockdep_assert_held(&nvme_subsystems_lock); + /* + * Fail matches for discovery subsystems. This results + * in each discovery controller bound to a unique subsystem. + * This avoids issues with validating controller values + * that can only be true when there is a single unique subsystem. + * There may be multiple and completely independent entities + * that provide discovery controllers. 
+ */ + if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) + return NULL; + list_for_each_entry(subsys, &nvme_subsystems, entry) { if (strcmp(subsys->subnqn, subsysnqn)) continue; @@ -2451,12 +2622,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); if (!subsys) return -ENOMEM; - ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); - if (ret < 0) { - kfree(subsys); - return ret; - } - subsys->instance = ret; + + subsys->instance = -1; mutex_init(&subsys->lock); kref_init(&subsys->ref); INIT_LIST_HEAD(&subsys->ctrls); @@ -2475,7 +2642,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; subsys->dev.groups = nvme_subsys_attrs_groups; - dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); + dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); device_initialize(&subsys->dev); mutex_lock(&nvme_subsystems_lock); @@ -2500,13 +2667,16 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) list_add_tail(&subsys->entry, &nvme_subsystems); } - if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, - dev_name(ctrl->device))) { + ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, + dev_name(ctrl->device)); + if (ret) { dev_err(ctrl->device, "failed to create sysfs link from subsystem.\n"); goto out_put_subsystem; } + if (!found) + subsys->instance = ctrl->instance; ctrl->subsys = subsys; list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); mutex_unlock(&nvme_subsystems_lock); @@ -2564,7 +2734,6 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl) int nvme_init_identify(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; - u64 cap; int ret, page_shift; u32 max_hw_sectors; bool prev_apst_enabled; @@ -2574,16 +2743,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) dev_err(ctrl->device, "Reading VS failed 
(%d)\n", ret); return ret; } - - ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); - if (ret) { - dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); - return ret; - } - page_shift = NVME_CAP_MPSMIN(cap) + 12; + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); if (ctrl->vs >= NVME_VS(1, 1, 0)) - ctrl->subsystem = NVME_CAP_NSSRC(cap); + ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); ret = nvme_identify_ctrl(ctrl, &id); if (ret) { @@ -2597,6 +2761,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) goto out_free; } + if (!(ctrl->ops->flags & NVME_F_FABRICS)) + ctrl->cntlid = le16_to_cpu(id->cntlid); + if (!ctrl->identified) { int i; @@ -2631,6 +2798,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->oncs = le16_to_cpu(id->oncs); ctrl->mtfa = le16_to_cpu(id->mtfa); ctrl->oaes = le32_to_cpu(id->oaes); + ctrl->wctemp = le16_to_cpu(id->wctemp); + ctrl->cctemp = le16_to_cpu(id->cctemp); + atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; if (id->mdts) @@ -2686,6 +2856,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) * admin connect */ if (ctrl->cntlid != le16_to_cpu(id->cntlid)) { + dev_err(ctrl->device, + "Mismatching cntlid: Connect %u vs Identify " + "%u, rejecting\n", + ctrl->cntlid, le16_to_cpu(id->cntlid)); ret = -EINVAL; goto out_free; } @@ -2697,7 +2871,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) goto out_free; } } else { - ctrl->cntlid = le16_to_cpu(id->cntlid); ctrl->hmpre = le32_to_cpu(id->hmpre); ctrl->hmmin = le32_to_cpu(id->hmmin); ctrl->hmminds = le32_to_cpu(id->hmminds); @@ -2731,6 +2904,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (ret < 0) return ret; + if (!ctrl->identified) + nvme_hwmon_init(ctrl); + ctrl->identified = true; return 0; @@ -2748,7 +2924,6 @@ static int nvme_dev_open(struct inode *inode, struct file *file) switch (ctrl->state) { case NVME_CTRL_LIVE: - case NVME_CTRL_ADMIN_ONLY: break; default: return -EWOULDBLOCK; @@ 
-2800,6 +2975,8 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: @@ -2819,7 +2996,7 @@ static const struct file_operations nvme_dev_fops = { .owner = THIS_MODULE, .open = nvme_dev_open, .unlocked_ioctl = nvme_dev_ioctl, - .compat_ioctl = nvme_dev_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; static ssize_t nvme_sysfs_reset(struct device *dev, @@ -3007,6 +3184,8 @@ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); nvme_show_int_function(cntlid); nvme_show_int_function(numa_node); +nvme_show_int_function(queue_count); +nvme_show_int_function(sqsize); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -3038,7 +3217,6 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, static const char *const state_name[] = { [NVME_CTRL_NEW] = "new", [NVME_CTRL_LIVE] = "live", - [NVME_CTRL_ADMIN_ONLY] = "only-admin", [NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_CONNECTING] = "connecting", [NVME_CTRL_DELETING] = "deleting", @@ -3087,6 +3265,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_address.attr, &dev_attr_state.attr, &dev_attr_numa_node.attr, + &dev_attr_queue_count.attr, + &dev_attr_sqsize.attr, NULL }; @@ -3172,7 +3352,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->ns_id = nsid; kref_init(&head->ref); - nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + if (ret) + goto out_cleanup_srcu; ret = __nvme_check_ids(ctrl->subsys, head); if (ret) { @@ -3197,6 +3379,8 @@ out_ida_remove: out_free_head: kfree(head); out: + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ERR_PTR(ret); } @@ -3220,7 +3404,10 @@ static int 
nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, } else { struct nvme_ns_ids ids; - nvme_report_ns_ids(ctrl, nsid, id, &ids); + ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (ret) + goto out_unlock; + if (!nvme_ns_ids_equal(&head->ids, &ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", @@ -3235,6 +3422,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, out_unlock: mutex_unlock(&ctrl->subsys->lock); + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -3326,11 +3515,9 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); - id = nvme_identify_ns(ctrl, nsid); - if (!id) { - ret = -EIO; + ret = nvme_identify_ns(ctrl, nsid, &id); + if (ret) goto out_free_queue; - } if (id->ncap == 0) { ret = -EINVAL; @@ -3392,6 +3579,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -3538,11 +3727,10 @@ static void nvme_scan_work(struct work_struct *work) struct nvme_id_ctrl *id; unsigned nn; - if (ctrl->state != NVME_CTRL_LIVE) + /* No tagset on a live ctrl means IO queues could not be created */ + if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) return; - WARN_ON_ONCE(!ctrl->tagset); - if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { dev_info(ctrl->device, "rescanning namespaces.\n"); nvme_clear_changed_ns_log(ctrl); @@ -3605,6 +3793,33 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ret; + + ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name); 
+ if (ret) + return ret; + + if (opts) { + ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_TRSVCID=%s", + opts->trsvcid ?: "none"); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", + opts->host_traddr ?: "none"); + } + return ret; +} + static void nvme_aen_uevent(struct nvme_ctrl *ctrl) { char *envp[2] = { NULL, NULL }; @@ -3652,7 +3867,7 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log, + if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log, sizeof(*log), 0)) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); @@ -3676,13 +3891,13 @@ static void nvme_fw_act_work(struct work_struct *work) if (time_after(jiffies, fw_act_timeout)) { dev_warn(ctrl->device, "Fw activation timeout, reset controller\n"); - nvme_reset_ctrl(ctrl); - break; + nvme_try_sched_reset(ctrl); + return; } msleep(100); } - if (ctrl->state != NVME_CTRL_LIVE) + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) return; nvme_start_queues(ctrl); @@ -3702,7 +3917,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) nvme_queue_scan(ctrl); break; case NVME_AER_NOTICE_FW_ACT_STARTING: - queue_work(nvme_wq, &ctrl->fw_act_work); + /* + * We are (ab)using the RESETTING state to prevent subsequent + * recovery actions from interfering with the controller's + * firmware activation. 
+ */ + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + queue_work(nvme_wq, &ctrl->fw_act_work); break; #ifdef CONFIG_NVME_MULTIPATH case NVME_AER_NOTICE_ANA: @@ -3711,6 +3932,9 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) queue_work(nvme_wq, &ctrl->ana_work); break; #endif + case NVME_AER_NOTICE_DISC_CHANGED: + ctrl->aen_result = result; + break; default: dev_warn(ctrl->device, "async event result %08x\n", result); } @@ -3757,10 +3981,10 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->kato) nvme_start_keep_alive(ctrl); + nvme_enable_aen(ctrl); + if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_enable_aen(ctrl); - queue_work(nvme_wq, &ctrl->async_event_work); nvme_start_queues(ctrl); } } @@ -3780,7 +4004,9 @@ static void nvme_free_ctrl(struct device *dev) container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; - ida_simple_remove(&nvme_instance_ida, ctrl->instance); + if (subsys && ctrl->instance != subsys->instance) + ida_simple_remove(&nvme_instance_ida, ctrl->instance); + kfree(ctrl->effects); nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -3820,6 +4046,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); + init_waitqueue_head(&ctrl->state_wq); INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); @@ -3980,6 +4207,9 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) list_for_each_entry(ns, &ctrl->namespaces, list) blk_sync_queue(ns->queue); up_read(&ctrl->namespaces_rwsem); + + if (ctrl->admin_q) + blk_sync_queue(ctrl->admin_q); } EXPORT_SYMBOL_GPL(nvme_sync_queues); @@ -4038,6 +4268,7 @@ static int __init nvme_core_init(void) result = PTR_ERR(nvme_class); goto unregister_chrdev; } + nvme_class->dev_uevent = nvme_class_uevent; 
nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); if (IS_ERR(nvme_subsys_class)) { @@ -4062,7 +4293,6 @@ out: static void __exit nvme_core_exit(void) { - ida_destroy(&nvme_subsystems_ida); class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); |