diff options
68 files changed, 7340 insertions, 2832 deletions
diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex index c06233fe52ac..8f85b0e41046 100644 --- a/Documentation/cdrom/cdrom-standard.tex +++ b/Documentation/cdrom/cdrom-standard.tex @@ -249,7 +249,6 @@ struct& cdrom_device_ops\ \{ \hidewidth\cr unsigned\ long);\cr \noalign{\medskip} &const\ int& capability;& capability flags \cr - &int& n_minors;& number of active minor devices \cr \};\cr } $$ @@ -258,13 +257,7 @@ it should add a function pointer to this $struct$. When a particular function is not implemented, however, this $struct$ should contain a NULL instead. The $capability$ flags specify the capabilities of the \cdrom\ hardware and/or low-level \cdrom\ driver when a \cdrom\ drive -is registered with the \UCD. The value $n_minors$ should be a positive -value indicating the number of minor devices that are supported by -the low-level device driver, normally~1. Although these two variables -are `informative' rather than `operational,' they are included in -$cdrom_device_ops$ because they describe the capability of the {\em -driver\/} rather than the {\em drive}. Nomenclature has always been -difficult in computer programming. +is registered with the \UCD. Note that most functions have fewer parameters than their $blkdev_fops$ counterparts. 
This is because very little of the diff --git a/MAINTAINERS b/MAINTAINERS index 527d13759ecc..864e1fd31f0c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8612,10 +8612,10 @@ S: Maintained F: drivers/net/ethernet/netronome/ NETWORK BLOCK DEVICE (NBD) -M: Markus Pargmann <mpa@pengutronix.de> +M: Josef Bacik <jbacik@fb.com> S: Maintained +L: linux-block@vger.kernel.org L: nbd-general@lists.sourceforge.net -T: git git://git.pengutronix.de/git/mpa/linux-nbd.git F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c F: include/uapi/linux/nbd.h @@ -11089,6 +11089,17 @@ L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/sdhci-spear.c +SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER +M: Scott Bauer <scott.bauer@intel.com> +M: Jonathan Derrick <jonathan.derrick@intel.com> +M: Rafael Antognolli <rafael.antognolli@intel.com> +L: linux-block@vger.kernel.org +S: Supported +F: block/sed* +F: block/opal_proto.h +F: include/linux/sed* +F: include/uapi/linux/sed* + SECURITY SUBSYSTEM M: James Morris <james.l.morris@oracle.com> M: "Serge E. Hallyn" <serge@hallyn.com> diff --git a/block/Kconfig b/block/Kconfig index 8bf114a3858a..1aef809affae 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -147,6 +147,25 @@ config BLK_WBT_MQ Multiqueue currently doesn't have support for IO scheduling, enabling this option is recommended. +config BLK_DEBUG_FS + bool "Block layer debugging information in debugfs" + default y + depends on DEBUG_FS + ---help--- + Include block layer debugging information in debugfs. This information + is mostly useful for kernel developers, but it doesn't incur any cost + at runtime. + + Unless you are building a kernel for a tiny system, you should + say Y here. + +config BLK_SED_OPAL + bool "Logic for interfacing with Opal enabled SEDs" + ---help--- + Builds Logic for interfacing with Opal enabled controllers. + Enabling this option enables users to setup/unlock/lock + Locking ranges for SED devices using the Opal protocol. 
+ menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 421bef9c4c48..0715ce93daef 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -63,6 +63,56 @@ config DEFAULT_IOSCHED default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP +config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y + ---help--- + MQ version of the deadline IO scheduler. + +config MQ_IOSCHED_NONE + bool + default y + +choice + prompt "Default single-queue blk-mq I/O scheduler" + default DEFAULT_SQ_NONE + help + Select the I/O scheduler which will be used by default for blk-mq + managed block devices with a single queue. + + config DEFAULT_SQ_DEADLINE + bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y + + config DEFAULT_SQ_NONE + bool "None" + +endchoice + +config DEFAULT_SQ_IOSCHED + string + default "mq-deadline" if DEFAULT_SQ_DEADLINE + default "none" if DEFAULT_SQ_NONE + +choice + prompt "Default multi-queue blk-mq I/O scheduler" + default DEFAULT_MQ_NONE + help + Select the I/O scheduler which will be used by default for blk-mq + managed block devices with multiple queues. 
+ + config DEFAULT_MQ_DEADLINE + bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y + + config DEFAULT_MQ_NONE + bool "None" + +endchoice + +config DEFAULT_MQ_IOSCHED + string + default "mq-deadline" if DEFAULT_MQ_DEADLINE + default "none" if DEFAULT_MQ_NONE + endmenu endif diff --git a/block/Makefile b/block/Makefile index a827f988c4e6..6ba1b1bc9529 100644 --- a/block/Makefile +++ b/block/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ - blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \ + blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o @@ -25,3 +26,5 @@ obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o +obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o +obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o diff --git a/block/bio.c b/block/bio.c index 2b375020fc49..d3c26d1cb1da 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1403,7 +1403,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, bio_set_flag(bio, BIO_USER_MAPPED); /* - * subtle -- if __bio_map_user() ended up bouncing a bio, + * subtle -- if bio_map_user_iov() ended up bouncing a bio, * it would normally disappear when its bi_end_io is run. 
* however, we need it for the unmap, so grab an extra * reference to it @@ -1445,8 +1445,8 @@ static void __bio_unmap_user(struct bio *bio) * bio_unmap_user - unmap a bio * @bio: the bio being unmapped * - * Unmap a bio previously mapped by bio_map_user(). Must be called with - * a process context. + * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from + * process context. * * bio_unmap_user() may sleep. */ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8ba0af780e88..fb59a3edc778 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1223,7 +1223,10 @@ int blkcg_activate_policy(struct request_queue *q, if (blkcg_policy_enabled(q, pol)) return 0; - blk_queue_bypass_start(q); + if (q->mq_ops) + blk_mq_freeze_queue(q); + else + blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); @@ -1261,7 +1264,10 @@ pd_prealloc: spin_unlock_irq(q->queue_lock); out_bypass_end: - blk_queue_bypass_end(q); + if (q->mq_ops) + blk_mq_unfreeze_queue(q); + else + blk_queue_bypass_end(q); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; @@ -1284,7 +1290,11 @@ void blkcg_deactivate_policy(struct request_queue *q, if (!blkcg_policy_enabled(q, pol)) return; - blk_queue_bypass_start(q); + if (q->mq_ops) + blk_mq_freeze_queue(q); + else + blk_queue_bypass_start(q); + spin_lock_irq(q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); @@ -1304,7 +1314,11 @@ void blkcg_deactivate_policy(struct request_queue *q, } spin_unlock_irq(q->queue_lock); - blk_queue_bypass_end(q); + + if (q->mq_ops) + blk_mq_unfreeze_queue(q); + else + blk_queue_bypass_end(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); diff --git a/block/blk-core.c b/block/blk-core.c index 61ba08c58b64..b2df55a65250 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-mq-sched.h" #include "blk-wbt.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); @@ -134,6 
+135,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->cmd = rq->__cmd; rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; + rq->internal_tag = -1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -525,12 +527,14 @@ void blk_set_queue_dying(struct request_queue *q) else { struct request_list *rl; + spin_lock_irq(q->queue_lock); blk_queue_for_each_rl(rl, q) { if (rl->rq_pool) { wake_up(&rl->wait[BLK_RW_SYNC]); wake_up(&rl->wait[BLK_RW_ASYNC]); } } + spin_unlock_irq(q->queue_lock); } } EXPORT_SYMBOL_GPL(blk_set_queue_dying); @@ -1033,29 +1037,13 @@ static bool blk_rq_should_init_elevator(struct bio *bio) * Flush requests do not use the elevator so skip initialization. * This allows a request to share the flush and elevator data. */ - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(bio->bi_opf)) return false; return true; } /** - * rq_ioc - determine io_context for request allocation - * @bio: request being allocated is for this bio (can be %NULL) - * - * Determine io_context to use for request allocation for @bio. May return - * %NULL if %current->io_context doesn't exist. - */ -static struct io_context *rq_ioc(struct bio *bio) -{ -#ifdef CONFIG_BLK_CGROUP - if (bio && bio->bi_ioc) - return bio->bi_ioc; -#endif - return current->io_context; -} - -/** * __get_request - get a free request * @rl: request list to allocate from * @op: operation and flags @@ -1655,7 +1643,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) { + if (op_is_flush(bio->bi_opf)) { spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; goto get_rq; @@ -1894,7 +1882,7 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. 
*/ - if ((bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && + if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { @@ -2143,7 +2131,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); - blk_mq_insert_request(rq, false, true, false); + blk_mq_sched_insert_request(rq, false, true, false, false); return 0; } @@ -2159,7 +2147,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(rq->cmd_flags)) where = ELEVATOR_INSERT_FLUSH; add_acct_request(q, rq, where); @@ -3270,7 +3258,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) /* * rq is already accounted, so use raw insert */ - if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(rq->cmd_flags)) __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); else __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); diff --git a/block/blk-exec.c b/block/blk-exec.c index 3ecb00a6cf45..ed1f10165268 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -9,6 +9,7 @@ #include <linux/sched/sysctl.h> #include "blk.h" +#include "blk-mq-sched.h" /* * for max sense size @@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be reused after dying flag is set */ if (q->mq_ops) { - blk_mq_insert_request(rq, at_head, true, false); + blk_mq_sched_insert_request(rq, at_head, true, false, false); return; } diff --git a/block/blk-flush.c b/block/blk-flush.c index 20b7c7a02f1c..4427896641ac 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -74,6 +74,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-mq-sched.h" /* FLUSH/FUA sequences */ enum { @@ -391,9 +392,10 @@ static void mq_flush_data_end_io(struct request *rq, int error) * the comment in 
flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) - blk_mq_run_hw_queue(hctx, true); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + + blk_mq_run_hw_queue(hctx, true); } /** @@ -453,9 +455,9 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - if (q->mq_ops) { - blk_mq_insert_request(rq, false, true, false); - } else + if (q->mq_ops) + blk_mq_sched_insert_request(rq, false, true, false, false); + else list_add_tail(&rq->queuelist, &q->queue_head); return; } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 381cb50a673c..fe186a9eade9 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -43,8 +43,10 @@ static void ioc_exit_icq(struct io_cq *icq) if (icq->flags & ICQ_EXITED) return; - if (et->ops.elevator_exit_icq_fn) - et->ops.elevator_exit_icq_fn(icq); + if (et->uses_mq && et->ops.mq.exit_icq) + et->ops.mq.exit_icq(icq); + else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn) + et->ops.sq.elevator_exit_icq_fn(icq); icq->flags |= ICQ_EXITED; } @@ -383,8 +385,10 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); - if (et->ops.elevator_init_icq_fn) - et->ops.elevator_init_icq_fn(icq); + if (et->uses_mq && et->ops.mq.init_icq) + et->ops.mq.init_icq(icq); + else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn) + et->ops.sq.elevator_init_icq_fn(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(ioc, q); diff --git a/block/blk-merge.c b/block/blk-merge.c index 182398cb1524..6aa43dec5af4 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -763,8 +763,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, { struct 
elevator_queue *e = q->elevator; - if (e->type->ops.elevator_allow_rq_merge_fn) - if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next)) + if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn) + if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next)) return 0; return attempt_merge(q, rq, next); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c new file mode 100644 index 000000000000..5cd2b435a9f5 --- /dev/null +++ b/block/blk-mq-debugfs.c @@ -0,0 +1,756 @@ +/* + * Copyright (C) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ + +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/debugfs.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" +#include "blk-mq-tag.h" + +struct blk_mq_debugfs_attr { + const char *name; + umode_t mode; + const struct file_operations *fops; +}; + +static struct dentry *block_debugfs_root; + +static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, ops); + if (!ret) { + m = file->private_data; + m->private = inode->i_private; + } + return ret; +} + +static int hctx_state_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "0x%lx\n", hctx->state); + return 0; +} + +static int hctx_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_state_show, inode->i_private); +} + +static const struct file_operations hctx_state_fops = { + .open = hctx_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_flags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "0x%lx\n", hctx->flags); + return 0; +} + +static int hctx_flags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_flags_show, inode->i_private); +} + +static const struct file_operations hctx_flags_fops = { + .open = hctx_flags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) +{ + struct request *rq = list_entry_rq(v); + + seq_printf(m, "%p {.cmd_type=%u, .cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n", + rq, rq->cmd_type, rq->cmd_flags, (unsigned int)rq->rq_flags, + rq->tag, rq->internal_tag); + return 0; +} + +static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_lock(&hctx->lock); + return 
seq_list_start(&hctx->dispatch, *pos); +} + +static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + return seq_list_next(v, &hctx->dispatch, pos); +} + +static void hctx_dispatch_stop(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_unlock(&hctx->lock); +} + +static const struct seq_operations hctx_dispatch_seq_ops = { + .start = hctx_dispatch_start, + .next = hctx_dispatch_next, + .stop = hctx_dispatch_stop, + .show = blk_mq_debugfs_rq_show, +}; + +static int hctx_dispatch_open(struct inode *inode, struct file *file) +{ + return blk_mq_debugfs_seq_open(inode, file, &hctx_dispatch_seq_ops); +} + +static const struct file_operations hctx_dispatch_fops = { + .open = hctx_dispatch_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int hctx_ctx_map_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + sbitmap_bitmap_show(&hctx->ctx_map, m); + return 0; +} + +static int hctx_ctx_map_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_ctx_map_show, inode->i_private); +} + +static const struct file_operations hctx_ctx_map_fops = { + .open = hctx_ctx_map_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void blk_mq_debugfs_tags_show(struct seq_file *m, + struct blk_mq_tags *tags) +{ + seq_printf(m, "nr_tags=%u\n", tags->nr_tags); + seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); + seq_printf(m, "active_queues=%d\n", + atomic_read(&tags->active_queues)); + + seq_puts(m, "\nbitmap_tags:\n"); + sbitmap_queue_show(&tags->bitmap_tags, m); + + if (tags->nr_reserved_tags) { + seq_puts(m, "\nbreserved_tags:\n"); + sbitmap_queue_show(&tags->breserved_tags, m); + } +} + +static int hctx_tags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + + 
mutex_lock(&q->sysfs_lock); + if (hctx->tags) + blk_mq_debugfs_tags_show(m, hctx->tags); + mutex_unlock(&q->sysfs_lock); + + return 0; +} + +static int hctx_tags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_tags_show, inode->i_private); +} + +static const struct file_operations hctx_tags_fops = { + .open = hctx_tags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_tags_bitmap_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + + mutex_lock(&q->sysfs_lock); + if (hctx->tags) + sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); + mutex_unlock(&q->sysfs_lock); + return 0; +} + +static int hctx_tags_bitmap_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_tags_bitmap_show, inode->i_private); +} + +static const struct file_operations hctx_tags_bitmap_fops = { + .open = hctx_tags_bitmap_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_sched_tags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + + mutex_lock(&q->sysfs_lock); + if (hctx->sched_tags) + blk_mq_debugfs_tags_show(m, hctx->sched_tags); + mutex_unlock(&q->sysfs_lock); + + return 0; +} + +static int hctx_sched_tags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_sched_tags_show, inode->i_private); +} + +static const struct file_operations hctx_sched_tags_fops = { + .open = hctx_sched_tags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_sched_tags_bitmap_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + + mutex_lock(&q->sysfs_lock); + if (hctx->sched_tags) + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); + mutex_unlock(&q->sysfs_lock); + 
return 0; +} + +static int hctx_sched_tags_bitmap_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_sched_tags_bitmap_show, inode->i_private); +} + +static const struct file_operations hctx_sched_tags_bitmap_fops = { + .open = hctx_sched_tags_bitmap_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_io_poll_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "considered=%lu\n", hctx->poll_considered); + seq_printf(m, "invoked=%lu\n", hctx->poll_invoked); + seq_printf(m, "success=%lu\n", hctx->poll_success); + return 0; +} + +static int hctx_io_poll_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_io_poll_show, inode->i_private); +} + +static ssize_t hctx_io_poll_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; + return count; +} + +static const struct file_operations hctx_io_poll_fops = { + .open = hctx_io_poll_open, + .read = seq_read, + .write = hctx_io_poll_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) +{ + seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", + stat->nr_samples, stat->mean, stat->min, stat->max); +} + +static int hctx_stats_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct blk_rq_stat stat[2]; + + blk_stat_init(&stat[BLK_STAT_READ]); + blk_stat_init(&stat[BLK_STAT_WRITE]); + + blk_hctx_stat_get(hctx, stat); + + seq_puts(m, "read: "); + print_stat(m, &stat[BLK_STAT_READ]); + seq_puts(m, "\n"); + + seq_puts(m, "write: "); + print_stat(m, &stat[BLK_STAT_WRITE]); + seq_puts(m, "\n"); + return 0; +} + +static int hctx_stats_open(struct inode *inode, struct file *file) +{ + 
return single_open(file, hctx_stats_show, inode->i_private); +} + +static ssize_t hctx_stats_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + struct blk_mq_ctx *ctx; + int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } + return count; +} + +static const struct file_operations hctx_stats_fops = { + .open = hctx_stats_open, + .read = seq_read, + .write = hctx_stats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_dispatched_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + int i; + + seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]); + + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { + unsigned int d = 1U << (i - 1); + + seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]); + } + + seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]); + return 0; +} + +static int hctx_dispatched_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_dispatched_show, inode->i_private); +} + +static ssize_t hctx_dispatched_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + int i; + + for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) + hctx->dispatched[i] = 0; + return count; +} + +static const struct file_operations hctx_dispatched_fops = { + .open = hctx_dispatched_open, + .read = seq_read, + .write = hctx_dispatched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_queued_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%lu\n", hctx->queued); + return 0; +} + +static int hctx_queued_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_queued_show, 
inode->i_private); +} + +static ssize_t hctx_queued_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->queued = 0; + return count; +} + +static const struct file_operations hctx_queued_fops = { + .open = hctx_queued_open, + .read = seq_read, + .write = hctx_queued_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_run_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%lu\n", hctx->run); + return 0; +} + +static int hctx_run_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_run_show, inode->i_private); +} + +static ssize_t hctx_run_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->run = 0; + return count; +} + +static const struct file_operations hctx_run_fops = { + .open = hctx_run_open, + .read = seq_read, + .write = hctx_run_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_active_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); + return 0; +} + +static int hctx_active_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_active_show, inode->i_private); +} + +static const struct file_operations hctx_active_fops = { + .open = hctx_active_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) +{ + struct blk_mq_ctx *ctx = m->private; + + spin_lock(&ctx->lock); + return seq_list_start(&ctx->rq_list, *pos); +} + +static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct blk_mq_ctx *ctx = m->private; + + return seq_list_next(v, &ctx->rq_list, pos); 
+} + +static void ctx_rq_list_stop(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + spin_unlock(&ctx->lock); +} + +static const struct seq_operations ctx_rq_list_seq_ops = { + .start = ctx_rq_list_start, + .next = ctx_rq_list_next, + .stop = ctx_rq_list_stop, + .show = blk_mq_debugfs_rq_show, +}; + +static int ctx_rq_list_open(struct inode *inode, struct file *file) +{ + return blk_mq_debugfs_seq_open(inode, file, &ctx_rq_list_seq_ops); +} + +static const struct file_operations ctx_rq_list_fops = { + .open = ctx_rq_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int ctx_dispatched_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]); + return 0; +} + +static int ctx_dispatched_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_dispatched_show, inode->i_private); +} + +static ssize_t ctx_dispatched_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0; + return count; +} + +static const struct file_operations ctx_dispatched_fops = { + .open = ctx_dispatched_open, + .read = seq_read, + .write = ctx_dispatched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ctx_merged_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu\n", ctx->rq_merged); + return 0; +} + +static int ctx_merged_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_merged_show, inode->i_private); +} + +static ssize_t ctx_merged_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_merged = 0; + return count; +} + +static 
const struct file_operations ctx_merged_fops = { + .open = ctx_merged_open, + .read = seq_read, + .write = ctx_merged_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ctx_completed_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]); + return 0; +} + +static int ctx_completed_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_completed_show, inode->i_private); +} + +static ssize_t ctx_completed_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_completed[0] = ctx->rq_completed[1] = 0; + return count; +} + +static const struct file_operations ctx_completed_fops = { + .open = ctx_completed_open, + .read = seq_read, + .write = ctx_completed_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { + {"state", 0400, &hctx_state_fops}, + {"flags", 0400, &hctx_flags_fops}, + {"dispatch", 0400, &hctx_dispatch_fops}, + {"ctx_map", 0400, &hctx_ctx_map_fops}, + {"tags", 0400, &hctx_tags_fops}, + {"tags_bitmap", 0400, &hctx_tags_bitmap_fops}, + {"sched_tags", 0400, &hctx_sched_tags_fops}, + {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, + {"io_poll", 0600, &hctx_io_poll_fops}, + {"stats", 0600, &hctx_stats_fops}, + {"dispatched", 0600, &hctx_dispatched_fops}, + {"queued", 0600, &hctx_queued_fops}, + {"run", 0600, &hctx_run_fops}, + {"active", 0400, &hctx_active_fops}, +}; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { + {"rq_list", 0400, &ctx_rq_list_fops}, + {"dispatched", 0600, &ctx_dispatched_fops}, + {"merged", 0600, &ctx_merged_fops}, + {"completed", 0600, &ctx_completed_fops}, +}; + +int blk_mq_debugfs_register(struct request_queue *q, const char *name) +{ + if 
(!block_debugfs_root) + return -ENOENT; + + q->debugfs_dir = debugfs_create_dir(name, block_debugfs_root); + if (!q->debugfs_dir) + goto err; + + if (blk_mq_debugfs_register_hctxs(q)) + goto err; + + return 0; + +err: + blk_mq_debugfs_unregister(q); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister(struct request_queue *q) +{ + debugfs_remove_recursive(q->debugfs_dir); + q->mq_debugfs_dir = NULL; + q->debugfs_dir = NULL; +} + +static int blk_mq_debugfs_register_ctx(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct dentry *hctx_dir) +{ + struct dentry *ctx_dir; + char name[20]; + int i; + + snprintf(name, sizeof(name), "cpu%u", ctx->cpu); + ctx_dir = debugfs_create_dir(name, hctx_dir); + if (!ctx_dir) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_ctx_attrs); i++) { + const struct blk_mq_debugfs_attr *attr; + + attr = &blk_mq_debugfs_ctx_attrs[i]; + if (!debugfs_create_file(attr->name, attr->mode, ctx_dir, ctx, + attr->fops)) + return -ENOMEM; + } + + return 0; +} + +static int blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + struct dentry *hctx_dir; + char name[20]; + int i; + + snprintf(name, sizeof(name), "%u", hctx->queue_num); + hctx_dir = debugfs_create_dir(name, q->mq_debugfs_dir); + if (!hctx_dir) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_hctx_attrs); i++) { + const struct blk_mq_debugfs_attr *attr; + + attr = &blk_mq_debugfs_hctx_attrs[i]; + if (!debugfs_create_file(attr->name, attr->mode, hctx_dir, hctx, + attr->fops)) + return -ENOMEM; + } + + hctx_for_each_ctx(hctx, ctx, i) { + if (blk_mq_debugfs_register_ctx(q, ctx, hctx_dir)) + return -ENOMEM; + } + + return 0; +} + +int blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + if (!q->debugfs_dir) + return -ENOENT; + + q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir); + if (!q->mq_debugfs_dir) + goto err; + + 
queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_debugfs_register_hctx(q, hctx)) + goto err; + } + + return 0; + +err: + blk_mq_debugfs_unregister_hctxs(q); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ + debugfs_remove_recursive(q->mq_debugfs_dir); + q->mq_debugfs_dir = NULL; +} + +void blk_mq_debugfs_init(void) +{ + block_debugfs_root = debugfs_create_dir("block", NULL); +} diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c new file mode 100644 index 000000000000..114814ec3d49 --- /dev/null +++ b/block/blk-mq-sched.c @@ -0,0 +1,481 @@ +/* + * blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/blk-mq.h> + +#include <trace/events/block.h> + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" +#include "blk-mq-tag.h" +#include "blk-wbt.h" + +void blk_mq_sched_free_hctx_data(struct request_queue *q, + void (*exit)(struct blk_mq_hw_ctx *)) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (exit && hctx->sched_data) + exit(hctx); + kfree(hctx->sched_data); + hctx->sched_data = NULL; + } +} +EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); + +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, + int (*init)(struct blk_mq_hw_ctx *), + void (*exit)(struct blk_mq_hw_ctx *)) +{ + struct blk_mq_hw_ctx *hctx; + int ret; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node); + if (!hctx->sched_data) { + ret = -ENOMEM; + goto error; + } + + if (init) { + ret = init(hctx); + if (ret) { + /* + * We don't want to give exit() a partially + * initialized sched_data. init() must clean up + * if it fails. 
+ */ + kfree(hctx->sched_data); + hctx->sched_data = NULL; + goto error; + } + } + } + + return 0; +error: + blk_mq_sched_free_hctx_data(q, exit); + return ret; +} +EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data); + +static void __blk_mq_sched_assign_ioc(struct request_queue *q, + struct request *rq, struct io_context *ioc) +{ + struct io_cq *icq; + + spin_lock_irq(q->queue_lock); + icq = ioc_lookup_icq(ioc, q); + spin_unlock_irq(q->queue_lock); + + if (!icq) { + icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + if (!icq) + return; + } + + rq->elv.icq = icq; + if (!blk_mq_sched_get_rq_priv(q, rq)) { + rq->rq_flags |= RQF_ELVPRIV; + get_io_context(icq->ioc); + return; + } + + rq->elv.icq = NULL; +} + +static void blk_mq_sched_assign_ioc(struct request_queue *q, + struct request *rq, struct bio *bio) +{ + struct io_context *ioc; + + ioc = rq_ioc(bio); + if (ioc) + __blk_mq_sched_assign_ioc(q, rq, ioc); +} + +struct request *blk_mq_sched_get_request(struct request_queue *q, + struct bio *bio, + unsigned int op, + struct blk_mq_alloc_data *data) +{ + struct elevator_queue *e = q->elevator; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct request *rq; + + blk_queue_enter_live(q); + ctx = blk_mq_get_ctx(q); + hctx = blk_mq_map_queue(q, ctx->cpu); + + blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx); + + if (e) { + data->flags |= BLK_MQ_REQ_INTERNAL; + + /* + * Flush requests are special and go directly to the + * dispatch list. 
+ */ + if (!op_is_flush(op) && e->type->ops.mq.get_request) { + rq = e->type->ops.mq.get_request(q, op, data); + if (rq) + rq->rq_flags |= RQF_QUEUED; + } else + rq = __blk_mq_alloc_request(data, op); + } else { + rq = __blk_mq_alloc_request(data, op); + if (rq) + data->hctx->tags->rqs[rq->tag] = rq; + } + + if (rq) { + if (!op_is_flush(op)) { + rq->elv.icq = NULL; + if (e && e->type->icq_cache) + blk_mq_sched_assign_ioc(q, rq, bio); + } + data->hctx->queued++; + return rq; + } + + blk_queue_exit(q); + return NULL; +} + +void blk_mq_sched_put_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (rq->rq_flags & RQF_ELVPRIV) { + blk_mq_sched_put_rq_priv(rq->q, rq); + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } + } + + if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request) + e->type->ops.mq.put_request(rq); + else + blk_mq_finish_request(rq); +} + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +{ + struct elevator_queue *e = hctx->queue->elevator; + LIST_HEAD(rq_list); + + if (unlikely(blk_mq_hctx_stopped(hctx))) + return; + + hctx->run++; + + /* + * If we have previous entries on our dispatch list, grab them first for + * more fair dispatch. + */ + if (!list_empty_careful(&hctx->dispatch)) { + spin_lock(&hctx->lock); + if (!list_empty(&hctx->dispatch)) + list_splice_init(&hctx->dispatch, &rq_list); + spin_unlock(&hctx->lock); + } + + /* + * Only ask the scheduler for requests, if we didn't have residual + * requests from the dispatch list. This is to avoid the case where + * we only ever dispatch a fraction of the requests available because + * of low device queue depth. Once we pull requests out of the IO + * scheduler, we can no longer merge or sort them. So it's best to + * leave them there for as long as we can. Mark the hw queue as + * needing a restart in that case. 
+ */ + if (!list_empty(&rq_list)) { + blk_mq_sched_mark_restart(hctx); + blk_mq_dispatch_rq_list(hctx, &rq_list); + } else if (!e || !e->type->ops.mq.dispatch_request) { + blk_mq_flush_busy_ctxs(hctx, &rq_list); + blk_mq_dispatch_rq_list(hctx, &rq_list); + } else { + do { + struct request *rq; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) + break; + list_add(&rq->queuelist, &rq_list); + } while (blk_mq_dispatch_rq_list(hctx, &rq_list)); + } +} + +void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, + struct request *(*get_rq)(struct blk_mq_hw_ctx *)) +{ + do { + struct request *rq; + + rq = get_rq(hctx); + if (!rq) + break; + + list_add_tail(&rq->queuelist, rq_list); + } while (1); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch); + +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio) +{ + struct request *rq; + int ret; + + ret = elv_merge(q, &rq, bio); + if (ret == ELEVATOR_BACK_MERGE) { + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (bio_attempt_back_merge(q, rq, bio)) { + if (!attempt_back_merge(q, rq)) + elv_merged_request(q, rq, ret); + return true; + } + } else if (ret == ELEVATOR_FRONT_MERGE) { + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (bio_attempt_front_merge(q, rq, bio)) { + if (!attempt_front_merge(q, rq)) + elv_merged_request(q, rq, ret); + return true; + } + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); + +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e->type->ops.mq.bio_merge) { + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + + blk_mq_put_ctx(ctx); + return e->type->ops.mq.bio_merge(hctx, bio); + } + + return false; +} + +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +{ + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); +} 
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); + +void blk_mq_sched_request_inserted(struct request *rq) +{ + trace_block_rq_insert(rq->q, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); + +bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + if (rq->tag == -1) { + rq->rq_flags |= RQF_SORTED; + return false; + } + + /* + * If we already have a real request tag, send directly to + * the dispatch list. + */ + spin_lock(&hctx->lock); + list_add(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + return true; +} +EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert); + +static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (blk_mq_hctx_has_pending(hctx)) + blk_mq_run_hw_queue(hctx, true); + } +} + +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) +{ + unsigned int i; + + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + blk_mq_sched_restart_hctx(hctx); + else { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + return; + + clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_sched_restart_hctx(hctx); + } +} + +/* + * Add flush/fua to the queue. If we fail getting a driver tag, then + * punt to the requeue list. Requeue will re-invoke us from a context + * that's safe to block from. 
+ */ +static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool can_block) +{ + if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { + blk_insert_flush(rq); + blk_mq_run_hw_queue(hctx, true); + } else + blk_mq_add_to_requeue_list(rq, true, true); +} + +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async, bool can_block) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + + if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { + blk_mq_sched_insert_flush(hctx, rq, can_block); + return; + } + + if (e && e->type->ops.mq.insert_requests) { + LIST_HEAD(list); + + list_add(&rq->queuelist, &list); + e->type->ops.mq.insert_requests(hctx, &list, at_head); + } else { + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, at_head); + spin_unlock(&ctx->lock); + } + + if (run_queue) + blk_mq_run_hw_queue(hctx, async); +} + +void blk_mq_sched_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async) +{ + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.insert_requests) + e->type->ops.mq.insert_requests(hctx, list, false); + else + blk_mq_insert_requests(hctx, ctx, list); + + blk_mq_run_hw_queue(hctx, run_queue_async); +} + +static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) +{ + if (hctx->sched_tags) { + blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); + blk_mq_free_rq_map(hctx->sched_tags); + hctx->sched_tags = NULL; + } +} + +int blk_mq_sched_setup(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int ret, i; + + /* + * Default to 256, since we don't split into sync/async like the + * old code did. 
Additionally, this is a per-hw queue depth. + */ + q->nr_requests = 2 * BLKDEV_MAX_RQ; + + /* + * We're switching to using an IO scheduler, so setup the hctx + * scheduler tags and switch the request map from the regular + * tags to scheduler tags. First allocate what we need, so we + * can safely fail and fallback, if needed. + */ + ret = 0; + queue_for_each_hw_ctx(q, hctx, i) { + hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0); + if (!hctx->sched_tags) { + ret = -ENOMEM; + break; + } + ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests); + if (ret) + break; + } + + /* + * If we failed, free what we did allocate + */ + if (ret) { + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->sched_tags) + continue; + blk_mq_sched_free_tags(set, hctx, i); + } + + return ret; + } + + return 0; +} + +void blk_mq_sched_teardown(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_sched_free_tags(set, hctx, i); +} + +int blk_mq_sched_init(struct request_queue *q) +{ + int ret; + +#if defined(CONFIG_DEFAULT_SQ_NONE) + if (q->nr_hw_queues == 1) + return 0; +#endif +#if defined(CONFIG_DEFAULT_MQ_NONE) + if (q->nr_hw_queues > 1) + return 0; +#endif + + mutex_lock(&q->sysfs_lock); + ret = elevator_init(q, NULL); + mutex_unlock(&q->sysfs_lock); + + return ret; +} diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h new file mode 100644 index 000000000000..9478aaeb48c5 --- /dev/null +++ b/block/blk-mq-sched.h @@ -0,0 +1,142 @@ +#ifndef BLK_MQ_SCHED_H +#define BLK_MQ_SCHED_H + +#include "blk-mq.h" +#include "blk-mq-tag.h" + +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, + int (*init)(struct blk_mq_hw_ctx *), + void (*exit)(struct blk_mq_hw_ctx *)); + +void blk_mq_sched_free_hctx_data(struct request_queue *q, + void (*exit)(struct blk_mq_hw_ctx *)); + +struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio 
*bio, unsigned int op, struct blk_mq_alloc_data *data); +void blk_mq_sched_put_request(struct request *rq); + +void blk_mq_sched_request_inserted(struct request *rq); +bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq); +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio); +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); + +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async, bool can_block); +void blk_mq_sched_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async); + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, + struct request *(*get_rq)(struct blk_mq_hw_ctx *)); + +int blk_mq_sched_setup(struct request_queue *q); +void blk_mq_sched_teardown(struct request_queue *q); + +int blk_mq_sched_init(struct request_queue *q); + +static inline bool +blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio)) + return false; + + return __blk_mq_sched_bio_merge(q, bio); +} + +static inline int blk_mq_sched_get_rq_priv(struct request_queue *q, + struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.get_rq_priv) + return e->type->ops.mq.get_rq_priv(q, rq); + + return 0; +} + +static inline void blk_mq_sched_put_rq_priv(struct request_queue *q, + struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.put_rq_priv) + e->type->ops.mq.put_rq_priv(q, rq); +} + +static inline bool +blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ 
+ struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.allow_merge) + return e->type->ops.mq.allow_merge(q, rq, bio); + + return true; +} + +static inline void +blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.completed_request) + e->type->ops.mq.completed_request(hctx, rq); + + BUG_ON(rq->internal_tag == -1); + + blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag); +} + +static inline void blk_mq_sched_started_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.started_request) + e->type->ops.mq.started_request(rq); +} + +static inline void blk_mq_sched_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.requeue_request) + e->type->ops.mq.requeue_request(rq); +} + +static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.has_work) + return e->type->ops.mq.has_work(hctx); + + return false; +} + +static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + } + } +} + +static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) +{ + return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); +} + +#endif diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index eacd3af72099..308b3f4fc310 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -122,123 +122,16 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, return 
res; } -static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], - ctx->rq_dispatched[0]); -} - -static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu\n", ctx->rq_merged); -} - -static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], - ctx->rq_completed[0]); -} - -static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) -{ - struct request *rq; - int len = snprintf(page, PAGE_SIZE - 1, "%s:\n", msg); - - list_for_each_entry(rq, list, queuelist) { - const int rq_len = 2 * sizeof(rq) + 2; - - /* if the output will be truncated */ - if (PAGE_SIZE - 1 < len + rq_len) { - /* backspacing if it can't hold '\t...\n' */ - if (PAGE_SIZE - 1 < len + 5) - len -= rq_len; - len += snprintf(page + len, PAGE_SIZE - 1 - len, - "\t...\n"); - break; - } - len += snprintf(page + len, PAGE_SIZE - 1 - len, - "\t%p\n", rq); - } - - return len; -} - -static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) -{ - ssize_t ret; - - spin_lock(&ctx->lock); - ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); - spin_unlock(&ctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n", - hctx->poll_considered, hctx->poll_invoked, - hctx->poll_success); -} - -static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx, - const char *page, size_t size) -{ - hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; - - return size; -} - -static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, - char *page) -{ - return sprintf(page, "%lu\n", hctx->queued); -} - -static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "%lu\n", 
hctx->run); -} - -static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, - char *page) -{ - char *start_page = page; - int i; - - page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { - unsigned int d = 1U << (i - 1); - - page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]); - } - - page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1), - hctx->dispatched[i]); - return page - start_page; -} - -static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, +static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { - ssize_t ret; - - spin_lock(&hctx->lock); - ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); - spin_unlock(&hctx->lock); - - return ret; + return sprintf(page, "%u\n", hctx->tags->nr_tags); } -static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, + char *page) { - return blk_mq_tag_sysfs_show(hctx->tags, page); -} - -static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); + return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags); } static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) @@ -259,121 +152,27 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } -static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) -{ - struct blk_mq_ctx *ctx; - unsigned int i; - - hctx_for_each_ctx(hctx, ctx, i) { - blk_stat_init(&ctx->stat[BLK_STAT_READ]); - blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); - } -} - -static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, - const char *page, size_t count) -{ - blk_mq_stat_clear(hctx); - return count; -} - -static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) -{ - return 
sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", - pre, (long long) stat->nr_samples, - (long long) stat->mean, (long long) stat->min, - (long long) stat->max); -} - -static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - struct blk_rq_stat stat[2]; - ssize_t ret; - - blk_stat_init(&stat[BLK_STAT_READ]); - blk_stat_init(&stat[BLK_STAT_WRITE]); - - blk_hctx_stat_get(hctx, stat); - - ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); - ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); - return ret; -} - -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { - .attr = {.name = "dispatched", .mode = S_IRUGO }, - .show = blk_mq_sysfs_dispatched_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { - .attr = {.name = "merged", .mode = S_IRUGO }, - .show = blk_mq_sysfs_merged_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { - .attr = {.name = "completed", .mode = S_IRUGO }, - .show = blk_mq_sysfs_completed_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { - .attr = {.name = "rq_list", .mode = S_IRUGO }, - .show = blk_mq_sysfs_rq_list_show, -}; - static struct attribute *default_ctx_attrs[] = { - &blk_mq_sysfs_dispatched.attr, - &blk_mq_sysfs_merged.attr, - &blk_mq_sysfs_completed.attr, - &blk_mq_sysfs_rq_list.attr, NULL, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { - .attr = {.name = "queued", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_queued_show, +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { + .attr = {.name = "nr_tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_nr_tags_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { - .attr = {.name = "run", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_run_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { - .attr = {.name = "dispatched", .mode = S_IRUGO }, - .show = 
blk_mq_hw_sysfs_dispatched_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { - .attr = {.name = "active", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_active_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { - .attr = {.name = "pending", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_rq_list_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { - .attr = {.name = "tags", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_tags_show, +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { + .attr = {.name = "nr_reserved_tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { - .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO }, - .show = blk_mq_hw_sysfs_poll_show, - .store = blk_mq_hw_sysfs_poll_store, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { - .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR }, - .show = blk_mq_hw_sysfs_stat_show, - .store = blk_mq_hw_sysfs_stat_store, -}; static struct attribute *default_hw_ctx_attrs[] = { - &blk_mq_hw_sysfs_queued.attr, - &blk_mq_hw_sysfs_run.attr, - &blk_mq_hw_sysfs_dispatched.attr, - &blk_mq_hw_sysfs_pending.attr, - &blk_mq_hw_sysfs_tags.attr, + &blk_mq_hw_sysfs_nr_tags.attr, + &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, - &blk_mq_hw_sysfs_active.attr, - &blk_mq_hw_sysfs_poll.attr, - &blk_mq_hw_sysfs_stat.attr, NULL, }; @@ -455,6 +254,8 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) kobject_put(&hctx->kobj); } + blk_mq_debugfs_unregister(q); + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); kobject_del(&q->mq_kobj); kobject_put(&q->mq_kobj); @@ -504,6 +305,8 @@ int blk_mq_register_dev(struct device *dev, struct 
request_queue *q) kobject_uevent(&q->mq_kobj, KOBJ_ADD); + blk_mq_debugfs_register(q, kobject_name(&dev->kobj)); + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) @@ -529,6 +332,8 @@ void blk_mq_sysfs_unregister(struct request_queue *q) if (!q->mq_sysfs_init_done) return; + blk_mq_debugfs_unregister_hctxs(q); + queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } @@ -541,6 +346,8 @@ int blk_mq_sysfs_register(struct request_queue *q) if (!q->mq_sysfs_init_done) return ret; + blk_mq_debugfs_register_hctxs(q); + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index dcf5ce3ba4bf..54c84363c1b2 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -90,113 +90,97 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) +static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, + struct sbitmap_queue *bt) { - if (!hctx_may_queue(hctx, bt)) + if (!(data->flags & BLK_MQ_REQ_INTERNAL) && + !hctx_may_queue(data->hctx, bt)) return -1; return __sbitmap_queue_get(bt); } -static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, - struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags) +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct sbitmap_queue *bt; struct sbq_wait_state *ws; DEFINE_WAIT(wait); + unsigned int tag_offset; + bool drop_ctx; int tag; - tag = __bt_get(hctx, bt); + if (data->flags & BLK_MQ_REQ_RESERVED) { + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return BLK_MQ_TAG_FAIL; + } + bt = &tags->breserved_tags; + tag_offset = 0; + } else { + bt = &tags->bitmap_tags; + tag_offset = tags->nr_reserved_tags; + } + + tag = __blk_mq_get_tag(data, bt); if (tag != -1) - return tag; + goto found_tag; if 
(data->flags & BLK_MQ_REQ_NOWAIT) - return -1; + return BLK_MQ_TAG_FAIL; - ws = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, data->hctx); + drop_ctx = data->ctx == NULL; do { prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt); + tag = __blk_mq_get_tag(data, bt); if (tag != -1) break; /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for - * some to complete. Note that hctx can be NULL here for - * reserved tag allocation. + * some to complete. */ - if (hctx) - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ - tag = __bt_get(hctx, bt); + tag = __blk_mq_get_tag(data, bt); if (tag != -1) break; - blk_mq_put_ctx(data->ctx); + if (data->ctx) + blk_mq_put_ctx(data->ctx); io_schedule(); data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); - if (data->flags & BLK_MQ_REQ_RESERVED) { - bt = &data->hctx->tags->breserved_tags; - } else { - hctx = data->hctx; - bt = &hctx->tags->bitmap_tags; - } + tags = blk_mq_tags_from_data(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + bt = &tags->breserved_tags; + else + bt = &tags->bitmap_tags; + finish_wait(&ws->wait, &wait); - ws = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, data->hctx); } while (1); - finish_wait(&ws->wait, &wait); - return tag; -} - -static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) -{ - int tag; - - tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - data->hctx->tags); - if (tag >= 0) - return tag + data->hctx->tags->nr_reserved_tags; - - return BLK_MQ_TAG_FAIL; -} - -static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) -{ - int tag; - - if (unlikely(!data->hctx->tags->nr_reserved_tags)) { - WARN_ON_ONCE(1); - return BLK_MQ_TAG_FAIL; - } - - tag = bt_get(data, 
&data->hctx->tags->breserved_tags, NULL, - data->hctx->tags); - if (tag < 0) - return BLK_MQ_TAG_FAIL; + if (drop_ctx && data->ctx) + blk_mq_put_ctx(data->ctx); - return tag; -} + finish_wait(&ws->wait, &wait); -unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) -{ - if (data->flags & BLK_MQ_REQ_RESERVED) - return __blk_mq_get_reserved_tag(data); - return __blk_mq_get_tag(data); +found_tag: + return tag + tag_offset; } -void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - unsigned int tag) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, + struct blk_mq_ctx *ctx, unsigned int tag) { - struct blk_mq_tags *tags = hctx->tags; - if (tag >= tags->nr_reserved_tags) { const int real_tag = tag - tags->nr_reserved_tags; @@ -312,11 +296,11 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) struct blk_mq_tags *tags = set->tags[i]; for (j = 0; j < tags->nr_tags; j++) { - if (!tags->rqs[j]) + if (!tags->static_rqs[j]) continue; ret = set->ops->reinit_request(set->driver_data, - tags->rqs[j]); + tags->static_rqs[j]); if (ret) goto out; } @@ -351,11 +335,6 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, } -static unsigned int bt_unused_tags(const struct sbitmap_queue *bt) -{ - return bt->sb.depth - sbitmap_weight(&bt->sb); -} - static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, bool round_robin, int node) { @@ -411,19 +390,56 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) kfree(tags); } -int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) +int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tagsptr, unsigned int tdepth, + bool can_grow) { - tdepth -= tags->nr_reserved_tags; - if (tdepth > tags->nr_tags) + struct blk_mq_tags *tags = *tagsptr; + + if (tdepth <= tags->nr_reserved_tags) return -EINVAL; + tdepth -= tags->nr_reserved_tags; + /* - * Don't need (or can't) update reserved tags here, they remain - * static and 
should never need resizing. + * If we are allowed to grow beyond the original size, allocate + * a new set of tags before freeing the old one. */ - sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + if (tdepth > tags->nr_tags) { + struct blk_mq_tag_set *set = hctx->queue->tag_set; + struct blk_mq_tags *new; + bool ret; + + if (!can_grow) + return -EINVAL; + + /* + * We need some sort of upper limit, set it high enough that + * no valid use cases should require more. + */ + if (tdepth > 16 * BLKDEV_MAX_RQ) + return -EINVAL; + + new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0); + if (!new) + return -ENOMEM; + ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); + if (ret) { + blk_mq_free_rq_map(new); + return -ENOMEM; + } + + blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); + blk_mq_free_rq_map(*tagsptr); + *tagsptr = new; + } else { + /* + * Don't need (or can't) update reserved tags here, they + * remain static and should never need resizing. + */ + sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + } - blk_mq_tag_wakeup_all(tags, false); return 0; } @@ -454,25 +470,3 @@ u32 blk_mq_unique_tag(struct request *rq) (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag); - -ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) -{ - char *orig_page = page; - unsigned int free, res; - - if (!tags) - return 0; - - page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " - "bits_per_word=%u\n", - tags->nr_tags, tags->nr_reserved_tags, - 1U << tags->bitmap_tags.sb.shift); - - free = bt_unused_tags(&tags->bitmap_tags); - res = bt_unused_tags(&tags->breserved_tags); - - page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); - page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); - - return page - orig_page; -} diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index d1662734dc53..63497423c5cd 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -16,6 +16,7 @@ struct blk_mq_tags { 
struct sbitmap_queue breserved_tags; struct request **rqs; + struct request **static_rqs; struct list_head page_list; }; @@ -24,11 +25,12 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - unsigned int tag); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, + struct blk_mq_ctx *ctx, unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); -extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); -extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); +extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tags, + unsigned int depth, bool can_grow); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); diff --git a/block/blk-mq.c b/block/blk-mq.c index c3400b5444a7..489076e7ae15 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -32,6 +32,7 @@ #include "blk-mq-tag.h" #include "blk-stat.h" #include "blk-wbt.h" +#include "blk-mq-sched.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -39,9 +40,11 @@ static LIST_HEAD(all_q_list); /* * Check if any of the ctx's have pending work in this hardware queue */ -static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return sbitmap_any_bit_set(&hctx->ctx_map); + return sbitmap_any_bit_set(&hctx->ctx_map) || + !list_empty_careful(&hctx->dispatch) || + blk_mq_sched_has_work(hctx); } /* @@ -167,8 +170,8 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); -static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct 
request *rq, unsigned int op) +void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int op) { INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ @@ -213,53 +216,58 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, ctx->rq_dispatched[op_is_sync(op)]++; } +EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init); -static struct request * -__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op) +struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, + unsigned int op) { struct request *rq; unsigned int tag; tag = blk_mq_get_tag(data); if (tag != BLK_MQ_TAG_FAIL) { - rq = data->hctx->tags->rqs[tag]; + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); - if (blk_mq_tag_busy(data->hctx)) { - rq->rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); + rq = tags->static_rqs[tag]; + + if (data->flags & BLK_MQ_REQ_INTERNAL) { + rq->tag = -1; + rq->internal_tag = tag; + } else { + if (blk_mq_tag_busy(data->hctx)) { + rq->rq_flags = RQF_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); + } + rq->tag = tag; + rq->internal_tag = -1; } - rq->tag = tag; blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } return NULL; } +EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags) { - struct blk_mq_ctx *ctx; - struct blk_mq_hw_ctx *hctx; + struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - struct blk_mq_alloc_data alloc_data; int ret; ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); - ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw); - blk_mq_put_ctx(ctx); + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); - if (!rq) { - blk_queue_exit(q); + 
blk_mq_put_ctx(alloc_data.ctx); + blk_queue_exit(q); + + if (!rq) return ERR_PTR(-EWOULDBLOCK); - } rq->__data_len = 0; rq->__sector = (sector_t) -1; @@ -319,10 +327,10 @@ out_queue_exit: } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, struct request *rq) +void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq) { - const int tag = rq->tag; + const int sched_tag = rq->internal_tag; struct request_queue *q = rq->q; if (rq->rq_flags & RQF_MQ_INFLIGHT) @@ -333,23 +341,31 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); - blk_mq_put_tag(hctx, ctx, tag); + if (rq->tag != -1) + blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); + if (sched_tag != -1) + blk_mq_sched_completed_request(hctx, rq); + blk_mq_sched_restart_queues(hctx); blk_queue_exit(q); } -void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx, + struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; ctx->rq_completed[rq_is_sync(rq)]++; - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); +} +void blk_mq_finish_request(struct request *rq) +{ + blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); } -EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); void blk_mq_free_request(struct request *rq) { - blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); + blk_mq_sched_put_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -467,6 +483,8 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_sched_started_request(rq); + trace_block_rq_issue(q, rq); rq->resid_len = blk_rq_bytes(rq); @@ -515,6 +533,7 @@ static void __blk_mq_requeue_request(struct request *rq) 
trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); + blk_mq_sched_requeue_request(rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -549,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work) rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, true, false, false); + blk_mq_sched_insert_request(rq, true, false, false, true); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, false, false, false); + blk_mq_sched_insert_request(rq, false, false, false, true); } blk_mq_run_hw_queues(q, false); @@ -639,7 +658,7 @@ struct blk_mq_timeout_data { void blk_mq_rq_timed_out(struct request *req, bool reserved) { - struct blk_mq_ops *ops = req->q->mq_ops; + const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; /* @@ -763,6 +782,12 @@ static bool blk_mq_attempt_merge(struct request_queue *q, continue; el_ret = blk_try_merge(rq, bio); + if (el_ret == ELEVATOR_NO_MERGE) + continue; + + if (!blk_mq_sched_allow_merge(q, rq, bio)) + break; + if (el_ret == ELEVATOR_BACK_MERGE) { if (bio_attempt_back_merge(q, rq, bio)) { ctx->rq_merged++; @@ -803,7 +828,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) * Process software queues that have been marked busy, splicing them * to the for-dispatch */ -static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct flush_busy_ctx_data data = { .hctx = hctx, @@ -812,6 +837,7 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } +EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); static inline unsigned int queued_to_index(unsigned int queued) { 
@@ -821,6 +847,74 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } +bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, + bool wait) +{ + struct blk_mq_alloc_data data = { + .q = rq->q, + .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), + .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, + }; + + if (rq->tag != -1) { +done: + if (hctx) + *hctx = data.hctx; + return true; + } + + rq->tag = blk_mq_get_tag(&data); + if (rq->tag >= 0) { + if (blk_mq_tag_busy(data.hctx)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&data.hctx->nr_active); + } + data.hctx->tags->rqs[rq->tag] = rq; + goto done; + } + + return false; +} + +static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + +/* + * If we fail getting a driver tag because all the driver tags are already + * assigned and on the dispatch list, BUT the first entry does not have a + * tag, then we could deadlock. For that case, move entries with assigned + * driver tags to the front, leaving the set of tagged requests in the + * same order, and the untagged set in the same order. 
+ */ +static bool reorder_tags_to_front(struct list_head *list) +{ + struct request *rq, *tmp, *first = NULL; + + list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { + if (rq == first) + break; + if (rq->tag != -1) { + list_move(&rq->queuelist, list); + if (!first) + first = rq; + } + } + + return first != NULL; +} + bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct request_queue *q = hctx->queue; @@ -843,6 +937,20 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!queued && reorder_tags_to_front(list)) + continue; + + /* + * We failed getting a driver tag. Mark the queue(s) + * as needing a restart. Retry getting a tag again, + * in case the needed IO completed right before we + * marked the queue as needing a restart. + */ + blk_mq_sched_mark_restart(hctx); + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + break; + } list_del_init(&rq->queuelist); bd.rq = rq; @@ -855,6 +963,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) queued++; break; case BLK_MQ_RQ_QUEUE_BUSY: + blk_mq_put_driver_tag(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -885,7 +994,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) */ if (!list_empty(list)) { spin_lock(&hctx->lock); - list_splice(list, &hctx->dispatch); + list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* @@ -896,47 +1005,17 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) * the requests in rq_list might get lost. * * blk_mq_run_hw_queue() already checks the STOPPED bit - **/ - blk_mq_run_hw_queue(hctx, true); + * + * If RESTART is set, then let completion restart the queue + * instead of potentially looping here. 
+ */ + if (!blk_mq_sched_needs_restart(hctx)) + blk_mq_run_hw_queue(hctx, true); } return ret != BLK_MQ_RQ_QUEUE_BUSY; } -/* - * Run this hardware queue, pulling any software queues mapped to it in. - * Note that this function currently has various problems around ordering - * of IO. In particular, we'd like FIFO behaviour on handling existing - * items on the hctx->dispatch list. Ignore that for now. - */ -static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) -{ - LIST_HEAD(rq_list); - - if (unlikely(blk_mq_hctx_stopped(hctx))) - return; - - hctx->run++; - - /* - * Touch any software queue that has pending entries. - */ - flush_busy_ctxs(hctx, &rq_list); - - /* - * If we have previous entries on our dispatch list, grab them - * and stuff them at the front for more fair dispatch. - */ - if (!list_empty_careful(&hctx->dispatch)) { - spin_lock(&hctx->lock); - if (!list_empty(&hctx->dispatch)) - list_splice_init(&hctx->dispatch, &rq_list); - spin_unlock(&hctx->lock); - } - - blk_mq_dispatch_rq_list(hctx, &rq_list); -} - static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; @@ -946,11 +1025,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_process_rq_list(hctx); + blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); - blk_mq_process_rq_list(hctx); + blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); } } @@ -1006,8 +1085,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if ((!blk_mq_hctx_has_pending(hctx) && - list_empty_careful(&hctx->dispatch)) || + if (!blk_mq_hctx_has_pending(hctx) || blk_mq_hctx_stopped(hctx)) continue; @@ -1116,6 +1194,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) if (unlikely(!blk_mq_hw_queue_mapped(hctx))) return; + 
blk_mq_stop_hw_queue(hctx); kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->delay_work, msecs_to_jiffies(msecs)); } @@ -1135,8 +1214,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, list_add_tail(&rq->queuelist, &ctx->rq_list); } -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; @@ -1144,32 +1223,10 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, blk_mq_hctx_mark_pending(hctx, ctx); } -void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, - bool async) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - - if (run_queue) - blk_mq_run_hw_queue(hctx, async); -} - -static void blk_mq_insert_requests(struct request_queue *q, - struct blk_mq_ctx *ctx, - struct list_head *list, - int depth, - bool from_schedule) +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list) { - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - - trace_block_unplug(q, depth, !from_schedule); - /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now @@ -1185,8 +1242,6 @@ static void blk_mq_insert_requests(struct request_queue *q, } blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); - - blk_mq_run_hw_queue(hctx, from_schedule); } static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1222,9 +1277,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(!rq->q); if (rq->mq_ctx != this_ctx) { if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, - &ctx_list, depth, - from_schedule); + 
trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, + &ctx_list, + from_schedule); } this_ctx = rq->mq_ctx; @@ -1241,8 +1297,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * on 'ctx_list'. Do those. */ if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, - from_schedule); + trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, + from_schedule); } } @@ -1280,46 +1337,39 @@ insert_rq: } spin_unlock(&ctx->lock); - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); return true; } } -static struct request *blk_mq_map_request(struct request_queue *q, - struct bio *bio, - struct blk_mq_alloc_data *data) +static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - struct request *rq; - - blk_queue_enter_live(q); - ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, ctx->cpu); + if (rq->tag != -1) + return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); - trace_block_getrq(q, bio, bio->bi_opf); - blk_mq_set_alloc_data(data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(data, bio->bi_opf); - - data->hctx->queued++; - return rq; + return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) { - int ret; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, .last = 1 }; - blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); + struct blk_mq_hw_ctx *hctx; + blk_qc_t new_cookie; + int ret; - if (blk_mq_hctx_stopped(hctx)) + if (q->elevator) goto insert; + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + goto insert; + + new_cookie = request_to_qc_t(hctx, rq); + /* * For OK queue, we are done. For error, kill it. 
Any other * error (busy), just add it to our list as we previously @@ -1341,7 +1391,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) } insert: - blk_mq_insert_request(rq, false, true, true); + blk_mq_sched_insert_request(rq, false, true, true, false); } /* @@ -1352,8 +1402,8 @@ insert: static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); - struct blk_mq_alloc_data data; + const int is_flush_fua = op_is_flush(bio->bi_opf); + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; unsigned int request_count = 0, srcu_idx; struct blk_plug *plug; @@ -1374,9 +1424,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return BLK_QC_T_NONE; + if (blk_mq_sched_bio_merge(q, bio)) + return BLK_QC_T_NONE; + wb_acct = wbt_wait(q->rq_wb, bio, NULL); - rq = blk_mq_map_request(q, bio, &data); + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; @@ -1384,12 +1439,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) wbt_track(&rq->issue_stat, wb_acct); - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); + blk_mq_get_driver_tag(rq, NULL, true); blk_insert_flush(rq); - goto run_queue; + blk_mq_run_hw_queue(data.hctx, true); + goto done; } plug = current->plug; @@ -1438,6 +1496,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) goto done; } + if (q->elevator) { + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, + !is_sync || 
is_flush_fua, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. For @@ -1445,7 +1510,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) * latter allows for merging opportunities and more efficient * dispatching. */ -run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } blk_mq_put_ctx(data.ctx); @@ -1460,10 +1524,10 @@ done: static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_plug *plug; unsigned int request_count = 0; - struct blk_mq_alloc_data data; + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; blk_qc_t cookie; unsigned int wb_acct; @@ -1483,9 +1547,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); + if (blk_mq_sched_bio_merge(q, bio)) + return BLK_QC_T_NONE; + wb_acct = wbt_wait(q->rq_wb, bio, NULL); - rq = blk_mq_map_request(q, bio, &data); + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; @@ -1493,12 +1562,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) wbt_track(&rq->issue_stat, wb_acct); - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); + blk_mq_get_driver_tag(rq, NULL, true); blk_insert_flush(rq); - goto run_queue; + blk_mq_run_hw_queue(data.hctx, true); + goto done; } /* @@ -1535,6 +1607,13 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) return cookie; } + if (q->elevator) { + 
blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, + !is_sync || is_flush_fua, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. For @@ -1542,16 +1621,16 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) * latter allows for merging opportunities and more efficient * dispatching. */ -run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } blk_mq_put_ctx(data.ctx); +done: return cookie; } -static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, - struct blk_mq_tags *tags, unsigned int hctx_idx) +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx) { struct page *page; @@ -1559,11 +1638,13 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, int i; for (i = 0; i < tags->nr_tags; i++) { - if (!tags->rqs[i]) + struct request *rq = tags->static_rqs[i]; + + if (!rq) continue; - set->ops->exit_request(set->driver_data, tags->rqs[i], + set->ops->exit_request(set->driver_data, rq, hctx_idx, i); - tags->rqs[i] = NULL; + tags->static_rqs[i] = NULL; } } @@ -1577,33 +1658,32 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, kmemleak_free(page_address(page)); __free_pages(page, page->private); } +} +void blk_mq_free_rq_map(struct blk_mq_tags *tags) +{ kfree(tags->rqs); + tags->rqs = NULL; + kfree(tags->static_rqs); + tags->static_rqs = NULL; blk_mq_free_tags(tags); } -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - -static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags) { struct blk_mq_tags *tags; - unsigned int i, j, entries_per_page, max_order = 4; - size_t rq_size, left; - tags = 
blk_mq_init_tags(set->queue_depth, set->reserved_tags, + tags = blk_mq_init_tags(nr_tags, reserved_tags, set->numa_node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; - INIT_LIST_HEAD(&tags->page_list); - - tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), + tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, set->numa_node); if (!tags->rqs) { @@ -1611,15 +1691,40 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, return NULL; } + tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + set->numa_node); + if (!tags->static_rqs) { + kfree(tags->rqs); + blk_mq_free_tags(tags); + return NULL; + } + + return tags; +} + +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) +{ + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + + INIT_LIST_HEAD(&tags->page_list); + /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); - left = rq_size * set->queue_depth; + left = rq_size * depth; - for (i = 0; i < set->queue_depth; ) { + for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; @@ -1653,15 +1758,17 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, */ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; - to_do = min(entries_per_page, set->queue_depth - i); + to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { - tags->rqs[i] = p; + struct request *rq = p; + + tags->static_rqs[i] = rq; if (set->ops->init_request) { if 
(set->ops->init_request(set->driver_data, - tags->rqs[i], hctx_idx, i, + rq, hctx_idx, i, set->numa_node)) { - tags->rqs[i] = NULL; + tags->static_rqs[i] = NULL; goto fail; } } @@ -1670,11 +1777,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; + return 0; fail: - blk_mq_free_rq_map(set, tags, hctx_idx); - return NULL; + blk_mq_free_rqs(set, tags, hctx_idx); + return -ENOMEM; } /* @@ -1866,6 +1973,35 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } +static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) +{ + int ret = 0; + + set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, + set->queue_depth, set->reserved_tags); + if (!set->tags[hctx_idx]) + return false; + + ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, + set->queue_depth); + if (!ret) + return true; + + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + return false; +} + +static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + if (set->tags[hctx_idx]) { + blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + } +} + static void blk_mq_map_swqueue(struct request_queue *q, const struct cpumask *online_mask) { @@ -1894,17 +2030,15 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx_idx = q->mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ - if (!set->tags[hctx_idx]) { - set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx); - + if (!set->tags[hctx_idx] && + !__blk_mq_alloc_rq_map(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. 
In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ - if (!set->tags[hctx_idx]) - q->mq_map[i] = 0; + q->mq_map[i] = 0; } ctx = per_cpu_ptr(q->queue_ctx, i); @@ -1927,10 +2061,9 @@ static void blk_mq_map_swqueue(struct request_queue *q, * fallback in case of a new remap fails * allocation */ - if (i && set->tags[i]) { - blk_mq_free_rq_map(set, set->tags[i], i); - set->tags[i] = NULL; - } + if (i && set->tags[i]) + blk_mq_free_map_and_requests(set, i); + hctx->tags = NULL; continue; } @@ -2023,6 +2156,8 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned int i; + blk_mq_sched_teardown(q); + /* hctx kobj stays in hctx */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx) @@ -2097,10 +2232,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - if (hctx->tags) { - blk_mq_free_rq_map(set, hctx->tags, j); - set->tags[j] = NULL; - } + if (hctx->tags) + blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); free_cpumask_var(hctx->cpumask); kobject_put(&hctx->kobj); @@ -2181,6 +2314,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, mutex_unlock(&all_q_mutex); put_online_cpus(); + if (!(set->flags & BLK_MQ_F_NO_SCHED)) { + int ret; + + ret = blk_mq_sched_init(q); + if (ret) + return ERR_PTR(ret); + } + return q; err_hctxs: @@ -2279,10 +2420,10 @@ static int blk_mq_queue_reinit_dead(unsigned int cpu) * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list * and set bit0 in pending bitmap as ctx1->index_hw is still zero. * - * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in - * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. - * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list - * is ignored. 
+ * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set + * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. + * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is + * ignored. */ static int blk_mq_queue_reinit_prepare(unsigned int cpu) { @@ -2296,17 +2437,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) { - set->tags[i] = blk_mq_init_rq_map(set, i); - if (!set->tags[i]) + for (i = 0; i < set->nr_hw_queues; i++) + if (!__blk_mq_alloc_rq_map(set, i)) goto out_unwind; - } return 0; out_unwind: while (--i >= 0) - blk_mq_free_rq_map(set, set->tags[i], i); + blk_mq_free_rq_map(set->tags[i]); return -ENOMEM; } @@ -2430,10 +2569,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < nr_cpu_ids; i++) { - if (set->tags[i]) - blk_mq_free_rq_map(set, set->tags[i], i); - } + for (i = 0; i < nr_cpu_ids; i++) + blk_mq_free_map_and_requests(set, i); kfree(set->mq_map); set->mq_map = NULL; @@ -2449,14 +2586,28 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) struct blk_mq_hw_ctx *hctx; int i, ret; - if (!set || nr > set->queue_depth) + if (!set) return -EINVAL; + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; - ret = blk_mq_tag_update_depth(hctx->tags, nr); + /* + * If we're using an MQ scheduler, just update the scheduler + * queue depth. This is similar to what the old code would do. 
+ */ + if (!hctx->sched_tags) { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, + min(nr, set->queue_depth), + false); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, + nr, true); + } if (ret) break; } @@ -2464,6 +2615,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) q->nr_requests = nr; + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + return ret; } @@ -2649,7 +2803,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) blk_flush_plug_list(plug, false); hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; - rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + if (!blk_qc_t_is_internal(cookie)) + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + else + rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); return __blk_mq_poll(hctx, rq); } @@ -2667,6 +2824,8 @@ void blk_mq_enable_hotplug(void) static int __init blk_mq_init(void) { + blk_mq_debugfs_init(); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); diff --git a/block/blk-mq.h b/block/blk-mq.h index 63e9116cddbd..b52abd62b1b0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -32,8 +32,32 @@ void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); +bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, + bool wait); /* + * Internal helpers for allocating/freeing the request map + */ +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx); +void blk_mq_free_rq_map(struct blk_mq_tags *tags); +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set 
*set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags); +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth); + +/* + * Internal helpers for request insertion into sw queues + */ +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head); +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list); +/* * CPU hotplug helpers */ void blk_mq_enable_hotplug(void); @@ -57,6 +81,40 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); +/* + * debugfs helpers + */ +#ifdef CONFIG_BLK_DEBUG_FS +void blk_mq_debugfs_init(void); +int blk_mq_debugfs_register(struct request_queue *q, const char *name); +void blk_mq_debugfs_unregister(struct request_queue *q); +int blk_mq_debugfs_register_hctxs(struct request_queue *q); +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); +#else +static inline void blk_mq_debugfs_init(void) +{ +} + +static inline int blk_mq_debugfs_register(struct request_queue *q, + const char *name) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister(struct request_queue *q) +{ +} + +static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ +} +#endif + extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); @@ -103,6 +161,25 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) +{ + if (data->flags & BLK_MQ_REQ_INTERNAL) + return data->hctx->sched_tags; + + return data->hctx->tags; +} + +/* + * Internal helpers for request 
allocation/init/free + */ +void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int op); +void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq); +void blk_mq_finish_request(struct request *rq); +struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, + unsigned int op); + static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_STOPPED, &hctx->state); diff --git a/block/blk-tag.c b/block/blk-tag.c index bae1decb6ec3..07cc329fa4b0 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) list_del_init(&rq->queuelist); rq->rq_flags &= ~RQF_QUEUED; rq->tag = -1; + rq->internal_tag = -1; if (unlikely(bqt->tag_index[tag] == NULL)) printk(KERN_ERR "%s: tag %d is missing\n", diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a6bb4fe326c3..82fd0cc394eb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -866,10 +866,12 @@ static void tg_update_disptime(struct throtl_grp *tg) unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; struct bio *bio; - if ((bio = throtl_peek_queued(&sq->queued[READ]))) + bio = throtl_peek_queued(&sq->queued[READ]); + if (bio) tg_may_dispatch(tg, bio, &read_wait); - if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) + bio = throtl_peek_queued(&sq->queued[WRITE]); + if (bio) tg_may_dispatch(tg, bio, &write_wait); min_wait = min(read_wait, write_wait); diff --git a/block/blk.h b/block/blk.h index 041185e5f129..9a716b5925a4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -167,7 +167,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) return NULL; } if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) + !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) return NULL; } } @@ -176,16 +176,16 @@ static inline void 
elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_activate_req_fn) - e->type->ops.elevator_activate_req_fn(q, rq); + if (e->type->ops.sq.elevator_activate_req_fn) + e->type->ops.sq.elevator_activate_req_fn(q, rq); } static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_deactivate_req_fn) - e->type->ops.elevator_deactivate_req_fn(q, rq); + if (e->type->ops.sq.elevator_deactivate_req_fn) + e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -264,6 +264,22 @@ void ioc_clear_queue(struct request_queue *q); int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); /** + * rq_ioc - determine io_context for request allocation + * @bio: request being allocated is for this bio (can be %NULL) + * + * Determine io_context to use for request allocation for @bio. May return + * %NULL if %current->io_context doesn't exist. + */ +static inline struct io_context *rq_ioc(struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (bio && bio->bi_ioc) + return bio->bi_ioc; +#endif + return current->io_context; +} + +/** * create_io_context - try to create task->io_context * @gfp_mask: allocation mask * @node: allocation node diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c73a6fcaeb9d..f0f29ee731e1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2749,9 +2749,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) if (!cfqg) return NULL; - for_each_cfqg_st(cfqg, i, j, st) - if ((cfqq = cfq_rb_first(st)) != NULL) + for_each_cfqg_st(cfqg, i, j, st) { + cfqq = cfq_rb_first(st); + if (cfqq) return cfqq; + } return NULL; } @@ -3864,6 +3866,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } + /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. 
*/ + cfqq->ioprio_class = IOPRIO_CLASS_NONE; cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, cic); cfq_link_cfqq_cfqg(cfqq, cfqg); @@ -4837,7 +4841,7 @@ static struct elv_fs_entry cfq_attrs[] = { }; static struct elevator_type iosched_cfq = { - .ops = { + .ops.sq = { .elevator_merge_fn = cfq_merge, .elevator_merged_fn = cfq_merged_request, .elevator_merge_req_fn = cfq_merged_requests, diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 55e0bb6d7da7..05fc0ea25a98 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -439,7 +439,7 @@ static struct elv_fs_entry deadline_attrs[] = { }; static struct elevator_type iosched_deadline = { - .ops = { + .ops.sq = { .elevator_merge_fn = deadline_merge, .elevator_merged_fn = deadline_merged_request, .elevator_merge_req_fn = deadline_merged_requests, diff --git a/block/elevator.c b/block/elevator.c index 40f0c04e5ad3..b2a55167f0c2 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -40,6 +40,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-mq-sched.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -58,8 +59,10 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_allow_bio_merge_fn) - return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio); + if (e->uses_mq && e->type->ops.mq.allow_merge) + return e->type->ops.mq.allow_merge(q, rq, bio); + else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn) + return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio); return 1; } @@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); + eq->uses_mq = e->uses_mq; return eq; } @@ -203,11 +207,12 @@ int elevator_init(struct request_queue *q, char *name) } /* - * Use the default 
elevator specified by config boot param or - * config option. Don't try to load modules as we could be running - * off async and request_module() isn't allowed from async. + * Use the default elevator specified by config boot param for + * non-mq devices, or by config option. Don't try to load modules + * as we could be running off async and request_module() isn't + * allowed from async. */ - if (!e && *chosen_elevator) { + if (!e && !q->mq_ops && *chosen_elevator) { e = elevator_get(chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", @@ -215,18 +220,32 @@ int elevator_init(struct request_queue *q, char *name) } if (!e) { - e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + if (q->mq_ops && q->nr_hw_queues == 1) + e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false); + else if (q->mq_ops) + e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false); + else + e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + if (!e) { printk(KERN_ERR "Default I/O scheduler not found. " \ - "Using noop.\n"); + "Using noop/none.\n"); e = elevator_get("noop", false); } } - err = e->ops.elevator_init_fn(q, e); - if (err) + if (e->uses_mq) { + err = blk_mq_sched_setup(q); + if (!err) + err = e->ops.mq.init_sched(q, e); + } else + err = e->ops.sq.elevator_init_fn(q, e); + if (err) { + if (e->uses_mq) + blk_mq_sched_teardown(q); elevator_put(e); + } return err; } EXPORT_SYMBOL(elevator_init); @@ -234,8 +253,10 @@ EXPORT_SYMBOL(elevator_init); void elevator_exit(struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); - if (e->type->ops.elevator_exit_fn) - e->type->ops.elevator_exit_fn(e); + if (e->uses_mq && e->type->ops.mq.exit_sched) + e->type->ops.mq.exit_sched(e); + else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) + e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); @@ -253,6 +274,7 @@ void elv_rqhash_del(struct request_queue *q, struct request *rq) if (ELV_ON_HASH(rq)) __elv_rqhash_del(rq); } 
+EXPORT_SYMBOL_GPL(elv_rqhash_del); void elv_rqhash_add(struct request_queue *q, struct request *rq) { @@ -262,6 +284,7 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq) hash_add(e->hash, &rq->hash, rq_hash_key(rq)); rq->rq_flags |= RQF_HASHED; } +EXPORT_SYMBOL_GPL(elv_rqhash_add); void elv_rqhash_reposition(struct request_queue *q, struct request *rq) { @@ -443,8 +466,10 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) return ELEVATOR_BACK_MERGE; } - if (e->type->ops.elevator_merge_fn) - return e->type->ops.elevator_merge_fn(q, req, bio); + if (e->uses_mq && e->type->ops.mq.request_merge) + return e->type->ops.mq.request_merge(q, req, bio); + else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn) + return e->type->ops.sq.elevator_merge_fn(q, req, bio); return ELEVATOR_NO_MERGE; } @@ -456,8 +481,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * * Returns true if we merged, false otherwise */ -static bool elv_attempt_insert_merge(struct request_queue *q, - struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) { struct request *__rq; bool ret; @@ -495,8 +519,10 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_merged_fn) - e->type->ops.elevator_merged_fn(q, rq, type); + if (e->uses_mq && e->type->ops.mq.request_merged) + e->type->ops.mq.request_merged(q, rq, type); + else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn) + e->type->ops.sq.elevator_merged_fn(q, rq, type); if (type == ELEVATOR_BACK_MERGE) elv_rqhash_reposition(q, rq); @@ -508,10 +534,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; - const int next_sorted = next->rq_flags & RQF_SORTED; - - if (next_sorted && e->type->ops.elevator_merge_req_fn) - 
e->type->ops.elevator_merge_req_fn(q, rq, next); + bool next_sorted = false; + + if (e->uses_mq && e->type->ops.mq.requests_merged) + e->type->ops.mq.requests_merged(q, rq, next); + else if (e->type->ops.sq.elevator_merge_req_fn) { + next_sorted = next->rq_flags & RQF_SORTED; + if (next_sorted) + e->type->ops.sq.elevator_merge_req_fn(q, rq, next); + } elv_rqhash_reposition(q, rq); @@ -528,8 +559,11 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_bio_merged_fn) - e->type->ops.elevator_bio_merged_fn(q, rq, bio); + if (WARN_ON_ONCE(e->uses_mq)) + return; + + if (e->type->ops.sq.elevator_bio_merged_fn) + e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); } #ifdef CONFIG_PM @@ -574,11 +608,15 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) void elv_drain_elevator(struct request_queue *q) { + struct elevator_queue *e = q->elevator; static int printed; + if (WARN_ON_ONCE(e->uses_mq)) + return; + lockdep_assert_held(q->queue_lock); - while (q->elevator->type->ops.elevator_dispatch_fn(q, 1)) + while (e->type->ops.sq.elevator_dispatch_fn(q, 1)) ; if (q->nr_sorted && printed++ < 10) { printk(KERN_ERR "%s: forced dispatching is broken " @@ -653,7 +691,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) * rq cannot be accessed after calling * elevator_add_req_fn. 
*/ - q->elevator->type->ops.elevator_add_req_fn(q, rq); + q->elevator->type->ops.sq.elevator_add_req_fn(q, rq); break; case ELEVATOR_INSERT_FLUSH: @@ -682,8 +720,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_latter_req_fn) - return e->type->ops.elevator_latter_req_fn(q, rq); + if (e->uses_mq && e->type->ops.mq.next_request) + return e->type->ops.mq.next_request(q, rq); + else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn) + return e->type->ops.sq.elevator_latter_req_fn(q, rq); + return NULL; } @@ -691,8 +732,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_former_req_fn) - return e->type->ops.elevator_former_req_fn(q, rq); + if (e->uses_mq && e->type->ops.mq.former_request) + return e->type->ops.mq.former_request(q, rq); + if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn) + return e->type->ops.sq.elevator_former_req_fn(q, rq); return NULL; } @@ -701,8 +744,11 @@ int elv_set_request(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_set_req_fn) - return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); + if (WARN_ON_ONCE(e->uses_mq)) + return 0; + + if (e->type->ops.sq.elevator_set_req_fn) + return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask); return 0; } @@ -710,16 +756,22 @@ void elv_put_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_put_req_fn) - e->type->ops.elevator_put_req_fn(rq); + if (WARN_ON_ONCE(e->uses_mq)) + return; + + if (e->type->ops.sq.elevator_put_req_fn) + e->type->ops.sq.elevator_put_req_fn(rq); } int elv_may_queue(struct request_queue *q, unsigned int op) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_may_queue_fn) - return 
e->type->ops.elevator_may_queue_fn(q, op); + if (WARN_ON_ONCE(e->uses_mq)) + return 0; + + if (e->type->ops.sq.elevator_may_queue_fn) + return e->type->ops.sq.elevator_may_queue_fn(q, op); return ELV_MQUEUE_MAY; } @@ -728,14 +780,17 @@ void elv_completed_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + if (WARN_ON_ONCE(e->uses_mq)) + return; + /* * request is released from the driver, io must be done */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; if ((rq->rq_flags & RQF_SORTED) && - e->type->ops.elevator_completed_req_fn) - e->type->ops.elevator_completed_req_fn(q, rq); + e->type->ops.sq.elevator_completed_req_fn) + e->type->ops.sq.elevator_completed_req_fn(q, rq); } } @@ -803,8 +858,8 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; - if (e->type->ops.elevator_registered_fn) - e->type->ops.elevator_registered_fn(q); + if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn) + e->type->ops.sq.elevator_registered_fn(q); } return error; } @@ -891,9 +946,14 @@ EXPORT_SYMBOL_GPL(elv_unregister); static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old = q->elevator; - bool registered = old->registered; + bool old_registered = false; int err; + if (q->mq_ops) { + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + } + /* * Turn on BYPASS and drain all requests w/ elevator private data. * Block layer doesn't call into a quiesced elevator - all requests @@ -901,42 +961,76 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) * using INSERT_BACK. All requests have SOFTBARRIER set and no * merge happens either. 
*/ - blk_queue_bypass_start(q); + if (old) { + old_registered = old->registered; - /* unregister and clear all auxiliary data of the old elevator */ - if (registered) - elv_unregister_queue(q); + if (old->uses_mq) + blk_mq_sched_teardown(q); - spin_lock_irq(q->queue_lock); - ioc_clear_queue(q); - spin_unlock_irq(q->queue_lock); + if (!q->mq_ops) + blk_queue_bypass_start(q); + + /* unregister and clear all auxiliary data of the old elevator */ + if (old_registered) + elv_unregister_queue(q); + + spin_lock_irq(q->queue_lock); + ioc_clear_queue(q); + spin_unlock_irq(q->queue_lock); + } /* allocate, init and register new elevator */ - err = new_e->ops.elevator_init_fn(q, new_e); - if (err) - goto fail_init; + if (new_e) { + if (new_e->uses_mq) { + err = blk_mq_sched_setup(q); + if (!err) + err = new_e->ops.mq.init_sched(q, new_e); + } else + err = new_e->ops.sq.elevator_init_fn(q, new_e); + if (err) + goto fail_init; - if (registered) { err = elv_register_queue(q); if (err) goto fail_register; - } + } else + q->elevator = NULL; /* done, kill the old one and finish */ - elevator_exit(old); - blk_queue_bypass_end(q); + if (old) { + elevator_exit(old); + if (!q->mq_ops) + blk_queue_bypass_end(q); + } - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + if (q->mq_ops) { + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + } + + if (new_e) + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + else + blk_add_trace_msg(q, "elv switch: none"); return 0; fail_register: + if (q->mq_ops) + blk_mq_sched_teardown(q); elevator_exit(q->elevator); fail_init: /* switch failed, restore and re-register old elevator */ - q->elevator = old; - elv_register_queue(q); - blk_queue_bypass_end(q); + if (old) { + q->elevator = old; + elv_register_queue(q); + if (!q->mq_ops) + blk_queue_bypass_end(q); + } + if (q->mq_ops) { + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + } return err; } @@ -949,8 +1043,11 @@ static int 
__elevator_change(struct request_queue *q, const char *name) char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; - if (!q->elevator) - return -ENXIO; + /* + * Special case for mq, turn off scheduling + */ + if (q->mq_ops && !strncmp(name, "none", 4)) + return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(strstrip(elevator_name), true); @@ -959,11 +1056,21 @@ static int __elevator_change(struct request_queue *q, const char *name) return -EINVAL; } - if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && + !strcmp(elevator_name, q->elevator->type->elevator_name)) { elevator_put(e); return 0; } + if (!e->uses_mq && q->mq_ops) { + elevator_put(e); + return -EINVAL; + } + if (e->uses_mq && !q->mq_ops) { + elevator_put(e); + return -EINVAL; + } + return elevator_switch(q, e); } @@ -985,7 +1092,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, { int ret; - if (!q->elevator) + if (!(q->mq_ops || q->request_fn)) return count; ret = __elevator_change(q, name); @@ -999,24 +1106,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, ssize_t elv_iosched_show(struct request_queue *q, char *name) { struct elevator_queue *e = q->elevator; - struct elevator_type *elv; + struct elevator_type *elv = NULL; struct elevator_type *__e; int len = 0; - if (!q->elevator || !blk_queue_stackable(q)) + if (!blk_queue_stackable(q)) return sprintf(name, "none\n"); - elv = e->type; + if (!q->elevator) + len += sprintf(name+len, "[none] "); + else + elv = e->type; spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (!strcmp(elv->elevator_name, __e->elevator_name)) + if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); - else + continue; + } + if (__e->uses_mq && q->mq_ops) + len += sprintf(name+len, "%s ", __e->elevator_name); + else if (!__e->uses_mq && !q->mq_ops) len += 
sprintf(name+len, "%s ", __e->elevator_name); } spin_unlock(&elv_list_lock); + if (q->mq_ops && q->elevator) + len += sprintf(name+len, "none"); + len += sprintf(len+name, "\n"); return len; } diff --git a/block/mq-deadline.c b/block/mq-deadline.c new file mode 100644 index 000000000000..d93ec713fa62 --- /dev/null +++ b/block/mq-deadline.c @@ -0,0 +1,555 @@ +/* + * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, + * for the blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk> + */ +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/elevator.h> +#include <linux/bio.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/rbtree.h> +#include <linux/sbitmap.h> + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" + +/* + * See Documentation/block/deadline-iosched.txt + */ +static const int read_expire = HZ / 2; /* max time before a read is submitted. */ +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +static const int writes_starved = 2; /* max times reads can starve a write */ +static const int fifo_batch = 16; /* # of sequential requests treated as one + by the above parameters. For throughput. */ + +struct deadline_data { + /* + * run time data + */ + + /* + * requests (deadline_rq s) are present on both sort_list and fifo_list + */ + struct rb_root sort_list[2]; + struct list_head fifo_list[2]; + + /* + * next in sort order. 
read, write or both are NULL + */ + struct request *next_rq[2]; + unsigned int batching; /* number of sequential requests made */ + unsigned int starved; /* times reads have starved writes */ + + /* + * settings that change how the i/o scheduler behaves + */ + int fifo_expire[2]; + int fifo_batch; + int writes_starved; + int front_merges; + + spinlock_t lock; + struct list_head dispatch; +}; + +static inline struct rb_root * +deadline_rb_root(struct deadline_data *dd, struct request *rq) +{ + return &dd->sort_list[rq_data_dir(rq)]; +} + +/* + * get the request after `rq' in sector-sorted order + */ +static inline struct request * +deadline_latter_request(struct request *rq) +{ + struct rb_node *node = rb_next(&rq->rb_node); + + if (node) + return rb_entry_rq(node); + + return NULL; +} + +static void +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) +{ + struct rb_root *root = deadline_rb_root(dd, rq); + + elv_rb_add(root, rq); +} + +static inline void +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) +{ + const int data_dir = rq_data_dir(rq); + + if (dd->next_rq[data_dir] == rq) + dd->next_rq[data_dir] = deadline_latter_request(rq); + + elv_rb_del(deadline_rb_root(dd, rq), rq); +} + +/* + * remove rq from rbtree and fifo. 
+ */ +static void deadline_remove_request(struct request_queue *q, struct request *rq) +{ + struct deadline_data *dd = q->elevator->elevator_data; + + list_del_init(&rq->queuelist); + + /* + * We might not be on the rbtree, if we are doing an insert merge + */ + if (!RB_EMPTY_NODE(&rq->rb_node)) + deadline_del_rq_rb(dd, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; +} + +static void dd_request_merged(struct request_queue *q, struct request *req, + int type) +{ + struct deadline_data *dd = q->elevator->elevator_data; + + /* + * if the merge was a front merge, we need to reposition request + */ + if (type == ELEVATOR_FRONT_MERGE) { + elv_rb_del(deadline_rb_root(dd, req), req); + deadline_add_rq_rb(dd, req); + } +} + +static void dd_merged_requests(struct request_queue *q, struct request *req, + struct request *next) +{ + /* + * if next expires before rq, assign its expire time to rq + * and move into next position (next will be deleted) in fifo + */ + if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { + if (time_before((unsigned long)next->fifo_time, + (unsigned long)req->fifo_time)) { + list_move(&req->queuelist, &next->queuelist); + req->fifo_time = next->fifo_time; + } + } + + /* + * kill knowledge of next, this one is a goner + */ + deadline_remove_request(q, next); +} + +/* + * move an entry to dispatch queue + */ +static void +deadline_move_request(struct deadline_data *dd, struct request *rq) +{ + const int data_dir = rq_data_dir(rq); + + dd->next_rq[READ] = NULL; + dd->next_rq[WRITE] = NULL; + dd->next_rq[data_dir] = deadline_latter_request(rq); + + /* + * take it off the sort and fifo list + */ + deadline_remove_request(rq->q, rq); +} + +/* + * deadline_check_fifo returns 0 if there are no expired requests on the fifo, + * 1 otherwise. 
Requires !list_empty(&dd->fifo_list[data_dir]) + */ +static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) +{ + struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); + + /* + * rq is expired! + */ + if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) + return 1; + + return 0; +} + +/* + * deadline_dispatch_requests selects the best request according to + * read/write expire, fifo_batch, etc + */ +static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + struct request *rq; + bool reads, writes; + int data_dir; + + if (!list_empty(&dd->dispatch)) { + rq = list_first_entry(&dd->dispatch, struct request, queuelist); + list_del_init(&rq->queuelist); + goto done; + } + + reads = !list_empty(&dd->fifo_list[READ]); + writes = !list_empty(&dd->fifo_list[WRITE]); + + /* + * batches are currently reads XOR writes + */ + if (dd->next_rq[WRITE]) + rq = dd->next_rq[WRITE]; + else + rq = dd->next_rq[READ]; + + if (rq && dd->batching < dd->fifo_batch) + /* we have a next request are still entitled to batch */ + goto dispatch_request; + + /* + * at this point we are not running a batch. 
select the appropriate + * data direction (read / write) + */ + + if (reads) { + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); + + if (writes && (dd->starved++ >= dd->writes_starved)) + goto dispatch_writes; + + data_dir = READ; + + goto dispatch_find_request; + } + + /* + * there are either no reads or writes have been starved + */ + + if (writes) { +dispatch_writes: + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); + + dd->starved = 0; + + data_dir = WRITE; + + goto dispatch_find_request; + } + + return NULL; + +dispatch_find_request: + /* + * we are not running a batch, find best request for selected data_dir + */ + if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + /* + * A deadline has expired, the last request was in the other + * direction, or we have run out of higher-sectored requests. + * Start again from the request with the earliest expiry time. + */ + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + } else { + /* + * The last req was the same dir and we have a next request in + * sort order. No expired requests so continue on from here. + */ + rq = dd->next_rq[data_dir]; + } + + dd->batching = 0; + +dispatch_request: + /* + * rq is the selected appropriate request. + */ + dd->batching++; + deadline_move_request(dd, rq); +done: + rq->rq_flags |= RQF_STARTED; + return rq; +} + +static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + struct request *rq; + + spin_lock(&dd->lock); + rq = __dd_dispatch_request(hctx); + spin_unlock(&dd->lock); + + return rq; +} + +static void dd_exit_queue(struct elevator_queue *e) +{ + struct deadline_data *dd = e->elevator_data; + + BUG_ON(!list_empty(&dd->fifo_list[READ])); + BUG_ON(!list_empty(&dd->fifo_list[WRITE])); + + kfree(dd); +} + +/* + * initialize elevator private data (deadline_data). 
+ */ +static int dd_init_queue(struct request_queue *q, struct elevator_type *e) +{ + struct deadline_data *dd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); + if (!dd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = dd; + + INIT_LIST_HEAD(&dd->fifo_list[READ]); + INIT_LIST_HEAD(&dd->fifo_list[WRITE]); + dd->sort_list[READ] = RB_ROOT; + dd->sort_list[WRITE] = RB_ROOT; + dd->fifo_expire[READ] = read_expire; + dd->fifo_expire[WRITE] = write_expire; + dd->writes_starved = writes_starved; + dd->front_merges = 1; + dd->fifo_batch = fifo_batch; + spin_lock_init(&dd->lock); + INIT_LIST_HEAD(&dd->dispatch); + + q->elevator = eq; + return 0; +} + +static int dd_request_merge(struct request_queue *q, struct request **rq, + struct bio *bio) +{ + struct deadline_data *dd = q->elevator->elevator_data; + sector_t sector = bio_end_sector(bio); + struct request *__rq; + + if (!dd->front_merges) + return ELEVATOR_NO_MERGE; + + __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); + if (__rq) { + BUG_ON(sector != blk_rq_pos(__rq)); + + if (elv_bio_merge_ok(__rq, bio)) { + *rq = __rq; + return ELEVATOR_FRONT_MERGE; + } + } + + return ELEVATOR_NO_MERGE; +} + +static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + int ret; + + spin_lock(&dd->lock); + ret = blk_mq_sched_try_merge(q, bio); + spin_unlock(&dd->lock); + + return ret; +} + +/* + * add rq to rbtree and fifo + */ +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const int data_dir = rq_data_dir(rq); + + if (blk_mq_sched_try_insert_merge(q, rq)) + return; + + blk_mq_sched_request_inserted(rq); + + if (blk_mq_sched_bypass_insert(hctx, 
rq)) + return; + + if (at_head || rq->cmd_type != REQ_TYPE_FS) { + if (at_head) + list_add(&rq->queuelist, &dd->dispatch); + else + list_add_tail(&rq->queuelist, &dd->dispatch); + } else { + deadline_add_rq_rb(dd, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + + /* + * set expire time and add to fifo list + */ + rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; + list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + } +} + +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_lock(&dd->lock); + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(hctx, rq, at_head); + } + spin_unlock(&dd->lock); +} + +static bool dd_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + + return !list_empty_careful(&dd->dispatch) || + !list_empty_careful(&dd->fifo_list[0]) || + !list_empty_careful(&dd->fifo_list[1]); +} + +/* + * sysfs parts below + */ +static ssize_t +deadline_var_show(int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t +deadline_var_store(int *var, const char *page, size_t count) +{ + char *p = (char *) page; + + *var = simple_strtol(p, &p, 10); + return count; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return deadline_var_show(__data, (page)); \ +} +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); +SHOW_FUNCTION(deadline_writes_starved_show, 
dd->writes_starved, 0); +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data; \ + int ret = deadline_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); +#undef STORE_FUNCTION + +#define DD_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \ + deadline_##name##_store) + +static struct elv_fs_entry deadline_attrs[] = { + DD_ATTR(read_expire), + DD_ATTR(write_expire), + DD_ATTR(writes_starved), + DD_ATTR(front_merges), + DD_ATTR(fifo_batch), + __ATTR_NULL +}; + +static struct elevator_type mq_deadline = { + .ops.mq = { + .insert_requests = dd_insert_requests, + .dispatch_request = dd_dispatch_request, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .bio_merge = dd_bio_merge, + .request_merge = dd_request_merge, + .requests_merged = dd_merged_requests, + .request_merged = dd_request_merged, + .has_work = dd_has_work, + .init_sched = dd_init_queue, + .exit_sched = dd_exit_queue, + }, + + .uses_mq = true, + .elevator_attrs = deadline_attrs, + .elevator_name = "mq-deadline", + .elevator_owner = THIS_MODULE, +}; + +static int 
__init deadline_init(void) +{ + return elv_register(&mq_deadline); +} + +static void __exit deadline_exit(void) +{ + elv_unregister(&mq_deadline); +} + +module_init(deadline_init); +module_exit(deadline_exit); + +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ deadline IO scheduler"); diff --git a/block/noop-iosched.c b/block/noop-iosched.c index a163c487cf38..2d1b15d89b45 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -92,7 +92,7 @@ static void noop_exit_queue(struct elevator_queue *e) } static struct elevator_type elevator_noop = { - .ops = { + .ops.sq = { .elevator_merge_req_fn = noop_merged_requests, .elevator_dispatch_fn = noop_dispatch, .elevator_add_req_fn = noop_add_request, diff --git a/block/opal_proto.h b/block/opal_proto.h new file mode 100644 index 000000000000..f40c9acf8895 --- /dev/null +++ b/block/opal_proto.h @@ -0,0 +1,452 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli <rafael.antognolli@intel.com> + * Scott Bauer <scott.bauer@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/types.h> + +#ifndef _OPAL_PROTO_H +#define _OPAL_PROTO_H + +/* + * These constant values come from: + * SPC-4 section + * 6.30 SECURITY PROTOCOL IN command / table 265. 
+ */ +enum { + TCG_SECP_00 = 0, + TCG_SECP_01, +}; + +/* + * Token defs derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * 3.2.2 Data Stream Encoding + */ +enum opal_response_token { + OPAL_DTA_TOKENID_BYTESTRING = 0xe0, + OPAL_DTA_TOKENID_SINT = 0xe1, + OPAL_DTA_TOKENID_UINT = 0xe2, + OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0 +}; + +#define DTAERROR_NO_METHOD_STATUS 0x89 +#define GENERIC_HOST_SESSION_NUM 0x41 + +#define TPER_SYNC_SUPPORTED 0x01 + +#define TINY_ATOM_DATA_MASK 0x3F +#define TINY_ATOM_SIGNED 0x40 + +#define SHORT_ATOM_ID 0x80 +#define SHORT_ATOM_BYTESTRING 0x20 +#define SHORT_ATOM_SIGNED 0x10 +#define SHORT_ATOM_LEN_MASK 0xF + +#define MEDIUM_ATOM_ID 0xC0 +#define MEDIUM_ATOM_BYTESTRING 0x10 +#define MEDIUM_ATOM_SIGNED 0x8 +#define MEDIUM_ATOM_LEN_MASK 0x7 + +#define LONG_ATOM_ID 0xe0 +#define LONG_ATOM_BYTESTRING 0x2 +#define LONG_ATOM_SIGNED 0x1 + +/* Derived from TCG Core spec 2.01 Section: + * 3.2.2.1 + * Data Type + */ +#define TINY_ATOM_BYTE 0x7F +#define SHORT_ATOM_BYTE 0xBF +#define MEDIUM_ATOM_BYTE 0xDF +#define LONG_ATOM_BYTE 0xE3 + +#define OPAL_INVAL_PARAM 12 +#define OPAL_MANUFACTURED_INACTIVE 0x08 +#define OPAL_DISCOVERY_COMID 0x0001 + +#define LOCKING_RANGE_NON_GLOBAL 0x03 +/* + * User IDs used in the TCG storage SSCs + * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +#define OPAL_UID_LENGTH 8 +#define OPAL_METHOD_LENGTH 8 +#define OPAL_MSID_KEYLEN 15 +#define OPAL_UID_LENGTH_HALF 4 + +/* Enum to index OPALUID array */ +enum opal_uid { + /* users */ + OPAL_SMUID_UID, + OPAL_THISSP_UID, + OPAL_ADMINSP_UID, + OPAL_LOCKINGSP_UID, + OPAL_ENTERPRISE_LOCKINGSP_UID, + OPAL_ANYBODY_UID, + OPAL_SID_UID, + OPAL_ADMIN1_UID, + OPAL_USER1_UID, + OPAL_USER2_UID, + OPAL_PSID_UID, + OPAL_ENTERPRISE_BANDMASTER0_UID, + OPAL_ENTERPRISE_ERASEMASTER_UID, + /* tables */ + OPAL_LOCKINGRANGE_GLOBAL, + OPAL_LOCKINGRANGE_ACE_RDLOCKED, + 
OPAL_LOCKINGRANGE_ACE_WRLOCKED, + OPAL_MBRCONTROL, + OPAL_MBR, + OPAL_AUTHORITY_TABLE, + OPAL_C_PIN_TABLE, + OPAL_LOCKING_INFO_TABLE, + OPAL_ENTERPRISE_LOCKING_INFO_TABLE, + /* C_PIN_TABLE object ID's */ + OPAL_C_PIN_MSID, + OPAL_C_PIN_SID, + OPAL_C_PIN_ADMIN1, + /* half UID's (only first 4 bytes used) */ + OPAL_HALF_UID_AUTHORITY_OBJ_REF, + OPAL_HALF_UID_BOOLEAN_ACE, + /* omitted optional parameter */ + OPAL_UID_HEXFF, +}; + +#define OPAL_METHOD_LENGTH 8 + +/* Enum for indexing the OPALMETHOD array */ +enum opal_method { + OPAL_PROPERTIES, + OPAL_STARTSESSION, + OPAL_REVERT, + OPAL_ACTIVATE, + OPAL_EGET, + OPAL_ESET, + OPAL_NEXT, + OPAL_EAUTHENTICATE, + OPAL_GETACL, + OPAL_GENKEY, + OPAL_REVERTSP, + OPAL_GET, + OPAL_SET, + OPAL_AUTHENTICATE, + OPAL_RANDOM, + OPAL_ERASE, +}; + +enum opal_token { + /* Boolean */ + OPAL_TRUE = 0x01, + OPAL_FALSE = 0x00, + OPAL_BOOLEAN_EXPR = 0x03, + /* cellblocks */ + OPAL_TABLE = 0x00, + OPAL_STARTROW = 0x01, + OPAL_ENDROW = 0x02, + OPAL_STARTCOLUMN = 0x03, + OPAL_ENDCOLUMN = 0x04, + OPAL_VALUES = 0x01, + /* authority table */ + OPAL_PIN = 0x03, + /* locking tokens */ + OPAL_RANGESTART = 0x03, + OPAL_RANGELENGTH = 0x04, + OPAL_READLOCKENABLED = 0x05, + OPAL_WRITELOCKENABLED = 0x06, + OPAL_READLOCKED = 0x07, + OPAL_WRITELOCKED = 0x08, + OPAL_ACTIVEKEY = 0x0A, + /* locking info table */ + OPAL_MAXRANGES = 0x04, + /* mbr control */ + OPAL_MBRENABLE = 0x01, + OPAL_MBRDONE = 0x02, + /* properties */ + OPAL_HOSTPROPERTIES = 0x00, + /* atoms */ + OPAL_STARTLIST = 0xf0, + OPAL_ENDLIST = 0xf1, + OPAL_STARTNAME = 0xf2, + OPAL_ENDNAME = 0xf3, + OPAL_CALL = 0xf8, + OPAL_ENDOFDATA = 0xf9, + OPAL_ENDOFSESSION = 0xfa, + OPAL_STARTTRANSACTON = 0xfb, + OPAL_ENDTRANSACTON = 0xfC, + OPAL_EMPTYATOM = 0xff, + OPAL_WHERE = 0x00, +}; + +/* Locking state for a locking range */ +enum opal_lockingstate { + OPAL_LOCKING_READWRITE = 0x01, + OPAL_LOCKING_READONLY = 0x02, + OPAL_LOCKING_LOCKED = 0x03, +}; + +/* Packets derived from: + * 
TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 3.2.3 ComPackets, Packets & Subpackets + */ + +/* Comm Packet (header) for transmissions. */ +struct opal_compacket { + __be32 reserved0; + u8 extendedComID[4]; + __be32 outstandingData; + __be32 minTransfer; + __be32 length; +}; + +/* Packet structure. */ +struct opal_packet { + __be32 tsn; + __be32 hsn; + __be32 seq_number; + __be16 reserved0; + __be16 ack_type; + __be32 acknowledgment; + __be32 length; +}; + +/* Data sub packet header */ +struct opal_data_subpacket { + u8 reserved0[6]; + __be16 kind; + __be32 length; +}; + +/* header of a response */ +struct opal_header { + struct opal_compacket cp; + struct opal_packet pkt; + struct opal_data_subpacket subpkt; +}; + +#define FC_TPER 0x0001 +#define FC_LOCKING 0x0002 +#define FC_GEOMETRY 0x0003 +#define FC_ENTERPRISE 0x0100 +#define FC_DATASTORE 0x0202 +#define FC_SINGLEUSER 0x0201 +#define FC_OPALV100 0x0200 +#define FC_OPALV200 0x0203 + +/* + * The Discovery 0 Header. As defined in + * Opal SSC Documentation + * Section: 3.3.5 Capability Discovery + */ +struct d0_header { + __be32 length; /* the length of the header 48 in 2.00.100 */ + __be32 revision; /**< revision of the header 1 in 2.00.100 */ + __be32 reserved01; + __be32 reserved02; + /* + * the remainder of the structure is vendor specific and will not be + * addressed now + */ + u8 ignored[32]; +}; + +/* + * TPer Feature Descriptor. Contains flags indicating support for the + * TPer features described in the OPAL specification. 
The names match the + * OPAL terminology + * + * code == 0x001 in 2.00.100 + */ +struct d0_tper_features { + /* + * supported_features bits: + * bit 7: reserved + * bit 6: com ID management + * bit 5: reserved + * bit 4: streaming support + * bit 3: buffer management + * bit 2: ACK/NACK + * bit 1: async + * bit 0: sync + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Locking Feature Descriptor. Contains flags indicating support for the + * locking features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x0002 in 2.00.100 + */ +struct d0_locking_features { + /* + * supported_features bits: + * bits 6-7: reserved + * bit 5: MBR done + * bit 4: MBR enabled + * bit 3: media encryption + * bit 2: locked + * bit 1: locking enabled + * bit 0: locking supported + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Geometry Feature Descriptor. Contains flags indicating support for the + * geometry features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x0003 in 2.00.100 + */ +struct d0_geometry_features { + /* + * skip 32 bits from header, needed to align the struct to 64 bits. 
+ */ + u8 header[4]; + /* + * reserved01: + * bits 1-6: reserved + * bit 0: align + */ + u8 reserved01; + u8 reserved02[7]; + __be32 logical_block_size; + __be64 alignment_granularity; + __be64 lowest_aligned_lba; +}; + +/* + * Enterprise SSC Feature + * + * code == 0x0100 + */ +struct d0_enterprise_ssc { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + u8 reserved01; + __be16 reserved02; + __be32 reserved03; + __be32 reserved04; +}; + +/* + * Opal V1 feature + * + * code == 0x0200 + */ +struct d0_opal_v100 { + __be16 baseComID; + __be16 numComIDs; +}; + +/* + * Single User Mode feature + * + * code == 0x0201 + */ +struct d0_single_user_mode { + __be32 num_locking_objects; + /* reserved01: + * bit 0: any + * bit 1: all + * bit 2: policy + * bits 3-7: reserved + */ + u8 reserved01; + u8 reserved02; + __be16 reserved03; + __be32 reserved04; +}; + +/* + * Additional Datastores feature + * + * code == 0x0202 + */ +struct d0_datastore_table { + __be16 reserved01; + __be16 max_tables; + __be32 max_size_tables; + __be32 table_size_alignment; +}; + +/* + * OPAL 2.0 feature + * + * code == 0x0203 + */ +struct d0_opal_v200 { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + /* num_locking_admin_auth: + * not aligned to 16 bits, so use two u8. + * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_admin_auth[2]; + /* num_locking_user_auth: + * not aligned to 16 bits, so use two u8. 
+ * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_user_auth[2]; + u8 initialPIN; + u8 revertedPIN; + u8 reserved01; + __be32 reserved02; +}; + +/* Union of features used to parse the discovery 0 response */ +struct d0_features { + __be16 code; + /* + * r_version bits: + * bits 4-7: version + * bits 0-3: reserved + */ + u8 r_version; + u8 length; + u8 features[]; +}; + +#endif /* _OPAL_PROTO_H */ diff --git a/block/partitions/efi.c b/block/partitions/efi.c index bcd86e5cd546..39f70d968754 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -293,7 +293,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, if (!gpt) return NULL; - count = le32_to_cpu(gpt->num_partition_entries) * + count = (size_t)le32_to_cpu(gpt->num_partition_entries) * le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; @@ -352,7 +352,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; - u64 lastlba; + u64 lastlba, pt_size; if (!ptes) return 0; @@ -434,13 +434,20 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, goto fail; } + /* Sanity check partition table size */ + pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry); + if (pt_size > KMALLOC_MAX_SIZE) { + pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", + (unsigned long long)pt_size, KMALLOC_MAX_SIZE); + goto fail; + } + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ - crc = efi_crc32((const unsigned char *) (*ptes), - le32_to_cpu((*gpt)->num_partition_entries) * - le32_to_cpu((*gpt)->sizeof_partition_entry)); + crc = efi_crc32((const unsigned char *) (*ptes), pt_size); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { pr_debug("GUID Partition Entry Array CRC check failed.\n"); diff --git a/block/sed-opal.c b/block/sed-opal.c new file 
mode 100644 index 000000000000..d1c52ba4d62d --- /dev/null +++ b/block/sed-opal.c @@ -0,0 +1,2488 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Scott Bauer <scott.bauer@intel.com> + * Rafael Antognolli <rafael.antognolli@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt + +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/genhd.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <uapi/linux/sed-opal.h> +#include <linux/sed-opal.h> +#include <linux/string.h> +#include <linux/kdev_t.h> + +#include "opal_proto.h" + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 + +typedef int (*opal_step)(struct opal_dev *dev); + +enum opal_atom_width { + OPAL_WIDTH_TINY, + OPAL_WIDTH_SHORT, + OPAL_WIDTH_MEDIUM, + OPAL_WIDTH_LONG, + OPAL_WIDTH_TOKEN +}; + +/* + * On the parsed response, we don't store again the toks that are already + * stored in the response buffer. Instead, for each token, we just store a + * pointer to the position in the buffer where the token starts, and the size + * of the token in bytes. + */ +struct opal_resp_tok { + const u8 *pos; + size_t len; + enum opal_response_token type; + enum opal_atom_width width; + union { + u64 u; + s64 s; + } stored; +}; + +/* + * From the response header it's not possible to know how many tokens there are + * on the payload. 
So we hardcode that the maximum will be MAX_TOKS, and later + * if we start dealing with messages that have more than that, we can increase + * this number. This is done to avoid having to make two passes through the + * response, the first one counting how many tokens we have and the second one + * actually storing the positions. + */ +struct parsed_resp { + int num; + struct opal_resp_tok toks[MAX_TOKS]; +}; + +struct opal_dev { + bool supported; + + void *data; + sec_send_recv *send_recv; + + const opal_step *funcs; + void **func_data; + int state; + struct mutex dev_lock; + u16 comid; + u32 hsn; + u32 tsn; + u64 align; + u64 lowest_lba; + + size_t pos; + u8 cmd[IO_BUFFER_LENGTH]; + u8 resp[IO_BUFFER_LENGTH]; + + struct parsed_resp parsed; + size_t prev_d_len; + void *prev_data; + + struct list_head unlk_lst; +}; + + +static const u8 opaluid[][OPAL_UID_LENGTH] = { + /* users */ + [OPAL_SMUID_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff }, + [OPAL_THISSP_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ADMINSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 }, + [OPAL_ENTERPRISE_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_ANYBODY_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_SID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ADMIN1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_USER1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 }, + [OPAL_USER2_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 }, + [OPAL_PSID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 }, + [OPAL_ENTERPRISE_BANDMASTER0_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x80, 0x01 }, + [OPAL_ENTERPRISE_ERASEMASTER_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 }, + + /* tables */ + + [OPAL_LOCKINGRANGE_GLOBAL] = + { 0x00, 0x00, 0x08, 
0x02, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_WRLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 }, + [OPAL_MBRCONTROL] = + { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_MBR] = + { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_AUTHORITY_TABLE] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00}, + [OPAL_C_PIN_TABLE] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00}, + [OPAL_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, + + /* C_PIN_TABLE object ID's */ + + [OPAL_C_PIN_MSID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, + [OPAL_C_PIN_SID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01}, + [OPAL_C_PIN_ADMIN1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, + + /* half UID's (only first 4 bytes used) */ + + [OPAL_HALF_UID_AUTHORITY_OBJ_REF] = + { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, + [OPAL_HALF_UID_BOOLEAN_ACE] = + { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff }, + + /* special value for omitted optional parameter */ + [OPAL_UID_HEXFF] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, +}; + +/* + * TCG Storage SSC Methods. 
+ * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +static const u8 opalmethod[][OPAL_UID_LENGTH] = { + [OPAL_PROPERTIES] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 }, + [OPAL_STARTSESSION] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 }, + [OPAL_REVERT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 }, + [OPAL_ACTIVATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 }, + [OPAL_EGET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ESET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07 }, + [OPAL_NEXT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 }, + [OPAL_EAUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0c }, + [OPAL_GETACL] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d }, + [OPAL_GENKEY] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 }, + [OPAL_REVERTSP] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 }, + [OPAL_GET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 }, + [OPAL_SET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 }, + [OPAL_AUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c }, + [OPAL_RANDOM] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, + [OPAL_ERASE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, +}; + +typedef int (cont_fn)(struct opal_dev *dev); + +static int end_opal_session_error(struct opal_dev *dev); + +struct opal_suspend_data { + struct opal_lock_unlock unlk; + u8 lr; + struct list_head node; +}; + +/* + * Derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 5.1.5 Method Status Codes + */ +static const char * const opal_errors[] = { + "Success", + "Not Authorized", + "Unknown Error", + "SP Busy", + "SP Failed", + "SP Disabled", + "SP Frozen", + "No Sessions Available", + "Uniqueness Conflict", + "Insufficient Space", + "Insufficient Rows", + "Invalid Function", + "Invalid Parameter", + "Invalid Reference", + "Unknown Error", + "TPER 
Malfunction", + "Transaction Failure", + "Response Overflow", + "Authority Locked Out", +}; + +static const char *opal_error_to_human(int error) +{ + if (error == 0x3f) + return "Failed"; + + if (error >= ARRAY_SIZE(opal_errors) || error < 0) + return "Unknown Error"; + + return opal_errors[error]; +} + +static void print_buffer(const u8 *ptr, u32 length) +{ +#ifdef DEBUG + print_hex_dump_bytes("OPAL: ", DUMP_PREFIX_OFFSET, ptr, length); + pr_debug("\n"); +#endif +} + +static bool check_tper(const void *data) +{ + const struct d0_tper_features *tper = data; + u8 flags = tper->supported_features; + + if (!(flags & TPER_SYNC_SUPPORTED)) { + pr_err("TPer sync not supported. flags = %d\n", + tper->supported_features); + return false; + } + + return true; +} + +static bool check_sum(const void *data) +{ + const struct d0_single_user_mode *sum = data; + u32 nlo = be32_to_cpu(sum->num_locking_objects); + + if (nlo == 0) { + pr_err("Need at least one locking object.\n"); + return false; + } + + pr_debug("Number of locking objects: %d\n", nlo); + + return true; +} + +static u16 get_comid_v100(const void *data) +{ + const struct d0_opal_v100 *v100 = data; + + return be16_to_cpu(v100->baseComID); +} + +static u16 get_comid_v200(const void *data) +{ + const struct d0_opal_v200 *v200 = data; + + return be16_to_cpu(v200->baseComID); +} + +static int opal_send_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, + dev->cmd, IO_BUFFER_LENGTH, + true); +} + +static int opal_recv_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, + dev->resp, IO_BUFFER_LENGTH, + false); +} + +static int opal_recv_check(struct opal_dev *dev) +{ + size_t buflen = IO_BUFFER_LENGTH; + void *buffer = dev->resp; + struct opal_header *hdr = buffer; + int ret; + + do { + pr_debug("Sent OPAL command: outstanding=%d, minTransfer=%d\n", + hdr->cp.outstandingData, + hdr->cp.minTransfer); + + if (hdr->cp.outstandingData == 0 || + 
hdr->cp.minTransfer != 0) + return 0; + + memset(buffer, 0, buflen); + ret = opal_recv_cmd(dev); + } while (!ret); + + return ret; +} + +static int opal_send_recv(struct opal_dev *dev, cont_fn *cont) +{ + int ret; + + ret = opal_send_cmd(dev); + if (ret) + return ret; + ret = opal_recv_cmd(dev); + if (ret) + return ret; + ret = opal_recv_check(dev); + if (ret) + return ret; + return cont(dev); +} + +static void check_geometry(struct opal_dev *dev, const void *data) +{ + const struct d0_geometry_features *geo = data; + + dev->align = geo->alignment_granularity; + dev->lowest_lba = geo->lowest_aligned_lba; +} + +static int next(struct opal_dev *dev) +{ + opal_step func; + int error = 0; + + do { + func = dev->funcs[dev->state]; + if (!func) + break; + + error = func(dev); + if (error) { + pr_err("Error on step function: %d with error %d: %s\n", + dev->state, error, + opal_error_to_human(error)); + + /* For each OPAL command we do a discovery0 then we + * start some sort of session. + * If we haven't passed state 1 then there was an error + * on discovery0 or during the attempt to start a + * session. Therefore we shouldn't attempt to terminate + * a session, as one has not yet been created. 
+ */ + if (dev->state > 1) + return end_opal_session_error(dev); + } + dev->state++; + } while (!error); + + return error; +} + +static int opal_discovery0_end(struct opal_dev *dev) +{ + bool found_com_id = false, supported = true, single_user = false; + const struct d0_header *hdr = (struct d0_header *)dev->resp; + const u8 *epos = dev->resp, *cpos = dev->resp; + u16 comid = 0; + + print_buffer(dev->resp, be32_to_cpu(hdr->length)); + + epos += be32_to_cpu(hdr->length); /* end of buffer */ + cpos += sizeof(*hdr); /* current position on buffer */ + + while (cpos < epos && supported) { + const struct d0_features *body = + (const struct d0_features *)cpos; + + switch (be16_to_cpu(body->code)) { + case FC_TPER: + supported = check_tper(body->features); + break; + case FC_SINGLEUSER: + single_user = check_sum(body->features); + break; + case FC_GEOMETRY: + check_geometry(dev, body); + break; + case FC_LOCKING: + case FC_ENTERPRISE: + case FC_DATASTORE: + /* some ignored properties */ + pr_debug("Found OPAL feature description: %d\n", + be16_to_cpu(body->code)); + break; + case FC_OPALV100: + comid = get_comid_v100(body->features); + found_com_id = true; + break; + case FC_OPALV200: + comid = get_comid_v200(body->features); + found_com_id = true; + break; + case 0xbfff ... 0xffff: + /* vendor specific, just ignore */ + break; + default: + pr_debug("OPAL Unknown feature: %d\n", + be16_to_cpu(body->code)); + + } + cpos += body->length + 4; + } + + if (!supported) { + pr_debug("This device is not Opal enabled. Not Supported!\n"); + return -EOPNOTSUPP; + } + + if (!single_user) + pr_debug("Device doesn't support single user mode\n"); + + + if (!found_com_id) { + pr_debug("Could not find OPAL comid for device. 
Returning early\n"); + return -EOPNOTSUPP;; + } + + dev->comid = comid; + + return 0; +} + +static int opal_discovery0(struct opal_dev *dev) +{ + int ret; + + memset(dev->resp, 0, IO_BUFFER_LENGTH); + dev->comid = OPAL_DISCOVERY_COMID; + ret = opal_recv_cmd(dev); + if (ret) + return ret; + return opal_discovery0_end(dev); +} + +static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok) +{ + if (*err) + return; + if (cmd->pos >= IO_BUFFER_LENGTH - 1) { + pr_err("Error adding u8: end of buffer.\n"); + *err = -ERANGE; + return; + } + cmd->cmd[cmd->pos++] = tok; +} + +static void add_short_atom_header(struct opal_dev *cmd, bool bytestring, + bool has_sign, int len) +{ + u8 atom; + int err = 0; + + atom = SHORT_ATOM_ID; + atom |= bytestring ? SHORT_ATOM_BYTESTRING : 0; + atom |= has_sign ? SHORT_ATOM_SIGNED : 0; + atom |= len & SHORT_ATOM_LEN_MASK; + + add_token_u8(&err, cmd, atom); +} + +static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring, + bool has_sign, int len) +{ + u8 header0; + + header0 = MEDIUM_ATOM_ID; + header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0; + header0 |= has_sign ? 
MEDIUM_ATOM_SIGNED : 0; + header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK; + cmd->cmd[cmd->pos++] = header0; + cmd->cmd[cmd->pos++] = len; +} + +static void add_token_u64(int *err, struct opal_dev *cmd, u64 number) +{ + + size_t len; + int msb; + u8 n; + + if (!(number & ~TINY_ATOM_DATA_MASK)) { + add_token_u8(err, cmd, number); + return; + } + + msb = fls(number); + len = DIV_ROUND_UP(msb, 4); + + if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) { + pr_err("Error adding u64: end of buffer.\n"); + *err = -ERANGE; + return; + } + add_short_atom_header(cmd, false, false, len); + while (len--) { + n = number >> (len * 8); + add_token_u8(err, cmd, n); + } +} + +static void add_token_bytestring(int *err, struct opal_dev *cmd, + const u8 *bytestring, size_t len) +{ + size_t header_len = 1; + bool is_short_atom = true; + + if (*err) + return; + + if (len & ~SHORT_ATOM_LEN_MASK) { + header_len = 2; + is_short_atom = false; + } + + if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) { + pr_err("Error adding bytestring: end of buffer.\n"); + *err = -ERANGE; + return; + } + + if (is_short_atom) + add_short_atom_header(cmd, true, false, len); + else + add_medium_atom_header(cmd, true, false, len); + + memcpy(&cmd->cmd[cmd->pos], bytestring, len); + cmd->pos += len; + +} + +static int build_locking_range(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_err("Can't build locking range. 
Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH); + + if (lr == 0) + return 0; + buffer[5] = LOCKING_RANGE_NON_GLOBAL; + buffer[7] = lr; + + return 0; +} + +static int build_locking_user(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_err("Can't build locking range user, Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + buffer[7] = lr + 1; + + return 0; +} + +static void set_comid(struct opal_dev *cmd, u16 comid) +{ + struct opal_header *hdr = (struct opal_header *)cmd->cmd; + + hdr->cp.extendedComID[0] = comid >> 8; + hdr->cp.extendedComID[1] = comid; + hdr->cp.extendedComID[2] = 0; + hdr->cp.extendedComID[3] = 0; +} + +static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) +{ + struct opal_header *hdr; + int err = 0; + + add_token_u8(&err, cmd, OPAL_ENDOFDATA); + add_token_u8(&err, cmd, OPAL_STARTLIST); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, OPAL_ENDLIST); + + if (err) { + pr_err("Error finalizing command.\n"); + return -EFAULT; + } + + hdr = (struct opal_header *) cmd->cmd; + + hdr->pkt.tsn = cpu_to_be32(tsn); + hdr->pkt.hsn = cpu_to_be32(hsn); + + hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr)); + while (cmd->pos % 4) { + if (cmd->pos >= IO_BUFFER_LENGTH) { + pr_err("Error: Buffer overrun\n"); + return -ERANGE; + } + cmd->cmd[cmd->pos++] = 0; + } + hdr->pkt.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp) - + sizeof(hdr->pkt)); + hdr->cp.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp)); + + return 0; +} + +static enum opal_response_token token_type(const struct parsed_resp *resp, + int n) +{ + const struct opal_resp_tok *tok; + + if (n >= resp->num) { + pr_err("Token number doesn't exist: %d, resp: %d\n", + n, resp->num); + return OPAL_DTA_TOKENID_INVALID; + } + + tok = &resp->toks[n]; + if (tok->len == 0) { + pr_err("Token 
length must be non-zero\n"); + return OPAL_DTA_TOKENID_INVALID; + } + + return tok->type; +} + +/* + * This function returns 0 in case of invalid token. One should call + * token_type() first to find out if the token is valid or not. + */ +static enum opal_token response_get_token(const struct parsed_resp *resp, + int n) +{ + const struct opal_resp_tok *tok; + + if (n >= resp->num) { + pr_err("Token number doesn't exist: %d, resp: %d\n", + n, resp->num); + return 0; + } + + tok = &resp->toks[n]; + if (tok->len == 0) { + pr_err("Token length must be non-zero\n"); + return 0; + } + + return tok->pos[0]; +} + +static size_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->width = OPAL_WIDTH_TINY; + + if (pos[0] & TINY_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + tok->type = OPAL_DTA_TOKENID_UINT; + tok->stored.u = pos[0] & 0x3f; + } + + return tok->len; +} + +static size_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; + tok->width = OPAL_WIDTH_SHORT; + + if (pos[0] & SHORT_ATOM_BYTESTRING) { + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SHORT_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + u64 u_integer = 0; + int i, b = 0; + + tok->type = OPAL_DTA_TOKENID_UINT; + if (tok->len > 9) { + pr_warn("uint64 with more than 8 bytes\n"); + return -EINVAL; + } + for (i = tok->len - 1; i > 0; i--) { + u_integer |= ((u64)pos[i] << (8 * b)); + b++; + } + tok->stored.u = u_integer; + } + + return tok->len; +} + +static size_t response_parse_medium(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; + tok->width = OPAL_WIDTH_MEDIUM; + + if (pos[0] & MEDIUM_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & MEDIUM_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + 
tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static size_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; + tok->width = OPAL_WIDTH_LONG; + + if (pos[0] & LONG_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & LONG_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static size_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->type = OPAL_DTA_TOKENID_TOKEN; + tok->width = OPAL_WIDTH_TOKEN; + + return tok->len; +} + +static int response_parse(const u8 *buf, size_t length, + struct parsed_resp *resp) +{ + const struct opal_header *hdr; + struct opal_resp_tok *iter; + int num_entries = 0; + int total; + size_t token_length; + const u8 *pos; + + if (!buf) + return -EFAULT; + + if (!resp) + return -EFAULT; + + hdr = (struct opal_header *)buf; + pos = buf; + pos += sizeof(*hdr); + + pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", + be32_to_cpu(hdr->cp.length), + be32_to_cpu(hdr->pkt.length), + be32_to_cpu(hdr->subpkt.length)); + + if (hdr->cp.length == 0 || hdr->pkt.length == 0 || + hdr->subpkt.length == 0) { + pr_err("Bad header length. 
cp: %d, pkt: %d, subpkt: %d\n", + be32_to_cpu(hdr->cp.length), + be32_to_cpu(hdr->pkt.length), + be32_to_cpu(hdr->subpkt.length)); + print_buffer(pos, sizeof(*hdr)); + return -EINVAL; + } + + if (pos > buf + length) + return -EFAULT; + + iter = resp->toks; + total = be32_to_cpu(hdr->subpkt.length); + print_buffer(pos, total); + while (total > 0) { + if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ + token_length = response_parse_tiny(iter, pos); + else if (pos[0] <= SHORT_ATOM_BYTE) /* short atom */ + token_length = response_parse_short(iter, pos); + else if (pos[0] <= MEDIUM_ATOM_BYTE) /* medium atom */ + token_length = response_parse_medium(iter, pos); + else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */ + token_length = response_parse_long(iter, pos); + else /* TOKEN */ + token_length = response_parse_token(iter, pos); + + if (token_length == -EINVAL) + return -EINVAL; + + pos += token_length; + total -= token_length; + iter++; + num_entries++; + } + + if (num_entries == 0) { + pr_err("Couldn't parse response.\n"); + return -EINVAL; + } + resp->num = num_entries; + + return 0; +} + +static size_t response_get_string(const struct parsed_resp *resp, int n, + const char **store) +{ + *store = NULL; + if (!resp) { + pr_err("Response is NULL\n"); + return 0; + } + + if (n > resp->num) { + pr_err("Response has %d tokens. Can't access %d\n", + resp->num, n); + return 0; + } + + if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) { + pr_err("Token is not a byte string!\n"); + return 0; + } + + *store = resp->toks[n].pos + 1; + return resp->toks[n].len - 1; +} + +static u64 response_get_u64(const struct parsed_resp *resp, int n) +{ + if (!resp) { + pr_err("Response is NULL\n"); + return 0; + } + + if (n > resp->num) { + pr_err("Response has %d tokens. 
Can't access %d\n", + resp->num, n); + return 0; + } + + if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) { + pr_err("Token is not unsigned it: %d\n", + resp->toks[n].type); + return 0; + } + + if (!(resp->toks[n].width == OPAL_WIDTH_TINY || + resp->toks[n].width == OPAL_WIDTH_SHORT)) { + pr_err("Atom is not short or tiny: %d\n", + resp->toks[n].width); + return 0; + } + + return resp->toks[n].stored.u; +} + +static u8 response_status(const struct parsed_resp *resp) +{ + if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && + response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + return 0; + } + + if (resp->num < 5) + return DTAERROR_NO_METHOD_STATUS; + + if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || + token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || + response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || + response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + return DTAERROR_NO_METHOD_STATUS; + + return response_get_u64(resp, resp->num - 4); +} + +/* Parses and checks for errors */ +static int parse_and_check_status(struct opal_dev *dev) +{ + int error; + + print_buffer(dev->cmd, dev->pos); + + error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed); + if (error) { + pr_err("Couldn't parse response.\n"); + return error; + } + + return response_status(&dev->parsed); +} + +static void clear_opal_cmd(struct opal_dev *dev) +{ + dev->pos = sizeof(struct opal_header); + memset(dev->cmd, 0, IO_BUFFER_LENGTH); +} + +static int start_opal_session_cont(struct opal_dev *dev) +{ + u32 hsn, tsn; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + hsn = response_get_u64(&dev->parsed, 4); + tsn = response_get_u64(&dev->parsed, 5); + + if (hsn == 0 && tsn == 0) { + pr_err("Couldn't authenticate session\n"); + return -EPERM; + } + + dev->hsn = hsn; + dev->tsn = tsn; + return 0; +} + +static void add_suspend_info(struct opal_dev *dev, + struct opal_suspend_data *sus) +{ + struct 
opal_suspend_data *iter; + + list_for_each_entry(iter, &dev->unlk_lst, node) { + if (iter->lr == sus->lr) { + list_del(&iter->node); + kfree(iter); + break; + } + } + list_add_tail(&sus->node, &dev->unlk_lst); +} + +static int end_session_cont(struct opal_dev *dev) +{ + dev->hsn = 0; + dev->tsn = 0; + return parse_and_check_status(dev); +} + +static int finalize_and_send(struct opal_dev *dev, cont_fn cont) +{ + int ret; + + ret = cmd_finalize(dev, dev->hsn, dev->tsn); + if (ret) { + pr_err("Error finalizing command buffer: %d\n", ret); + return ret; + } + + print_buffer(dev->cmd, dev->pos); + + return opal_send_recv(dev, cont); +} + +static int gen_key(struct opal_dev *dev) +{ + const u8 *method; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len)); + method = opalmethod[OPAL_GENKEY]; + kfree(dev->prev_data); + dev->prev_data = NULL; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building gen key command\n"); + return err; + + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int get_active_key_cont(struct opal_dev *dev) +{ + const char *activekey; + size_t keylen; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + keylen = response_get_string(&dev->parsed, 4, &activekey); + if (!activekey) { + pr_err("%s: Couldn't extract the Activekey from the response\n", + __func__); + return OPAL_INVAL_PARAM; + } + dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); + + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = keylen; + + return 0; +} + +static int get_active_key(struct opal_dev *dev) +{ + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + u8 *lr; + + 
clear_opal_cmd(dev); + set_comid(dev, dev->comid); + lr = dev->func_data[dev->state]; + + err = build_locking_range(uid, sizeof(uid), *lr); + if (err) + return err; + + err = 0; + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* startCloumn */ + add_token_u8(&err, dev, 10); /* ActiveKey */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* endColumn */ + add_token_u8(&err, dev, 10); /* ActiveKey */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + if (err) { + pr_err("Error building get active key command\n"); + return err; + } + + return finalize_and_send(dev, get_active_key_cont); +} + +static int generic_lr_enable_disable(struct opal_dev *dev, + u8 *uid, bool rle, bool wle, + bool rl, bool wl) +{ + int err = 0; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /* ReadLockEnabled */ + add_token_u8(&err, dev, rle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 6); /* WriteLockEnabled */ + add_token_u8(&err, dev, wle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, rl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + 
add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, wl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + return err; +} + +static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, + struct opal_user_lr_setup *setup) +{ + int err; + + err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, + 0, 0); + if (err) + pr_err("Failed to create enable global lr command\n"); + return err; +} + +static int setup_locking_range(struct opal_dev *dev) +{ + u8 uid[OPAL_UID_LENGTH]; + struct opal_user_lr_setup *setup; + u8 lr; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + setup = dev->func_data[dev->state]; + lr = setup->session.opal_key.lr; + err = build_locking_range(uid, sizeof(uid), lr); + if (err) + return err; + + if (lr == 0) + err = enable_global_lr(dev, uid, setup); + else { + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Ranges Start */ + add_token_u64(&err, dev, setup->range_start); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* Ranges length */ + add_token_u64(&err, dev, setup->range_length); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /*ReadLockEnabled */ + add_token_u64(&err, dev, !!setup->RLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 6); /*WriteLockEnabled*/ + 
add_token_u64(&err, dev, !!setup->WLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + } + if (err) { + pr_err("Error building Setup Locking range command.\n"); + return err; + + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int start_generic_opal_session(struct opal_dev *dev, + enum opal_uid auth, + enum opal_uid sp_type, + const char *key, + u8 key_len) +{ + u32 hsn; + int err = 0; + + if (key == NULL && auth != OPAL_ANYBODY_UID) { + pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \ + "Challenge, and not as the Anybody UID\n", __func__); + return OPAL_INVAL_PARAM; + } + + clear_opal_cmd(dev); + + set_comid(dev, dev->comid); + hsn = GENERIC_HOST_SESSION_NUM; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + + switch (auth) { + case OPAL_ANYBODY_UID: + add_token_u8(&err, dev, OPAL_ENDLIST); + break; + case OPAL_ADMIN1_UID: + case OPAL_SID_UID: + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); /* HostChallenge */ + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* HostSignAuth */ + add_token_bytestring(&err, dev, opaluid[auth], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + break; + default: + pr_err("Cannot start Admin SP session with auth %d\n", auth); + return OPAL_INVAL_PARAM; + } + + if (err) { + pr_err("Error building start adminsp session command.\n"); + return err; + } + + return 
finalize_and_send(dev, start_opal_session_cont); +} + +static int start_anybodyASP_opal_session(struct opal_dev *dev) +{ + return start_generic_opal_session(dev, OPAL_ANYBODY_UID, + OPAL_ADMINSP_UID, NULL, 0); +} + +static int start_SIDASP_opal_session(struct opal_dev *dev) +{ + int ret; + const u8 *key = dev->prev_data; + struct opal_key *okey; + + if (!key) { + okey = dev->func_data[dev->state]; + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + okey->key, + okey->key_len); + } else { + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + key, dev->prev_d_len); + kfree(key); + dev->prev_data = NULL; + } + return ret; +} + +static inline int start_admin1LSP_opal_session(struct opal_dev *dev) +{ + struct opal_key *key = dev->func_data[dev->state]; + + return start_generic_opal_session(dev, OPAL_ADMIN1_UID, + OPAL_LOCKINGSP_UID, + key->key, key->key_len); +} + +static int start_auth_opal_session(struct opal_dev *dev) +{ + u8 lk_ul_user[OPAL_UID_LENGTH]; + int err = 0; + + struct opal_session_info *session = dev->func_data[dev->state]; + size_t keylen = session->opal_key.key_len; + u8 *key = session->opal_key.key; + u32 hsn = GENERIC_HOST_SESSION_NUM; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + if (session->sum) { + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->opal_key.lr); + if (err) + return err; + + } else if (session->who != OPAL_ADMIN1 && !session->sum) { + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->who - 1); + if (err) + return err; + } else + memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + 
OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); + add_token_bytestring(&err, dev, key, keylen); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building STARTSESSION command.\n"); + return err; + } + + return finalize_and_send(dev, start_opal_session_cont); +} + +static int revert_tper(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_ADMINSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_REVERT], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + if (err) { + pr_err("Error building REVERT TPER command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int internal_activate_user(struct opal_dev *dev) +{ + struct opal_session_info *session = dev->func_data[dev->state]; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + uid[7] = session->who; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /* Enabled */ + add_token_u8(&err, dev, OPAL_TRUE); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + 
add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Activate UserN command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int erase_locking_range(struct opal_dev *dev) +{ + struct opal_session_info *session; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + session = dev->func_data[dev->state]; + + if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0) + return -ERANGE; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_ERASE], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Erase Locking Range Command.\n"); + return err; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_done(struct opal_dev *dev) +{ + u8 mbr_done_tf = *(u8 *)dev->func_data[dev->state]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 2); /* Done */ + add_token_u8(&err, dev, mbr_done_tf); /* Done T or F */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building set MBR Done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_enable_disable(struct 
opal_dev *dev) +{ + u8 mbr_en_dis = *(u8 *)dev->func_data[dev->state]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, mbr_en_dis); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building set MBR done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, + struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, cpin_uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* PIN */ + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + return err; +} + +static int set_new_pw(struct opal_dev *dev) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_session_info *usr = dev->func_data[dev->state]; + + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH); + + if (usr->who != OPAL_ADMIN1) 
{ + cpin_uid[5] = 0x03; + if (usr->sum) + cpin_uid[7] = usr->opal_key.lr + 1; + else + cpin_uid[7] = usr->who; + } + + if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len, + cpin_uid, dev)) { + pr_err("Error building set password command.\n"); + return -ERANGE; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_sid_cpin_pin(struct opal_dev *dev) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_key *key = dev->func_data[dev->state]; + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH); + + if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) { + pr_err("Error building Set SID cpin\n"); + return -ERANGE; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int add_user_to_lr(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 user_uid[OPAL_UID_LENGTH]; + struct opal_lock_unlock *lkul; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + lkul = dev->func_data[dev->state]; + + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED], + OPAL_UID_LENGTH); + + if (lkul->l_state == OPAL_RW) + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED], + OPAL_UID_LENGTH); + + lr_buffer[7] = lkul->session.opal_key.lr; + + memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + user_uid[7] = lkul->session.who; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + + add_token_u8(&err, dev, OPAL_STARTLIST); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, 
OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], + OPAL_UID_LENGTH/2); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building add user to locking range command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int lock_unlock_locking_range(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + const u8 *method; + struct opal_lock_unlock *lkul; + u8 read_locked = 1, write_locked = 1; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + method = opalmethod[OPAL_SET]; + lkul = dev->func_data[dev->state]; + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initalized to locked */ + break; + default: + pr_err("Tried to set an invalid locking state... 
returning to uland\n"); + return OPAL_INVAL_PARAM; + } + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, read_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, write_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building SET command.\n"); + return err; + } + return finalize_and_send(dev, parse_and_check_status); +} + + +static int lock_unlock_locking_range_sum(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 read_locked = 1, write_locked = 1; + const u8 *method; + struct opal_lock_unlock *lkul; + int ret; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + method = opalmethod[OPAL_SET]; + lkul = dev->func_data[dev->state]; + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initalized to locked */ + break; + default: + pr_err("Tried to set an invalid locking state.\n"); + return OPAL_INVAL_PARAM; + } + ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1, + read_locked, write_locked); + + if (ret < 0) { + pr_err("Error building SET command.\n"); + return ret; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static 
int activate_lsp(struct opal_dev *dev) +{ + struct opal_lr_act *opal_act; + u8 user_lr[OPAL_UID_LENGTH]; + u8 uint_3 = 0x83; + int err = 0, i; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + opal_act = dev->func_data[dev->state]; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_ACTIVATE], + OPAL_UID_LENGTH); + + + if (opal_act->sum) { + err = build_locking_range(user_lr, sizeof(user_lr), + opal_act->lr[0]); + if (err) + return err; + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, uint_3); + add_token_u8(&err, dev, 6); + add_token_u8(&err, dev, 0); + add_token_u8(&err, dev, 0); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + for (i = 1; i < opal_act->num_lrs; i++) { + user_lr[7] = opal_act->lr[i]; + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + } + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + } else { + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + } + + if (err) { + pr_err("Error building Activate LockingSP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int get_lsp_lifecycle_cont(struct opal_dev *dev) +{ + u8 lc_status; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + lc_status = response_get_u64(&dev->parsed, 4); + /* 0x08 is Manufacured Inactive */ + /* 0x09 is Manufactured */ + if (lc_status != OPAL_MANUFACTURED_INACTIVE) { + pr_err("Couldn't determine the status of the Lifcycle state\n"); + return -ENODEV; + } + + return 0; +} + +/* Determine if we're in the Manufactured Inactive or Active state */ +static int get_lsp_lifecycle(struct opal_dev *dev) +{ + int err = 0; + + 
clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Start Column */ + add_token_u8(&err, dev, 6); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* End Column */ + add_token_u8(&err, dev, 6); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building GET Lifecycle Status command\n"); + return err; + } + + return finalize_and_send(dev, get_lsp_lifecycle_cont); +} + +static int get_msid_cpin_pin_cont(struct opal_dev *dev) +{ + const char *msid_pin; + size_t strlen; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + strlen = response_get_string(&dev->parsed, 4, &msid_pin); + if (!msid_pin) { + pr_err("%s: Couldn't extract PIN from response\n", __func__); + return OPAL_INVAL_PARAM; + } + + dev->prev_data = kmemdup(msid_pin, strlen, GFP_KERNEL); + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = strlen; + + return 0; +} + +static int get_msid_cpin_pin(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Start Column */ + add_token_u8(&err, dev, 3); /* 
PIN */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* End Column */ + add_token_u8(&err, dev, 3); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Get MSID CPIN PIN command.\n"); + return err; + } + + return finalize_and_send(dev, get_msid_cpin_pin_cont); +} + +static int build_end_opal_session(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + + set_comid(dev, dev->comid); + add_token_u8(&err, dev, OPAL_ENDOFSESSION); + return err; +} + +static int end_opal_session(struct opal_dev *dev) +{ + int ret = build_end_opal_session(dev); + + if (ret < 0) + return ret; + return finalize_and_send(dev, end_session_cont); +} + +static int end_opal_session_error(struct opal_dev *dev) +{ + const opal_step error_end_session[] = { + end_opal_session, + NULL, + }; + dev->funcs = error_end_session; + dev->state = 0; + return next(dev); +} + +static inline void setup_opal_dev(struct opal_dev *dev, + const opal_step *funcs) +{ + dev->state = 0; + dev->funcs = funcs; + dev->tsn = 0; + dev->hsn = 0; + dev->func_data = NULL; + dev->prev_data = NULL; +} + +static int check_opal_support(struct opal_dev *dev) +{ + static const opal_step funcs[] = { + opal_discovery0, + NULL + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, funcs); + ret = next(dev); + dev->supported = !ret; + mutex_unlock(&dev->dev_lock); + return ret; +} + +struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) +{ + struct opal_dev *dev; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + + INIT_LIST_HEAD(&dev->unlk_lst); + mutex_init(&dev->dev_lock); + dev->data = data; + dev->send_recv = send_recv; + if (check_opal_support(dev) != 0) { + pr_debug("Opal is not supported on this device\n"); + kfree(dev); + return NULL; + } + return dev; +} 
+EXPORT_SYMBOL(init_opal_dev); + +static int opal_secure_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + void *data[3] = { NULL }; + static const opal_step erase_funcs[] = { + opal_discovery0, + start_auth_opal_session, + get_active_key, + gen_key, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, erase_funcs); + + dev->func_data = data; + dev->func_data[1] = opal_session; + dev->func_data[2] = &opal_session->opal_key.lr; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + void *data[3] = { NULL }; + static const opal_step erase_funcs[] = { + opal_discovery0, + start_auth_opal_session, + erase_locking_range, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, erase_funcs); + + dev->func_data = data; + dev->func_data[1] = opal_session; + dev->func_data[2] = opal_session; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, + struct opal_mbr_data *opal_mbr) +{ + void *func_data[6] = { NULL }; + static const opal_step mbr_funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + set_mbr_done, + end_opal_session, + start_admin1LSP_opal_session, + set_mbr_enable_disable, + end_opal_session, + NULL, + }; + int ret; + + if (opal_mbr->enable_disable != OPAL_MBR_ENABLE && + opal_mbr->enable_disable != OPAL_MBR_DISABLE) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, mbr_funcs); + dev->func_data = func_data; + dev->func_data[1] = &opal_mbr->key; + dev->func_data[2] = &opal_mbr->enable_disable; + dev->func_data[4] = &opal_mbr->key; + dev->func_data[5] = &opal_mbr->enable_disable; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_save(struct opal_dev *dev, struct 
opal_lock_unlock *lk_unlk) +{ + struct opal_suspend_data *suspend; + + suspend = kzalloc(sizeof(*suspend), GFP_KERNEL); + if (!suspend) + return -ENOMEM; + + suspend->unlk = *lk_unlk; + suspend->lr = lk_unlk->session.opal_key.lr; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + add_suspend_info(dev, suspend); + mutex_unlock(&dev->dev_lock); + return 0; +} + +static int opal_add_user_to_lr(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + void *func_data[3] = { NULL }; + static const opal_step funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + add_user_to_lr, + end_opal_session, + NULL + }; + int ret; + + if (lk_unlk->l_state != OPAL_RO && + lk_unlk->l_state != OPAL_RW) { + pr_err("Locking state was not RO or RW\n"); + return -EINVAL; + } + if (lk_unlk->session.who < OPAL_USER1 && + lk_unlk->session.who > OPAL_USER9) { + pr_err("Authority was not within the range of users: %d\n", + lk_unlk->session.who); + return -EINVAL; + } + if (lk_unlk->session.sum) { + pr_err("%s not supported in sum. 
Use setup locking range\n", + __func__); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, funcs); + dev->func_data = func_data; + dev->func_data[1] = &lk_unlk->session.opal_key; + dev->func_data[2] = lk_unlk; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal) +{ + void *data[2] = { NULL }; + static const opal_step revert_funcs[] = { + opal_discovery0, + start_SIDASP_opal_session, + revert_tper, /* controller will terminate session */ + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, revert_funcs); + dev->func_data = data; + dev->func_data[1] = opal; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int __opal_lock_unlock_sum(struct opal_dev *dev) +{ + static const opal_step ulk_funcs_sum[] = { + opal_discovery0, + start_auth_opal_session, + lock_unlock_locking_range_sum, + end_opal_session, + NULL + }; + + dev->funcs = ulk_funcs_sum; + return next(dev); +} + +static int __opal_lock_unlock(struct opal_dev *dev) +{ + static const opal_step _unlock_funcs[] = { + opal_discovery0, + start_auth_opal_session, + lock_unlock_locking_range, + end_opal_session, + NULL + }; + + dev->funcs = _unlock_funcs; + return next(dev); +} + +static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) +{ + void *func_data[3] = { NULL }; + int ret; + + if (lk_unlk->session.who < OPAL_ADMIN1 || + lk_unlk->session.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + dev->func_data = func_data; + dev->func_data[1] = &lk_unlk->session; + dev->func_data[2] = lk_unlk; + + if (lk_unlk->session.sum) + ret = __opal_lock_unlock_sum(dev); + else + ret = __opal_lock_unlock(dev); + + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) +{ + static const opal_step owner_funcs[] = { + 
opal_discovery0, + start_anybodyASP_opal_session, + get_msid_cpin_pin, + end_opal_session, + start_SIDASP_opal_session, + set_sid_cpin_pin, + end_opal_session, + NULL + }; + void *data[6] = { NULL }; + int ret; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, owner_funcs); + dev->func_data = data; + dev->func_data[4] = opal; + dev->func_data[5] = opal; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act) +{ + void *data[4] = { NULL }; + static const opal_step active_funcs[] = { + opal_discovery0, + start_SIDASP_opal_session, /* Open session as SID auth */ + get_lsp_lifecycle, + activate_lsp, + end_opal_session, + NULL + }; + int ret; + + if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, active_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_lr_act->key; + dev->func_data[3] = opal_lr_act; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_setup_locking_range(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + void *data[3] = { NULL }; + static const opal_step lr_funcs[] = { + opal_discovery0, + start_auth_opal_session, + setup_locking_range, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, lr_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_lrs->session; + dev->func_data[2] = opal_lrs; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) +{ + static const opal_step pw_funcs[] = { + opal_discovery0, + start_auth_opal_session, + set_new_pw, + end_opal_session, + NULL + }; + void *data[3] = { NULL }; + int ret; + + if (opal_pw->session.who < OPAL_ADMIN1 || + opal_pw->session.who > OPAL_USER9 || + opal_pw->new_user_pw.who < 
OPAL_ADMIN1 || + opal_pw->new_user_pw.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, pw_funcs); + dev->func_data = data; + dev->func_data[1] = (void *) &opal_pw->session; + dev->func_data[2] = (void *) &opal_pw->new_user_pw; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_activate_user(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + static const opal_step act_funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + internal_activate_user, + end_opal_session, + NULL + }; + void *data[3] = { NULL }; + int ret; + + /* We can't activate Admin1 it's active as manufactured */ + if (opal_session->who < OPAL_USER1 && + opal_session->who > OPAL_USER9) { + pr_err("Who was not a valid user: %d\n", opal_session->who); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, act_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_session->opal_key; + dev->func_data[2] = opal_session; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +bool opal_unlock_from_suspend(struct opal_dev *dev) +{ + struct opal_suspend_data *suspend; + void *func_data[3] = { NULL }; + bool was_failure = false; + int ret = 0; + + if (!dev) + return false; + if (!dev->supported) + return false; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + dev->func_data = func_data; + + list_for_each_entry(suspend, &dev->unlk_lst, node) { + dev->state = 0; + dev->func_data[1] = &suspend->unlk.session; + dev->func_data[2] = &suspend->unlk; + dev->tsn = 0; + dev->hsn = 0; + + if (suspend->unlk.session.sum) + ret = __opal_lock_unlock_sum(dev); + else + ret = __opal_lock_unlock(dev); + if (ret) { + pr_warn("Failed to unlock LR %hhu with sum %d\n", + suspend->unlk.session.opal_key.lr, + suspend->unlk.session.sum); + was_failure = true; + } + } + mutex_unlock(&dev->dev_lock); + return was_failure; +} +EXPORT_SYMBOL(opal_unlock_from_suspend); + +int 
sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) +{ + void *p; + int ret = -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!dev) + return -ENOTSUPP; + if (!dev->supported) { + pr_err("Not supported\n"); + return -ENOTSUPP; + } + + p = memdup_user(arg, _IOC_SIZE(cmd)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (cmd) { + case IOC_OPAL_SAVE: + ret = opal_save(dev, p); + break; + case IOC_OPAL_LOCK_UNLOCK: + ret = opal_lock_unlock(dev, p); + break; + case IOC_OPAL_TAKE_OWNERSHIP: + ret = opal_take_ownership(dev, p); + break; + case IOC_OPAL_ACTIVATE_LSP: + ret = opal_activate_lsp(dev, p); + break; + case IOC_OPAL_SET_PW: + ret = opal_set_new_pw(dev, p); + break; + case IOC_OPAL_ACTIVATE_USR: + ret = opal_activate_user(dev, p); + break; + case IOC_OPAL_REVERT_TPR: + ret = opal_reverttper(dev, p); + break; + case IOC_OPAL_LR_SETUP: + ret = opal_setup_locking_range(dev, p); + break; + case IOC_OPAL_ADD_USR_TO_LR: + ret = opal_add_user_to_lr(dev, p); + break; + case IOC_OPAL_ENABLE_DISABLE_MBR: + ret = opal_enable_disable_shadow_mbr(dev, p); + break; + case IOC_OPAL_ERASE_LR: + ret = opal_erase_locking_range(dev, p); + break; + case IOC_OPAL_SECURE_ERASE_LR: + ret = opal_secure_erase_locking_range(dev, p); + break; + default: + pr_warn("No such Opal Ioctl %u\n", cmd); + } + + kfree(p); + return ret; +} +EXPORT_SYMBOL_GPL(sed_ioctl); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index e5c5b8eb14a9..3a44438a1195 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4074,41 +4074,27 @@ clean_up: static void cciss_interrupt_mode(ctlr_info_t *h) { -#ifdef CONFIG_PCI_MSI - int err; - struct msix_entry cciss_msix_entries[4] = { {0, 0}, {0, 1}, - {0, 2}, {0, 3} - }; + int ret; /* Some boards advertise MSI but don't really support it */ if ((h->board_id == 0x40700E11) || (h->board_id == 0x40800E11) || (h->board_id == 0x40820E11) || (h->board_id == 0x40830E11)) goto default_int_mode; - if 
(pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) { - err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4); - if (!err) { - h->intr[0] = cciss_msix_entries[0].vector; - h->intr[1] = cciss_msix_entries[1].vector; - h->intr[2] = cciss_msix_entries[2].vector; - h->intr[3] = cciss_msix_entries[3].vector; - h->msix_vector = 1; - return; - } else { - dev_warn(&h->pdev->dev, - "MSI-X init failed %d\n", err); - } - } - if (pci_find_capability(h->pdev, PCI_CAP_ID_MSI)) { - if (!pci_enable_msi(h->pdev)) - h->msi_vector = 1; - else - dev_warn(&h->pdev->dev, "MSI init failed\n"); + ret = pci_alloc_irq_vectors(h->pdev, 4, 4, PCI_IRQ_MSIX); + if (ret >= 0) { + h->intr[0] = pci_irq_vector(h->pdev, 0); + h->intr[1] = pci_irq_vector(h->pdev, 1); + h->intr[2] = pci_irq_vector(h->pdev, 2); + h->intr[3] = pci_irq_vector(h->pdev, 3); + return; } + + ret = pci_alloc_irq_vectors(h->pdev, 1, 1, PCI_IRQ_MSI); + default_int_mode: -#endif /* CONFIG_PCI_MSI */ /* if we get here we're going to use the default interrupt mode */ - h->intr[h->intr_mode] = h->pdev->irq; + h->intr[h->intr_mode] = pci_irq_vector(h->pdev, 0); return; } @@ -4888,7 +4874,7 @@ static int cciss_request_irq(ctlr_info_t *h, irqreturn_t (*msixhandler)(int, void *), irqreturn_t (*intxhandler)(int, void *)) { - if (h->msix_vector || h->msi_vector) { + if (h->pdev->msi_enabled || h->pdev->msix_enabled) { if (!request_irq(h->intr[h->intr_mode], msixhandler, 0, h->devname, h)) return 0; @@ -4934,12 +4920,7 @@ static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h) int ctlr = h->ctlr; free_irq(h->intr[h->intr_mode], h); -#ifdef CONFIG_PCI_MSI - if (h->msix_vector) - pci_disable_msix(h->pdev); - else if (h->msi_vector) - pci_disable_msi(h->pdev); -#endif /* CONFIG_PCI_MSI */ + pci_free_irq_vectors(h->pdev); cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); cciss_free_scatterlists(h); cciss_free_cmd_pool(h); @@ -5295,12 +5276,7 @@ static void cciss_remove_one(struct pci_dev *pdev) cciss_shutdown(pdev); 
-#ifdef CONFIG_PCI_MSI - if (h->msix_vector) - pci_disable_msix(h->pdev); - else if (h->msi_vector) - pci_disable_msi(h->pdev); -#endif /* CONFIG_PCI_MSI */ + pci_free_irq_vectors(h->pdev); iounmap(h->transtable); iounmap(h->cfgtable); diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 7fda30e4a241..4affa94ca17b 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -90,8 +90,6 @@ struct ctlr_info # define SIMPLE_MODE_INT 2 # define MEMQ_MODE_INT 3 unsigned int intr[4]; - unsigned int msix_vector; - unsigned int msi_vector; int intr_mode; int cciss_max_sectors; BYTE cciss_read; @@ -333,7 +331,7 @@ static unsigned long SA5_performant_completed(ctlr_info_t *h) */ register_value = readl(h->vaddr + SA5_OUTDB_STATUS); /* msi auto clears the interrupt pending bit. */ - if (!(h->msi_vector || h->msix_vector)) { + if (!(h->pdev->msi_enabled || h->pdev->msix_enabled)) { writel(SA5_OUTDB_CLEAR_PERF_BIT, h->vaddr + SA5_OUTDB_CLEAR); /* Do a read in order to flush the write to the controller * (as per spec.) 
@@ -393,7 +391,7 @@ static bool SA5_performant_intr_pending(ctlr_info_t *h) if (!register_value) return false; - if (h->msi_vector || h->msix_vector) + if (h->pdev->msi_enabled || h->pdev->msix_enabled) return true; /* Read outbound doorbell to flush */ diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index a391a3cfb3fe..184887af4b9f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3119,7 +3119,7 @@ static int raw_cmd_copyin(int cmd, void __user *param, *rcmd = NULL; loop: - ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER); + ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_KERNEL); if (!ptr) return -ENOMEM; *rcmd = ptr; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f347285c67ec..304377182c1a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1097,9 +1097,12 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) return -EINVAL; + /* I/O need to be drained during transfer transition */ + blk_mq_freeze_queue(lo->lo_queue); + err = loop_release_xfer(lo); if (err) - return err; + goto exit; if (info->lo_encrypt_type) { unsigned int type = info->lo_encrypt_type; @@ -1114,12 +1117,14 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) err = loop_init_xfer(lo, xfer, info); if (err) - return err; + goto exit; if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) - return -EFBIG; + if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { + err = -EFBIG; + goto exit; + } loop_config_discard(lo); @@ -1156,7 +1161,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) /* update dio if lo_offset or transfer is changed */ __loop_update_dio(lo, lo->use_dio); - return 0; + exit: + blk_mq_unfreeze_queue(lo->lo_queue); + return err; } static int diff --git a/drivers/block/null_blk.c 
b/drivers/block/null_blk.c index c0e14e54909b..a67b7ea1e3bf 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -420,7 +420,8 @@ static void null_lnvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; - nvm_end_io(rqd, error); + rqd->error = error; + nvm_end_io(rqd); blk_put_request(rq); } @@ -460,7 +461,6 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) id->ver_id = 0x1; id->vmnt = 0; - id->cgrps = 1; id->cap = 0x2; id->dom = 0x1; @@ -479,7 +479,7 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) sector_div(size, bs); /* convert size to pages */ size >>= 8; /* concert size to pgs pr blk */ - grp = &id->groups[0]; + grp = &id->grp; grp->mtype = 0; grp->fmtype = 0; grp->num_ch = 1; diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 5fd2d0e25567..10aed84244f5 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -273,7 +273,7 @@ static const struct block_device_operations pcd_bdops = { .check_events = pcd_block_check_events, }; -static struct cdrom_device_ops pcd_dops = { +static const struct cdrom_device_ops pcd_dops = { .open = pcd_open, .release = pcd_release, .drive_status = pcd_drive_status, diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 59cca72647a6..bbbd3caa927c 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -342,8 +342,8 @@ static void cdrom_sysctl_register(void); static LIST_HEAD(cdrom_list); -static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, - struct packet_command *cgc) +int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, + struct packet_command *cgc) { if (cgc->sense) { cgc->sense->sense_key = 0x05; @@ -354,6 +354,7 @@ static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, cgc->stat = -EIO; return -EIO; } +EXPORT_SYMBOL(cdrom_dummy_generic_packet); static int cdrom_flush_cache(struct cdrom_device_info *cdi) { @@ -371,7 +372,7 @@ static int 
cdrom_flush_cache(struct cdrom_device_info *cdi) static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; int ret, buflen; @@ -586,7 +587,7 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space) int register_cdrom(struct cdrom_device_info *cdi) { static char banner_printed; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; int *change_capability = (int *)&cdo->capability; /* hack */ cd_dbg(CD_OPEN, "entering register_cdrom\n"); @@ -610,7 +611,6 @@ int register_cdrom(struct cdrom_device_info *cdi) ENSURE(reset, CDC_RESET); ENSURE(generic_packet, CDC_GENERIC_PACKET); cdi->mc_flags = 0; - cdo->n_minors = 0; cdi->options = CDO_USE_FFLAGS; if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY)) @@ -630,8 +630,7 @@ int register_cdrom(struct cdrom_device_info *cdi) else cdi->cdda_method = CDDA_OLD; - if (!cdo->generic_packet) - cdo->generic_packet = cdrom_dummy_generic_packet; + WARN_ON(!cdo->generic_packet); cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name); mutex_lock(&cdrom_mutex); @@ -652,7 +651,6 @@ void unregister_cdrom(struct cdrom_device_info *cdi) if (cdi->exit) cdi->exit(cdi); - cdi->ops->n_minors--; cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name); } @@ -1036,7 +1034,7 @@ static int open_for_data(struct cdrom_device_info *cdi) { int ret; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; tracktype tracks; cd_dbg(CD_OPEN, "entering open_for_data\n"); /* Check if the driver can report drive status. If it can, we @@ -1198,8 +1196,8 @@ err: /* This code is similar to that in open_for_data. The routine is called whenever an audio play operation is requested. 
*/ -static int check_for_audio_disc(struct cdrom_device_info * cdi, - struct cdrom_device_ops * cdo) +static int check_for_audio_disc(struct cdrom_device_info *cdi, + const struct cdrom_device_ops *cdo) { int ret; tracktype tracks; @@ -1254,7 +1252,7 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi, void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; int opened_for_data; cd_dbg(CD_CLOSE, "entering cdrom_release\n"); @@ -1294,7 +1292,7 @@ static int cdrom_read_mech_status(struct cdrom_device_info *cdi, struct cdrom_changer_info *buf) { struct packet_command cgc; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; int length; /* @@ -1643,7 +1641,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) int ret; u_char buf[20]; struct packet_command cgc; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; rpc_state_t rpc_state; memset(buf, 0, sizeof(buf)); @@ -1791,7 +1789,7 @@ static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s, { unsigned char buf[21], *base; struct dvd_layer *layer; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; int ret, layer_num = s->physical.layer_num; if (layer_num >= DVD_LAYERS) @@ -1842,7 +1840,7 @@ static int dvd_read_copyright(struct cdrom_device_info *cdi, dvd_struct *s, { int ret; u_char buf[8]; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; init_cdrom_command(cgc, buf, sizeof(buf), CGC_DATA_READ); cgc->cmd[0] = GPCMD_READ_DVD_STRUCTURE; @@ -1866,7 +1864,7 @@ static int dvd_read_disckey(struct cdrom_device_info *cdi, dvd_struct *s, { int ret, size; u_char *buf; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; size = sizeof(s->disckey.value) + 4; @@ -1894,7 +1892,7 @@ 
static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s, { int ret, size = 4 + 188; u_char *buf; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; buf = kmalloc(size, GFP_KERNEL); if (!buf) @@ -1928,7 +1926,7 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s, { int ret = 0, size; u_char *buf; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; size = sizeof(s->manufact.value) + 4; @@ -1995,7 +1993,7 @@ int cdrom_mode_sense(struct cdrom_device_info *cdi, struct packet_command *cgc, int page_code, int page_control) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; memset(cgc->cmd, 0, sizeof(cgc->cmd)); @@ -2010,7 +2008,7 @@ int cdrom_mode_sense(struct cdrom_device_info *cdi, int cdrom_mode_select(struct cdrom_device_info *cdi, struct packet_command *cgc) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; memset(cgc->cmd, 0, sizeof(cgc->cmd)); memset(cgc->buffer, 0, 2); @@ -2025,7 +2023,7 @@ int cdrom_mode_select(struct cdrom_device_info *cdi, static int cdrom_read_subchannel(struct cdrom_device_info *cdi, struct cdrom_subchnl *subchnl, int mcn) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; char buffer[32]; int ret; @@ -2073,7 +2071,7 @@ static int cdrom_read_cd(struct cdrom_device_info *cdi, struct packet_command *cgc, int lba, int blocksize, int nblocks) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; memset(&cgc->cmd, 0, sizeof(cgc->cmd)); cgc->cmd[0] = GPCMD_READ_10; @@ -2093,7 +2091,7 @@ static int cdrom_read_block(struct cdrom_device_info *cdi, struct packet_command *cgc, int lba, int nblocks, int format, int blksize) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; memset(&cgc->cmd, 0, 
sizeof(cgc->cmd)); cgc->cmd[0] = GPCMD_READ_CD; @@ -2764,7 +2762,7 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi, */ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; struct modesel_head mh; @@ -2790,7 +2788,7 @@ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size) static int cdrom_get_track_info(struct cdrom_device_info *cdi, __u16 track, __u8 type, track_information *ti) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; int ret, buflen; @@ -3049,7 +3047,7 @@ static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi, void __user *arg, struct packet_command *cgc) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct cdrom_msf msf; cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf))) @@ -3069,7 +3067,7 @@ static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi, void __user *arg, struct packet_command *cgc) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct cdrom_blk blk; cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n"); if (copy_from_user(&blk, (struct cdrom_blk __user *)arg, sizeof(blk))) @@ -3164,7 +3162,7 @@ static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi, struct packet_command *cgc, int cmd) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; cd_dbg(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n"); cgc->cmd[0] = GPCMD_START_STOP_UNIT; cgc->cmd[1] = 1; @@ -3177,7 +3175,7 @@ static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi, struct packet_command *cgc, int cmd) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct 
cdrom_device_ops *cdo = cdi->ops; cd_dbg(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n"); cgc->cmd[0] = GPCMD_PAUSE_RESUME; cgc->cmd[8] = (cmd == CDROMRESUME) ? 1 : 0; diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 584bc3126403..1afab6558d0c 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -481,7 +481,7 @@ static int gdrom_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, return -EINVAL; } -static struct cdrom_device_ops gdrom_ops = { +static const struct cdrom_device_ops gdrom_ops = { .open = gdrom_open, .release = gdrom_release, .drive_status = gdrom_drivestatus, @@ -489,9 +489,9 @@ static struct cdrom_device_ops gdrom_ops = { .get_last_session = gdrom_get_last_session, .reset = gdrom_hardreset, .audio_ioctl = gdrom_audio_ioctl, + .generic_packet = cdrom_dummy_generic_packet, .capability = CDC_MULTI_SESSION | CDC_MEDIA_CHANGED | CDC_RESET | CDC_DRIVE_STATUS | CDC_CD_R, - .n_minors = 1, }; static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode) @@ -807,16 +807,20 @@ static int probe_gdrom(struct platform_device *devptr) if (err) goto probe_fail_cmdirq_register; gd.gdrom_rq = blk_init_queue(gdrom_request, &gdrom_lock); - if (!gd.gdrom_rq) + if (!gd.gdrom_rq) { + err = -ENOMEM; goto probe_fail_requestq; + } err = probe_gdrom_setupqueue(); if (err) goto probe_fail_toc; gd.toc = kzalloc(sizeof(struct gdromtoc), GFP_KERNEL); - if (!gd.toc) + if (!gd.toc) { + err = -ENOMEM; goto probe_fail_toc; + } add_disk(gd.disk); return 0; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 9cbd217bc0c9..ab9232e1e16f 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1166,7 +1166,7 @@ void ide_cdrom_update_speed(ide_drive_t *drive, u8 *buf) CDC_CD_RW | CDC_DVD | CDC_DVD_R | CDC_DVD_RAM | CDC_GENERIC_PACKET | \ CDC_MO_DRIVE | CDC_MRW | CDC_MRW_W | CDC_RAM) -static struct cdrom_device_ops ide_cdrom_dops = { +static const struct cdrom_device_ops ide_cdrom_dops = { .open = ide_cdrom_open_real, .release 
= ide_cdrom_release_real, .drive_status = ide_cdrom_drive_status, diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 2f5d5f4a4c75..052714106b7b 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -26,15 +26,6 @@ config NVM_DEBUG It is required to create/remove targets without IOCTLs. -config NVM_GENNVM - tristate "General Non-Volatile Memory Manager for Open-Channel SSDs" - ---help--- - Non-volatile memory media manager for Open-Channel SSDs that implements - physical media metadata management and block provisioning API. - - This is the standard media manager for using Open-Channel SSDs, and - required for targets to be instantiated. - config NVM_RRPC tristate "Round-robin Hybrid Open-Channel SSD target" ---help--- diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index a7a0a22cf1a5..b2a39e2d2895 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -2,6 +2,5 @@ # Makefile for Open-Channel SSDs. # -obj-$(CONFIG_NVM) := core.o sysblk.o -obj-$(CONFIG_NVM_GENNVM) += gennvm.o +obj-$(CONFIG_NVM) := core.o obj-$(CONFIG_NVM_RRPC) += rrpc.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 02240a0b39c9..5262ba66a7a7 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -29,10 +29,483 @@ static LIST_HEAD(nvm_tgt_types); static DECLARE_RWSEM(nvm_tgtt_lock); -static LIST_HEAD(nvm_mgrs); static LIST_HEAD(nvm_devices); static DECLARE_RWSEM(nvm_lock); +/* Map between virtual and physical channel and lun */ +struct nvm_ch_map { + int ch_off; + int nr_luns; + int *lun_offs; +}; + +struct nvm_dev_map { + struct nvm_ch_map *chnls; + int nr_chnls; +}; + +struct nvm_area { + struct list_head list; + sector_t begin; + sector_t end; /* end is excluded */ +}; + +static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) +{ + struct nvm_target *tgt; + + list_for_each_entry(tgt, &dev->targets, list) + if (!strcmp(name, tgt->disk->disk_name)) + return tgt; + 
+ return NULL; +} + +static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) +{ + int i; + + for (i = lun_begin; i <= lun_end; i++) { + if (test_and_set_bit(i, dev->lun_map)) { + pr_err("nvm: lun %d already allocated\n", i); + goto err; + } + } + + return 0; +err: + while (--i > lun_begin) + clear_bit(i, dev->lun_map); + + return -EBUSY; +} + +static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin, + int lun_end) +{ + int i; + + for (i = lun_begin; i <= lun_end; i++) + WARN_ON(!test_and_clear_bit(i, dev->lun_map)); +} + +static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_dev_map *dev_map = tgt_dev->map; + int i, j; + + for (i = 0; i < dev_map->nr_chnls; i++) { + struct nvm_ch_map *ch_map = &dev_map->chnls[i]; + int *lun_offs = ch_map->lun_offs; + int ch = i + ch_map->ch_off; + + for (j = 0; j < ch_map->nr_luns; j++) { + int lun = j + lun_offs[j]; + int lunid = (ch * dev->geo.luns_per_chnl) + lun; + + WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); + } + + kfree(ch_map->lun_offs); + } + + kfree(dev_map->chnls); + kfree(dev_map); + + kfree(tgt_dev->luns); + kfree(tgt_dev); +} + +static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, + int lun_begin, int lun_end) +{ + struct nvm_tgt_dev *tgt_dev = NULL; + struct nvm_dev_map *dev_rmap = dev->rmap; + struct nvm_dev_map *dev_map; + struct ppa_addr *luns; + int nr_luns = lun_end - lun_begin + 1; + int luns_left = nr_luns; + int nr_chnls = nr_luns / dev->geo.luns_per_chnl; + int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; + int bch = lun_begin / dev->geo.luns_per_chnl; + int blun = lun_begin % dev->geo.luns_per_chnl; + int lunid = 0; + int lun_balanced = 1; + int prev_nr_luns; + int i, j; + + nr_chnls = nr_luns / dev->geo.luns_per_chnl; + nr_chnls = (nr_chnls_mod == 0) ? 
nr_chnls : nr_chnls + 1; + + dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); + if (!dev_map) + goto err_dev; + + dev_map->chnls = kcalloc(nr_chnls, sizeof(struct nvm_ch_map), + GFP_KERNEL); + if (!dev_map->chnls) + goto err_chnls; + + luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL); + if (!luns) + goto err_luns; + + prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? + dev->geo.luns_per_chnl : luns_left; + for (i = 0; i < nr_chnls; i++) { + struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; + int *lun_roffs = ch_rmap->lun_offs; + struct nvm_ch_map *ch_map = &dev_map->chnls[i]; + int *lun_offs; + int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? + dev->geo.luns_per_chnl : luns_left; + + if (lun_balanced && prev_nr_luns != luns_in_chnl) + lun_balanced = 0; + + ch_map->ch_off = ch_rmap->ch_off = bch; + ch_map->nr_luns = luns_in_chnl; + + lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); + if (!lun_offs) + goto err_ch; + + for (j = 0; j < luns_in_chnl; j++) { + luns[lunid].ppa = 0; + luns[lunid].g.ch = i; + luns[lunid++].g.lun = j; + + lun_offs[j] = blun; + lun_roffs[j + blun] = blun; + } + + ch_map->lun_offs = lun_offs; + + /* when starting a new channel, lun offset is reset */ + blun = 0; + luns_left -= luns_in_chnl; + } + + dev_map->nr_chnls = nr_chnls; + + tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL); + if (!tgt_dev) + goto err_ch; + + memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); + /* Target device only owns a portion of the physical device */ + tgt_dev->geo.nr_chnls = nr_chnls; + tgt_dev->geo.nr_luns = nr_luns; + tgt_dev->geo.luns_per_chnl = (lun_balanced) ? 
prev_nr_luns : -1; + tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; + tgt_dev->q = dev->q; + tgt_dev->map = dev_map; + tgt_dev->luns = luns; + memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id)); + + tgt_dev->parent = dev; + + return tgt_dev; +err_ch: + while (--i > 0) + kfree(dev_map->chnls[i].lun_offs); + kfree(luns); +err_luns: + kfree(dev_map->chnls); +err_chnls: + kfree(dev_map); +err_dev: + return tgt_dev; +} + +static const struct block_device_operations nvm_fops = { + .owner = THIS_MODULE, +}; + +static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) +{ + struct nvm_ioctl_create_simple *s = &create->conf.s; + struct request_queue *tqueue; + struct gendisk *tdisk; + struct nvm_tgt_type *tt; + struct nvm_target *t; + struct nvm_tgt_dev *tgt_dev; + void *targetdata; + + tt = nvm_find_target_type(create->tgttype, 1); + if (!tt) { + pr_err("nvm: target type %s not found\n", create->tgttype); + return -EINVAL; + } + + mutex_lock(&dev->mlock); + t = nvm_find_target(dev, create->tgtname); + if (t) { + pr_err("nvm: target name already exists.\n"); + mutex_unlock(&dev->mlock); + return -EINVAL; + } + mutex_unlock(&dev->mlock); + + if (nvm_reserve_luns(dev, s->lun_begin, s->lun_end)) + return -ENOMEM; + + t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); + if (!t) + goto err_reserve; + + tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end); + if (!tgt_dev) { + pr_err("nvm: could not create target device\n"); + goto err_t; + } + + tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); + if (!tqueue) + goto err_dev; + blk_queue_make_request(tqueue, tt->make_rq); + + tdisk = alloc_disk(0); + if (!tdisk) + goto err_queue; + + sprintf(tdisk->disk_name, "%s", create->tgtname); + tdisk->flags = GENHD_FL_EXT_DEVT; + tdisk->major = 0; + tdisk->first_minor = 0; + tdisk->fops = &nvm_fops; + tdisk->queue = tqueue; + + targetdata = tt->init(tgt_dev, tdisk); + if (IS_ERR(targetdata)) + goto err_init; + + 
tdisk->private_data = targetdata; + tqueue->queuedata = targetdata; + + blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect); + + set_capacity(tdisk, tt->capacity(targetdata)); + add_disk(tdisk); + + if (tt->sysfs_init && tt->sysfs_init(tdisk)) + goto err_sysfs; + + t->type = tt; + t->disk = tdisk; + t->dev = tgt_dev; + + mutex_lock(&dev->mlock); + list_add_tail(&t->list, &dev->targets); + mutex_unlock(&dev->mlock); + + return 0; +err_sysfs: + if (tt->exit) + tt->exit(targetdata); +err_init: + put_disk(tdisk); +err_queue: + blk_cleanup_queue(tqueue); +err_dev: + nvm_remove_tgt_dev(tgt_dev); +err_t: + kfree(t); +err_reserve: + nvm_release_luns_err(dev, s->lun_begin, s->lun_end); + return -ENOMEM; +} + +static void __nvm_remove_target(struct nvm_target *t) +{ + struct nvm_tgt_type *tt = t->type; + struct gendisk *tdisk = t->disk; + struct request_queue *q = tdisk->queue; + + del_gendisk(tdisk); + blk_cleanup_queue(q); + + if (tt->sysfs_exit) + tt->sysfs_exit(tdisk); + + if (tt->exit) + tt->exit(tdisk->private_data); + + nvm_remove_tgt_dev(t->dev); + put_disk(tdisk); + + list_del(&t->list); + kfree(t); +} + +/** + * nvm_remove_tgt - Removes a target from the media manager + * @dev: device + * @remove: ioctl structure with target name to remove. 
+ * + * Returns: + * 0: on success + * 1: on not found + * <0: on error + */ +static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) +{ + struct nvm_target *t; + + mutex_lock(&dev->mlock); + t = nvm_find_target(dev, remove->tgtname); + if (!t) { + mutex_unlock(&dev->mlock); + return 1; + } + __nvm_remove_target(t); + mutex_unlock(&dev->mlock); + + return 0; +} + +static int nvm_register_map(struct nvm_dev *dev) +{ + struct nvm_dev_map *rmap; + int i, j; + + rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); + if (!rmap) + goto err_rmap; + + rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct nvm_ch_map), + GFP_KERNEL); + if (!rmap->chnls) + goto err_chnls; + + for (i = 0; i < dev->geo.nr_chnls; i++) { + struct nvm_ch_map *ch_rmap; + int *lun_roffs; + int luns_in_chnl = dev->geo.luns_per_chnl; + + ch_rmap = &rmap->chnls[i]; + + ch_rmap->ch_off = -1; + ch_rmap->nr_luns = luns_in_chnl; + + lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); + if (!lun_roffs) + goto err_ch; + + for (j = 0; j < luns_in_chnl; j++) + lun_roffs[j] = -1; + + ch_rmap->lun_offs = lun_roffs; + } + + dev->rmap = rmap; + + return 0; +err_ch: + while (--i >= 0) + kfree(rmap->chnls[i].lun_offs); +err_chnls: + kfree(rmap); +err_rmap: + return -ENOMEM; +} + +static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) +{ + struct nvm_dev_map *dev_map = tgt_dev->map; + struct nvm_ch_map *ch_map = &dev_map->chnls[p->g.ch]; + int lun_off = ch_map->lun_offs[p->g.lun]; + + p->g.ch += ch_map->ch_off; + p->g.lun += lun_off; +} + +static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_dev_map *dev_rmap = dev->rmap; + struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch]; + int lun_roff = ch_rmap->lun_offs[p->g.lun]; + + p->g.ch -= ch_rmap->ch_off; + p->g.lun -= lun_roff; +} + +static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr *ppa_list, int 
nr_ppas) +{ + int i; + + for (i = 0; i < nr_ppas; i++) { + nvm_map_to_dev(tgt_dev, &ppa_list[i]); + ppa_list[i] = generic_to_dev_addr(tgt_dev, ppa_list[i]); + } +} + +static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr *ppa_list, int nr_ppas) +{ + int i; + + for (i = 0; i < nr_ppas; i++) { + ppa_list[i] = dev_to_generic_addr(tgt_dev, ppa_list[i]); + nvm_map_to_tgt(tgt_dev, &ppa_list[i]); + } +} + +static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +{ + if (rqd->nr_ppas == 1) { + nvm_ppa_tgt_to_dev(tgt_dev, &rqd->ppa_addr, 1); + return; + } + + nvm_ppa_tgt_to_dev(tgt_dev, rqd->ppa_list, rqd->nr_ppas); +} + +static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +{ + if (rqd->nr_ppas == 1) { + nvm_ppa_dev_to_tgt(tgt_dev, &rqd->ppa_addr, 1); + return; + } + + nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); +} + +void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, + int len) +{ + struct nvm_geo *geo = &dev->geo; + struct nvm_dev_map *dev_rmap = dev->rmap; + u64 i; + + for (i = 0; i < len; i++) { + struct nvm_ch_map *ch_rmap; + int *lun_roffs; + struct ppa_addr gaddr; + u64 pba = le64_to_cpu(entries[i]); + int off; + u64 diff; + + if (!pba) + continue; + + gaddr = linear_to_generic_addr(geo, pba); + ch_rmap = &dev_rmap->chnls[gaddr.g.ch]; + lun_roffs = ch_rmap->lun_offs; + + off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun; + + diff = ((ch_rmap->ch_off * geo->luns_per_chnl) + + (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun; + + entries[i] -= cpu_to_le64(diff); + } +} +EXPORT_SYMBOL(nvm_part_to_tgt); + struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) { struct nvm_tgt_type *tmp, *tt = NULL; @@ -92,78 +565,6 @@ void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler) } EXPORT_SYMBOL(nvm_dev_dma_free); -static struct nvmm_type *nvm_find_mgr_type(const char *name) -{ - struct nvmm_type *mt; - - list_for_each_entry(mt, &nvm_mgrs, list) 
- if (!strcmp(name, mt->name)) - return mt; - - return NULL; -} - -static struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) -{ - struct nvmm_type *mt; - int ret; - - lockdep_assert_held(&nvm_lock); - - list_for_each_entry(mt, &nvm_mgrs, list) { - if (strncmp(dev->sb.mmtype, mt->name, NVM_MMTYPE_LEN)) - continue; - - ret = mt->register_mgr(dev); - if (ret < 0) { - pr_err("nvm: media mgr failed to init (%d) on dev %s\n", - ret, dev->name); - return NULL; /* initialization failed */ - } else if (ret > 0) - return mt; - } - - return NULL; -} - -int nvm_register_mgr(struct nvmm_type *mt) -{ - struct nvm_dev *dev; - int ret = 0; - - down_write(&nvm_lock); - if (nvm_find_mgr_type(mt->name)) { - ret = -EEXIST; - goto finish; - } else { - list_add(&mt->list, &nvm_mgrs); - } - - /* try to register media mgr if any device have none configured */ - list_for_each_entry(dev, &nvm_devices, devices) { - if (dev->mt) - continue; - - dev->mt = nvm_init_mgr(dev); - } -finish: - up_write(&nvm_lock); - - return ret; -} -EXPORT_SYMBOL(nvm_register_mgr); - -void nvm_unregister_mgr(struct nvmm_type *mt) -{ - if (!mt) - return; - - down_write(&nvm_lock); - list_del(&mt->list); - up_write(&nvm_lock); -} -EXPORT_SYMBOL(nvm_unregister_mgr); - static struct nvm_dev *nvm_find_nvm_dev(const char *name) { struct nvm_dev *dev; @@ -175,53 +576,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } -static void nvm_tgt_generic_to_addr_mode(struct nvm_tgt_dev *tgt_dev, - struct nvm_rq *rqd) -{ - struct nvm_dev *dev = tgt_dev->parent; - int i; - - if (rqd->nr_ppas > 1) { - for (i = 0; i < rqd->nr_ppas; i++) { - rqd->ppa_list[i] = dev->mt->trans_ppa(tgt_dev, - rqd->ppa_list[i], TRANS_TGT_TO_DEV); - rqd->ppa_list[i] = generic_to_dev_addr(dev, - rqd->ppa_list[i]); - } - } else { - rqd->ppa_addr = dev->mt->trans_ppa(tgt_dev, rqd->ppa_addr, - TRANS_TGT_TO_DEV); - rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); - } -} - -int nvm_set_bb_tbl(struct nvm_dev *dev, struct 
ppa_addr *ppas, int nr_ppas, - int type) -{ - struct nvm_rq rqd; - int ret; - - if (nr_ppas > dev->ops->max_phys_sect) { - pr_err("nvm: unable to update all sysblocks atomically\n"); - return -EINVAL; - } - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1); - nvm_generic_to_addr_mode(dev, &rqd); - - ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); - nvm_free_rqd_ppalist(dev, &rqd); - if (ret) { - pr_err("nvm: sysblk failed bb mark\n"); - return -EINVAL; - } - - return 0; -} -EXPORT_SYMBOL(nvm_set_bb_tbl); - int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas, int type) { @@ -237,12 +591,12 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, memset(&rqd, 0, sizeof(struct nvm_rq)); nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1); - nvm_tgt_generic_to_addr_mode(tgt_dev, &rqd); + nvm_rq_tgt_to_dev(tgt_dev, &rqd); ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); nvm_free_rqd_ppalist(dev, &rqd); if (ret) { - pr_err("nvm: sysblk failed bb mark\n"); + pr_err("nvm: failed bb mark\n"); return -EINVAL; } @@ -262,15 +616,42 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { struct nvm_dev *dev = tgt_dev->parent; - return dev->mt->submit_io(tgt_dev, rqd); + if (!dev->ops->submit_io) + return -ENODEV; + + nvm_rq_tgt_to_dev(tgt_dev, rqd); + + rqd->dev = tgt_dev; + return dev->ops->submit_io(dev, rqd); } EXPORT_SYMBOL(nvm_submit_io); -int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, int flags) +int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags) { struct nvm_dev *dev = tgt_dev->parent; + struct nvm_rq rqd; + int ret; + + if (!dev->ops->erase_block) + return 0; + + nvm_map_to_dev(tgt_dev, ppas); + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, 1, 1); + if (ret) + return ret; + + nvm_rq_tgt_to_dev(tgt_dev, &rqd); + + rqd.flags = 
flags; + + ret = dev->ops->erase_block(dev, &rqd); - return dev->mt->erase_blk(tgt_dev, p, flags); + nvm_free_rqd_ppalist(dev, &rqd); + + return ret; } EXPORT_SYMBOL(nvm_erase_blk); @@ -289,46 +670,67 @@ EXPORT_SYMBOL(nvm_get_l2p_tbl); int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len) { struct nvm_dev *dev = tgt_dev->parent; + struct nvm_geo *geo = &dev->geo; + struct nvm_area *area, *prev, *next; + sector_t begin = 0; + sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9; - return dev->mt->get_area(dev, lba, len); -} -EXPORT_SYMBOL(nvm_get_area); + if (len > max_sectors) + return -EINVAL; -void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t lba) -{ - struct nvm_dev *dev = tgt_dev->parent; + area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL); + if (!area) + return -ENOMEM; - dev->mt->put_area(dev, lba); -} -EXPORT_SYMBOL(nvm_put_area); + prev = NULL; -void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - int i; + spin_lock(&dev->lock); + list_for_each_entry(next, &dev->area_list, list) { + if (begin + len > next->begin) { + begin = next->end; + prev = next; + continue; + } + break; + } - if (rqd->nr_ppas > 1) { - for (i = 0; i < rqd->nr_ppas; i++) - rqd->ppa_list[i] = dev_to_generic_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); + if ((begin + len) > max_sectors) { + spin_unlock(&dev->lock); + kfree(area); + return -EINVAL; } + + area->begin = *lba = begin; + area->end = begin + len; + + if (prev) /* insert into sorted order */ + list_add(&area->list, &prev->list); + else + list_add(&area->list, &dev->area_list); + spin_unlock(&dev->lock); + + return 0; } -EXPORT_SYMBOL(nvm_addr_to_generic_mode); +EXPORT_SYMBOL(nvm_get_area); -void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) +void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) { - int i; + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_area *area; - if 
(rqd->nr_ppas > 1) { - for (i = 0; i < rqd->nr_ppas; i++) - rqd->ppa_list[i] = generic_to_dev_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); + spin_lock(&dev->lock); + list_for_each_entry(area, &dev->area_list, list) { + if (area->begin != begin) + continue; + + list_del(&area->list); + spin_unlock(&dev->lock); + kfree(area); + return; } + spin_unlock(&dev->lock); } -EXPORT_SYMBOL(nvm_generic_to_addr_mode); +EXPORT_SYMBOL(nvm_put_area); int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, const struct ppa_addr *ppas, int nr_ppas, int vblk) @@ -380,149 +782,19 @@ void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_free_rqd_ppalist); -int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas, - int flags) +void nvm_end_io(struct nvm_rq *rqd) { - struct nvm_rq rqd; - int ret; + struct nvm_tgt_dev *tgt_dev = rqd->dev; - if (!dev->ops->erase_block) - return 0; + /* Convert address space */ + if (tgt_dev) + nvm_rq_dev_to_tgt(tgt_dev, rqd); - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1); - if (ret) - return ret; - - nvm_generic_to_addr_mode(dev, &rqd); - - rqd.flags = flags; - - ret = dev->ops->erase_block(dev, &rqd); - - nvm_free_rqd_ppalist(dev, &rqd); - - return ret; -} -EXPORT_SYMBOL(nvm_erase_ppa); - -void nvm_end_io(struct nvm_rq *rqd, int error) -{ - rqd->error = error; - rqd->end_io(rqd); + if (rqd->end_io) + rqd->end_io(rqd); } EXPORT_SYMBOL(nvm_end_io); -static void nvm_end_io_sync(struct nvm_rq *rqd) -{ - struct completion *waiting = rqd->wait; - - rqd->wait = NULL; - - complete(waiting); -} - -static int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, - int flags, void *buf, int len) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct bio *bio; - int ret; - unsigned long hang_check; - - bio = bio_map_kern(dev->q, buf, len, GFP_KERNEL); - if (IS_ERR_OR_NULL(bio)) - return 
-ENOMEM; - - nvm_generic_to_addr_mode(dev, rqd); - - rqd->dev = NULL; - rqd->opcode = opcode; - rqd->flags = flags; - rqd->bio = bio; - rqd->wait = &wait; - rqd->end_io = nvm_end_io_sync; - - ret = dev->ops->submit_io(dev, rqd); - if (ret) { - bio_put(bio); - return ret; - } - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - if (hang_check) - while (!wait_for_completion_io_timeout(&wait, - hang_check * (HZ/2))) - ; - else - wait_for_completion_io(&wait); - - return rqd->error; -} - -/** - * nvm_submit_ppa_list - submit user-defined ppa list to device. The user must - * take to free ppa list if necessary. - * @dev: device - * @ppa_list: user created ppa_list - * @nr_ppas: length of ppa_list - * @opcode: device opcode - * @flags: device flags - * @buf: data buffer - * @len: data buffer length - */ -int nvm_submit_ppa_list(struct nvm_dev *dev, struct ppa_addr *ppa_list, - int nr_ppas, int opcode, int flags, void *buf, int len) -{ - struct nvm_rq rqd; - - if (dev->ops->max_phys_sect < nr_ppas) - return -EINVAL; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - rqd.nr_ppas = nr_ppas; - if (nr_ppas > 1) - rqd.ppa_list = ppa_list; - else - rqd.ppa_addr = ppa_list[0]; - - return __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len); -} -EXPORT_SYMBOL(nvm_submit_ppa_list); - -/** - * nvm_submit_ppa - submit PPAs to device. PPAs will automatically be unfolded - * as single, dual, quad plane PPAs depending on device type. 
- * @dev: device - * @ppa: user created ppa_list - * @nr_ppas: length of ppa_list - * @opcode: device opcode - * @flags: device flags - * @buf: data buffer - * @len: data buffer length - */ -int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, - int opcode, int flags, void *buf, int len) -{ - struct nvm_rq rqd; - int ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas, 1); - if (ret) - return ret; - - ret = __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len); - - nvm_free_rqd_ppalist(dev, &rqd); - - return ret; -} -EXPORT_SYMBOL(nvm_submit_ppa); - /* * folds a bad block list from its plane representation to its virtual * block representation. The fold is done in place and reduced size is @@ -559,21 +831,14 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) } EXPORT_SYMBOL(nvm_bb_tbl_fold); -int nvm_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, u8 *blks) -{ - ppa = generic_to_dev_addr(dev, ppa); - - return dev->ops->get_bb_tbl(dev, ppa, blks); -} -EXPORT_SYMBOL(nvm_get_bb_tbl); - int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, u8 *blks) { struct nvm_dev *dev = tgt_dev->parent; - ppa = dev->mt->trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV); - return nvm_get_bb_tbl(dev, ppa, blks); + nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1); + + return dev->ops->get_bb_tbl(dev, ppa, blks); } EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); @@ -627,7 +892,7 @@ static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) static int nvm_core_init(struct nvm_dev *dev) { struct nvm_id *id = &dev->identity; - struct nvm_id_group *grp = &id->groups[0]; + struct nvm_id_group *grp = &id->grp; struct nvm_geo *geo = &dev->geo; int ret; @@ -691,36 +956,31 @@ static int nvm_core_init(struct nvm_dev *dev) goto err_fmtype; } + INIT_LIST_HEAD(&dev->area_list); + INIT_LIST_HEAD(&dev->targets); mutex_init(&dev->mlock); spin_lock_init(&dev->lock); - blk_queue_logical_block_size(dev->q, 
geo->sec_size); + ret = nvm_register_map(dev); + if (ret) + goto err_fmtype; + blk_queue_logical_block_size(dev->q, geo->sec_size); return 0; err_fmtype: kfree(dev->lun_map); return ret; } -static void nvm_free_mgr(struct nvm_dev *dev) -{ - if (!dev->mt) - return; - - dev->mt->unregister_mgr(dev); - dev->mt = NULL; -} - void nvm_free(struct nvm_dev *dev) { if (!dev) return; - nvm_free_mgr(dev); - if (dev->dma_pool) dev->ops->destroy_dma_pool(dev->dma_pool); + kfree(dev->rmap); kfree(dev->lptbl); kfree(dev->lun_map); kfree(dev); @@ -731,28 +991,19 @@ static int nvm_init(struct nvm_dev *dev) struct nvm_geo *geo = &dev->geo; int ret = -EINVAL; - if (!dev->q || !dev->ops) - return ret; - if (dev->ops->identity(dev, &dev->identity)) { pr_err("nvm: device could not be identified\n"); goto err; } - pr_debug("nvm: ver:%x nvm_vendor:%x groups:%u\n", - dev->identity.ver_id, dev->identity.vmnt, - dev->identity.cgrps); + pr_debug("nvm: ver:%x nvm_vendor:%x\n", + dev->identity.ver_id, dev->identity.vmnt); if (dev->identity.ver_id != 1) { pr_err("nvm: device not supported by kernel."); goto err; } - if (dev->identity.cgrps != 1) { - pr_err("nvm: only one group configuration supported."); - goto err; - } - ret = nvm_core_init(dev); if (ret) { pr_err("nvm: could not initialize core structures.\n"); @@ -779,49 +1030,50 @@ int nvm_register(struct nvm_dev *dev) { int ret; - ret = nvm_init(dev); - if (ret) - goto err_init; + if (!dev->q || !dev->ops) + return -EINVAL; if (dev->ops->max_phys_sect > 256) { pr_info("nvm: max sectors supported is 256.\n"); - ret = -EINVAL; - goto err_init; + return -EINVAL; } if (dev->ops->max_phys_sect > 1) { dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist"); if (!dev->dma_pool) { pr_err("nvm: could not create dma pool\n"); - ret = -ENOMEM; - goto err_init; + return -ENOMEM; } } - if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { - ret = nvm_get_sysblock(dev, &dev->sb); - if (!ret) - pr_err("nvm: device not initialized.\n"); - else if (ret < 0) - 
pr_err("nvm: err (%d) on device initialization\n", ret); - } + ret = nvm_init(dev); + if (ret) + goto err_init; /* register device with a supported media manager */ down_write(&nvm_lock); - if (ret > 0) - dev->mt = nvm_init_mgr(dev); list_add(&dev->devices, &nvm_devices); up_write(&nvm_lock); return 0; err_init: - kfree(dev->lun_map); + dev->ops->destroy_dma_pool(dev->dma_pool); return ret; } EXPORT_SYMBOL(nvm_register); void nvm_unregister(struct nvm_dev *dev) { + struct nvm_target *t, *tmp; + + mutex_lock(&dev->mlock); + list_for_each_entry_safe(t, tmp, &dev->targets, list) { + if (t->dev->parent != dev) + continue; + __nvm_remove_target(t); + } + mutex_unlock(&dev->mlock); + down_write(&nvm_lock); list_del(&dev->devices); up_write(&nvm_lock); @@ -844,24 +1096,24 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) return -EINVAL; } - if (!dev->mt) { - pr_info("nvm: device has no media manager registered.\n"); - return -ENODEV; - } - if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) { pr_err("nvm: config type not valid\n"); return -EINVAL; } s = &create->conf.s; - if (s->lun_begin > s->lun_end || s->lun_end > dev->geo.nr_luns) { + if (s->lun_begin == -1 && s->lun_end == -1) { + s->lun_begin = 0; + s->lun_end = dev->geo.nr_luns - 1; + } + + if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) { pr_err("nvm: lun out of bound (%u:%u > %u)\n", - s->lun_begin, s->lun_end, dev->geo.nr_luns); + s->lun_begin, s->lun_end, dev->geo.nr_luns - 1); return -EINVAL; } - return dev->mt->create_tgt(dev, create); + return nvm_create_tgt(dev, create); } static long nvm_ioctl_info(struct file *file, void __user *arg) @@ -923,16 +1175,14 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg) struct nvm_ioctl_device_info *info = &devices->info[i]; sprintf(info->devname, "%s", dev->name); - if (dev->mt) { - info->bmversion[0] = dev->mt->version[0]; - info->bmversion[1] = dev->mt->version[1]; - info->bmversion[2] = dev->mt->version[2]; - 
sprintf(info->bmname, "%s", dev->mt->name); - } else { - sprintf(info->bmname, "none"); - } + /* kept for compatibility */ + info->bmversion[0] = 1; + info->bmversion[1] = 0; + info->bmversion[2] = 0; + sprintf(info->bmname, "%s", "gennvm"); i++; + if (i > 31) { pr_err("nvm: max 31 devices can be reported.\n"); break; @@ -994,7 +1244,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) } list_for_each_entry(dev, &nvm_devices, devices) { - ret = dev->mt->remove_tgt(dev, &remove); + ret = nvm_remove_tgt(dev, &remove); if (!ret) break; } @@ -1002,47 +1252,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) return ret; } -static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info) -{ - info->seqnr = 1; - info->erase_cnt = 0; - info->version = 1; -} - -static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init) -{ - struct nvm_dev *dev; - struct nvm_sb_info info; - int ret; - - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(init->dev); - up_write(&nvm_lock); - if (!dev) { - pr_err("nvm: device not found\n"); - return -EINVAL; - } - - nvm_setup_nvm_sb_info(&info); - - strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN); - info.fs_ppa.ppa = -1; - - if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { - ret = nvm_init_sysblock(dev, &info); - if (ret) - return ret; - } - - memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info)); - - down_write(&nvm_lock); - dev->mt = nvm_init_mgr(dev); - up_write(&nvm_lock); - - return 0; -} - +/* kept for compatibility reasons */ static long nvm_ioctl_dev_init(struct file *file, void __user *arg) { struct nvm_ioctl_dev_init init; @@ -1058,15 +1268,13 @@ static long nvm_ioctl_dev_init(struct file *file, void __user *arg) return -EINVAL; } - init.dev[DISK_NAME_LEN - 1] = '\0'; - - return __nvm_ioctl_dev_init(&init); + return 0; } +/* Kept for compatibility reasons */ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) { struct nvm_ioctl_dev_factory fact; - struct nvm_dev *dev; if 
(!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1079,19 +1287,6 @@ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) return -EINVAL; - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(fact.dev); - up_write(&nvm_lock); - if (!dev) { - pr_err("nvm: device not found\n"); - return -EINVAL; - } - - nvm_free_mgr(dev); - - if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) - return nvm_dev_factory(dev, fact.flags); - return 0; } diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c deleted file mode 100644 index ca7880082d80..000000000000 --- a/drivers/lightnvm/gennvm.c +++ /dev/null @@ -1,657 +0,0 @@ -/* - * Copyright (C) 2015 Matias Bjorling <m@bjorling.me> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - * - * Implementation of a general nvm manager for Open-Channel SSDs. 
- */ - -#include "gennvm.h" - -static struct nvm_target *gen_find_target(struct gen_dev *gn, const char *name) -{ - struct nvm_target *tgt; - - list_for_each_entry(tgt, &gn->targets, list) - if (!strcmp(name, tgt->disk->disk_name)) - return tgt; - - return NULL; -} - -static const struct block_device_operations gen_fops = { - .owner = THIS_MODULE, -}; - -static int gen_reserve_luns(struct nvm_dev *dev, struct nvm_target *t, - int lun_begin, int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) { - if (test_and_set_bit(i, dev->lun_map)) { - pr_err("nvm: lun %d already allocated\n", i); - goto err; - } - } - - return 0; - -err: - while (--i > lun_begin) - clear_bit(i, dev->lun_map); - - return -EBUSY; -} - -static void gen_release_luns_err(struct nvm_dev *dev, int lun_begin, - int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) - WARN_ON(!test_and_clear_bit(i, dev->lun_map)); -} - -static void gen_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct gen_dev_map *dev_map = tgt_dev->map; - int i, j; - - for (i = 0; i < dev_map->nr_chnls; i++) { - struct gen_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs = ch_map->lun_offs; - int ch = i + ch_map->ch_off; - - for (j = 0; j < ch_map->nr_luns; j++) { - int lun = j + lun_offs[j]; - int lunid = (ch * dev->geo.luns_per_chnl) + lun; - - WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); - } - - kfree(ch_map->lun_offs); - } - - kfree(dev_map->chnls); - kfree(dev_map); - kfree(tgt_dev->luns); - kfree(tgt_dev); -} - -static struct nvm_tgt_dev *gen_create_tgt_dev(struct nvm_dev *dev, - int lun_begin, int lun_end) -{ - struct nvm_tgt_dev *tgt_dev = NULL; - struct gen_dev_map *dev_rmap = dev->rmap; - struct gen_dev_map *dev_map; - struct ppa_addr *luns; - int nr_luns = lun_end - lun_begin + 1; - int luns_left = nr_luns; - int nr_chnls = nr_luns / dev->geo.luns_per_chnl; - int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; - int bch = lun_begin / 
dev->geo.luns_per_chnl; - int blun = lun_begin % dev->geo.luns_per_chnl; - int lunid = 0; - int lun_balanced = 1; - int prev_nr_luns; - int i, j; - - nr_chnls = nr_luns / dev->geo.luns_per_chnl; - nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; - - dev_map = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL); - if (!dev_map) - goto err_dev; - - dev_map->chnls = kcalloc(nr_chnls, sizeof(struct gen_ch_map), - GFP_KERNEL); - if (!dev_map->chnls) - goto err_chnls; - - luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL); - if (!luns) - goto err_luns; - - prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? - dev->geo.luns_per_chnl : luns_left; - for (i = 0; i < nr_chnls; i++) { - struct gen_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; - int *lun_roffs = ch_rmap->lun_offs; - struct gen_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs; - int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? - dev->geo.luns_per_chnl : luns_left; - - if (lun_balanced && prev_nr_luns != luns_in_chnl) - lun_balanced = 0; - - ch_map->ch_off = ch_rmap->ch_off = bch; - ch_map->nr_luns = luns_in_chnl; - - lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_offs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) { - luns[lunid].ppa = 0; - luns[lunid].g.ch = i; - luns[lunid++].g.lun = j; - - lun_offs[j] = blun; - lun_roffs[j + blun] = blun; - } - - ch_map->lun_offs = lun_offs; - - /* when starting a new channel, lun offset is reset */ - blun = 0; - luns_left -= luns_in_chnl; - } - - dev_map->nr_chnls = nr_chnls; - - tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL); - if (!tgt_dev) - goto err_ch; - - memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); - /* Target device only owns a portion of the physical device */ - tgt_dev->geo.nr_chnls = nr_chnls; - tgt_dev->geo.nr_luns = nr_luns; - tgt_dev->geo.luns_per_chnl = (lun_balanced) ? 
prev_nr_luns : -1; - tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; - tgt_dev->q = dev->q; - tgt_dev->map = dev_map; - tgt_dev->luns = luns; - memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id)); - - tgt_dev->parent = dev; - - return tgt_dev; -err_ch: - while (--i > 0) - kfree(dev_map->chnls[i].lun_offs); - kfree(luns); -err_luns: - kfree(dev_map->chnls); -err_chnls: - kfree(dev_map); -err_dev: - return tgt_dev; -} - -static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) -{ - struct gen_dev *gn = dev->mp; - struct nvm_ioctl_create_simple *s = &create->conf.s; - struct request_queue *tqueue; - struct gendisk *tdisk; - struct nvm_tgt_type *tt; - struct nvm_target *t; - struct nvm_tgt_dev *tgt_dev; - void *targetdata; - - tt = nvm_find_target_type(create->tgttype, 1); - if (!tt) { - pr_err("nvm: target type %s not found\n", create->tgttype); - return -EINVAL; - } - - mutex_lock(&gn->lock); - t = gen_find_target(gn, create->tgtname); - if (t) { - pr_err("nvm: target name already exists.\n"); - mutex_unlock(&gn->lock); - return -EINVAL; - } - mutex_unlock(&gn->lock); - - t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); - if (!t) - return -ENOMEM; - - if (gen_reserve_luns(dev, t, s->lun_begin, s->lun_end)) - goto err_t; - - tgt_dev = gen_create_tgt_dev(dev, s->lun_begin, s->lun_end); - if (!tgt_dev) { - pr_err("nvm: could not create target device\n"); - goto err_reserve; - } - - tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); - if (!tqueue) - goto err_dev; - blk_queue_make_request(tqueue, tt->make_rq); - - tdisk = alloc_disk(0); - if (!tdisk) - goto err_queue; - - sprintf(tdisk->disk_name, "%s", create->tgtname); - tdisk->flags = GENHD_FL_EXT_DEVT; - tdisk->major = 0; - tdisk->first_minor = 0; - tdisk->fops = &gen_fops; - tdisk->queue = tqueue; - - targetdata = tt->init(tgt_dev, tdisk); - if (IS_ERR(targetdata)) - goto err_init; - - tdisk->private_data = targetdata; - tqueue->queuedata = targetdata; - - 
blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect); - - set_capacity(tdisk, tt->capacity(targetdata)); - add_disk(tdisk); - - t->type = tt; - t->disk = tdisk; - t->dev = tgt_dev; - - mutex_lock(&gn->lock); - list_add_tail(&t->list, &gn->targets); - mutex_unlock(&gn->lock); - - return 0; -err_init: - put_disk(tdisk); -err_queue: - blk_cleanup_queue(tqueue); -err_dev: - kfree(tgt_dev); -err_reserve: - gen_release_luns_err(dev, s->lun_begin, s->lun_end); -err_t: - kfree(t); - return -ENOMEM; -} - -static void __gen_remove_target(struct nvm_target *t) -{ - struct nvm_tgt_type *tt = t->type; - struct gendisk *tdisk = t->disk; - struct request_queue *q = tdisk->queue; - - del_gendisk(tdisk); - blk_cleanup_queue(q); - - if (tt->exit) - tt->exit(tdisk->private_data); - - gen_remove_tgt_dev(t->dev); - put_disk(tdisk); - - list_del(&t->list); - kfree(t); -} - -/** - * gen_remove_tgt - Removes a target from the media manager - * @dev: device - * @remove: ioctl structure with target name to remove. 
- * - * Returns: - * 0: on success - * 1: on not found - * <0: on error - */ -static int gen_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) -{ - struct gen_dev *gn = dev->mp; - struct nvm_target *t; - - if (!gn) - return 1; - - mutex_lock(&gn->lock); - t = gen_find_target(gn, remove->tgtname); - if (!t) { - mutex_unlock(&gn->lock); - return 1; - } - __gen_remove_target(t); - mutex_unlock(&gn->lock); - - return 0; -} - -static int gen_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) -{ - struct nvm_geo *geo = &dev->geo; - struct gen_dev *gn = dev->mp; - struct gen_area *area, *prev, *next; - sector_t begin = 0; - sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9; - - if (len > max_sectors) - return -EINVAL; - - area = kmalloc(sizeof(struct gen_area), GFP_KERNEL); - if (!area) - return -ENOMEM; - - prev = NULL; - - spin_lock(&dev->lock); - list_for_each_entry(next, &gn->area_list, list) { - if (begin + len > next->begin) { - begin = next->end; - prev = next; - continue; - } - break; - } - - if ((begin + len) > max_sectors) { - spin_unlock(&dev->lock); - kfree(area); - return -EINVAL; - } - - area->begin = *lba = begin; - area->end = begin + len; - - if (prev) /* insert into sorted order */ - list_add(&area->list, &prev->list); - else - list_add(&area->list, &gn->area_list); - spin_unlock(&dev->lock); - - return 0; -} - -static void gen_put_area(struct nvm_dev *dev, sector_t begin) -{ - struct gen_dev *gn = dev->mp; - struct gen_area *area; - - spin_lock(&dev->lock); - list_for_each_entry(area, &gn->area_list, list) { - if (area->begin != begin) - continue; - - list_del(&area->list); - spin_unlock(&dev->lock); - kfree(area); - return; - } - spin_unlock(&dev->lock); -} - -static void gen_free(struct nvm_dev *dev) -{ - kfree(dev->mp); - kfree(dev->rmap); - dev->mp = NULL; -} - -static int gen_register(struct nvm_dev *dev) -{ - struct gen_dev *gn; - struct gen_dev_map *dev_rmap; - int i, j; - - if (!try_module_get(THIS_MODULE)) - 
return -ENODEV; - - gn = kzalloc(sizeof(struct gen_dev), GFP_KERNEL); - if (!gn) - goto err_gn; - - dev_rmap = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL); - if (!dev_rmap) - goto err_rmap; - - dev_rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct gen_ch_map), - GFP_KERNEL); - if (!dev_rmap->chnls) - goto err_chnls; - - for (i = 0; i < dev->geo.nr_chnls; i++) { - struct gen_ch_map *ch_rmap; - int *lun_roffs; - int luns_in_chnl = dev->geo.luns_per_chnl; - - ch_rmap = &dev_rmap->chnls[i]; - - ch_rmap->ch_off = -1; - ch_rmap->nr_luns = luns_in_chnl; - - lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_roffs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) - lun_roffs[j] = -1; - - ch_rmap->lun_offs = lun_roffs; - } - - gn->dev = dev; - gn->nr_luns = dev->geo.nr_luns; - INIT_LIST_HEAD(&gn->area_list); - mutex_init(&gn->lock); - INIT_LIST_HEAD(&gn->targets); - dev->mp = gn; - dev->rmap = dev_rmap; - - return 1; -err_ch: - while (--i >= 0) - kfree(dev_rmap->chnls[i].lun_offs); -err_chnls: - kfree(dev_rmap); -err_rmap: - gen_free(dev); -err_gn: - module_put(THIS_MODULE); - return -ENOMEM; -} - -static void gen_unregister(struct nvm_dev *dev) -{ - struct gen_dev *gn = dev->mp; - struct nvm_target *t, *tmp; - - mutex_lock(&gn->lock); - list_for_each_entry_safe(t, tmp, &gn->targets, list) { - if (t->dev->parent != dev) - continue; - __gen_remove_target(t); - } - mutex_unlock(&gn->lock); - - gen_free(dev); - module_put(THIS_MODULE); -} - -static int gen_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct gen_dev_map *dev_map = tgt_dev->map; - struct gen_ch_map *ch_map = &dev_map->chnls[p->g.ch]; - int lun_off = ch_map->lun_offs[p->g.lun]; - struct nvm_dev *dev = tgt_dev->parent; - struct gen_dev_map *dev_rmap = dev->rmap; - struct gen_ch_map *ch_rmap; - int lun_roff; - - p->g.ch += ch_map->ch_off; - p->g.lun += lun_off; - - ch_rmap = &dev_rmap->chnls[p->g.ch]; - lun_roff = ch_rmap->lun_offs[p->g.lun]; - - if 
(unlikely(ch_rmap->ch_off < 0 || lun_roff < 0)) { - pr_err("nvm: corrupted device partition table\n"); - return -EINVAL; - } - - return 0; -} - -static int gen_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct gen_dev_map *dev_rmap = dev->rmap; - struct gen_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch]; - int lun_roff = ch_rmap->lun_offs[p->g.lun]; - - p->g.ch -= ch_rmap->ch_off; - p->g.lun -= lun_roff; - - return 0; -} - -static int gen_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - int flag) -{ - gen_trans_fn *f; - int i; - int ret = 0; - - f = (flag == TRANS_TGT_TO_DEV) ? gen_map_to_dev : gen_map_to_tgt; - - if (rqd->nr_ppas == 1) - return f(tgt_dev, &rqd->ppa_addr); - - for (i = 0; i < rqd->nr_ppas; i++) { - ret = f(tgt_dev, &rqd->ppa_list[i]); - if (ret) - goto out; - } - -out: - return ret; -} - -static void gen_end_io(struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *tgt_dev = rqd->dev; - struct nvm_tgt_instance *ins = rqd->ins; - - /* Convert address space */ - if (tgt_dev) - gen_trans_rq(tgt_dev, rqd, TRANS_DEV_TO_TGT); - - ins->tt->end_io(rqd); -} - -static int gen_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct nvm_dev *dev = tgt_dev->parent; - - if (!dev->ops->submit_io) - return -ENODEV; - - /* Convert address space */ - gen_trans_rq(tgt_dev, rqd, TRANS_TGT_TO_DEV); - nvm_generic_to_addr_mode(dev, rqd); - - rqd->dev = tgt_dev; - rqd->end_io = gen_end_io; - return dev->ops->submit_io(dev, rqd); -} - -static int gen_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, - int flags) -{ - /* Convert address space */ - gen_map_to_dev(tgt_dev, p); - - return nvm_erase_ppa(tgt_dev->parent, p, 1, flags); -} - -static struct ppa_addr gen_trans_ppa(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr p, int direction) -{ - gen_trans_fn *f; - struct ppa_addr ppa = p; - - f = (direction == TRANS_TGT_TO_DEV) ? 
gen_map_to_dev : gen_map_to_tgt; - f(tgt_dev, &ppa); - - return ppa; -} - -static void gen_part_to_tgt(struct nvm_dev *dev, sector_t *entries, - int len) -{ - struct nvm_geo *geo = &dev->geo; - struct gen_dev_map *dev_rmap = dev->rmap; - u64 i; - - for (i = 0; i < len; i++) { - struct gen_ch_map *ch_rmap; - int *lun_roffs; - struct ppa_addr gaddr; - u64 pba = le64_to_cpu(entries[i]); - int off; - u64 diff; - - if (!pba) - continue; - - gaddr = linear_to_generic_addr(geo, pba); - ch_rmap = &dev_rmap->chnls[gaddr.g.ch]; - lun_roffs = ch_rmap->lun_offs; - - off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun; - - diff = ((ch_rmap->ch_off * geo->luns_per_chnl) + - (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun; - - entries[i] -= cpu_to_le64(diff); - } -} - -static struct nvmm_type gen = { - .name = "gennvm", - .version = {0, 1, 0}, - - .register_mgr = gen_register, - .unregister_mgr = gen_unregister, - - .create_tgt = gen_create_tgt, - .remove_tgt = gen_remove_tgt, - - .submit_io = gen_submit_io, - .erase_blk = gen_erase_blk, - - .get_area = gen_get_area, - .put_area = gen_put_area, - - .trans_ppa = gen_trans_ppa, - .part_to_tgt = gen_part_to_tgt, -}; - -static int __init gen_module_init(void) -{ - return nvm_register_mgr(&gen); -} - -static void gen_module_exit(void) -{ - nvm_unregister_mgr(&gen); -} - -module_init(gen_module_init); -module_exit(gen_module_exit); -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("General media manager for Open-Channel SSDs"); diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h deleted file mode 100644 index 6a4b3f368848..000000000000 --- a/drivers/lightnvm/gennvm.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright: Matias Bjorling <mb@bjorling.me> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - */ - -#ifndef GENNVM_H_ -#define GENNVM_H_ - -#include <linux/module.h> -#include <linux/vmalloc.h> - -#include <linux/lightnvm.h> - -struct gen_dev { - struct nvm_dev *dev; - - int nr_luns; - struct list_head area_list; - - struct mutex lock; - struct list_head targets; -}; - -/* Map between virtual and physical channel and lun */ -struct gen_ch_map { - int ch_off; - int nr_luns; - int *lun_offs; -}; - -struct gen_dev_map { - struct gen_ch_map *chnls; - int nr_chnls; -}; - -struct gen_area { - struct list_head list; - sector_t begin; - sector_t end; /* end is excluded */ -}; - -static inline void *ch_map_to_lun_offs(struct gen_ch_map *ch_map) -{ - return ch_map + 1; -} - -typedef int (gen_trans_fn)(struct nvm_tgt_dev *, struct ppa_addr *); - -#define gen_for_each_lun(bm, lun, i) \ - for ((i) = 0, lun = &(bm)->luns[0]; \ - (i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)]) - -#endif /* GENNVM_H_ */ diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 9fb7de395915..e00b1d7b976f 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -779,7 +779,7 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd, static void rrpc_end_io(struct nvm_rq *rqd) { - struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance); + struct rrpc *rrpc = rqd->private; struct nvm_tgt_dev *dev = rrpc->dev; struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd); uint8_t npages = rqd->nr_ppas; @@ -972,8 +972,9 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, bio_get(bio); rqd->bio = bio; - rqd->ins = &rrpc->instance; + rqd->private = rrpc; rqd->nr_ppas = nr_pages; + rqd->end_io = rrpc_end_io; rrq->flags = flags; err = nvm_submit_io(dev, rqd); @@ -1532,7 +1533,6 @@ static void 
*rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk) if (!rrpc) return ERR_PTR(-ENOMEM); - rrpc->instance.tt = &tt_rrpc; rrpc->dev = dev; rrpc->disk = tdisk; @@ -1611,7 +1611,6 @@ static struct nvm_tgt_type tt_rrpc = { .make_rq = rrpc_make_rq, .capacity = rrpc_capacity, - .end_io = rrpc_end_io, .init = rrpc_init, .exit = rrpc_exit, diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index 94e4d73116b2..fdb6ff902903 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -102,9 +102,6 @@ struct rrpc_lun { }; struct rrpc { - /* instance must be kept in top to resolve rrpc in unprep */ - struct nvm_tgt_instance instance; - struct nvm_tgt_dev *dev; struct gendisk *disk; diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c deleted file mode 100644 index 12002bf4efc2..000000000000 --- a/drivers/lightnvm/sysblk.c +++ /dev/null @@ -1,733 +0,0 @@ -/* - * Copyright (C) 2015 Matias Bjorling. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - * - */ - -#include <linux/lightnvm.h> - -#define MAX_SYSBLKS 3 /* remember to update mapping scheme on change */ -#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases - * enables ~1.5M updates per sysblk unit - */ - -struct sysblk_scan { - /* A row is a collection of flash blocks for a system block. 
*/ - int nr_rows; - int row; - int act_blk[MAX_SYSBLKS]; - - int nr_ppas; - struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */ -}; - -static inline int scan_ppa_idx(int row, int blkid) -{ - return (row * MAX_BLKS_PR_SYSBLK) + blkid; -} - -static void nvm_sysblk_to_cpu(struct nvm_sb_info *info, - struct nvm_system_block *sb) -{ - info->seqnr = be32_to_cpu(sb->seqnr); - info->erase_cnt = be32_to_cpu(sb->erase_cnt); - info->version = be16_to_cpu(sb->version); - strncpy(info->mmtype, sb->mmtype, NVM_MMTYPE_LEN); - info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa); -} - -static void nvm_cpu_to_sysblk(struct nvm_system_block *sb, - struct nvm_sb_info *info) -{ - sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC); - sb->seqnr = cpu_to_be32(info->seqnr); - sb->erase_cnt = cpu_to_be32(info->erase_cnt); - sb->version = cpu_to_be16(info->version); - strncpy(sb->mmtype, info->mmtype, NVM_MMTYPE_LEN); - sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa); -} - -static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas) -{ - struct nvm_geo *geo = &dev->geo; - int nr_rows = min_t(int, MAX_SYSBLKS, geo->nr_chnls); - int i; - - for (i = 0; i < nr_rows; i++) - sysblk_ppas[i].ppa = 0; - - /* if possible, place sysblk at first channel, middle channel and last - * channel of the device. 
If not, create only one or two sys blocks - */ - switch (geo->nr_chnls) { - case 2: - sysblk_ppas[1].g.ch = 1; - /* fall-through */ - case 1: - sysblk_ppas[0].g.ch = 0; - break; - default: - sysblk_ppas[0].g.ch = 0; - sysblk_ppas[1].g.ch = geo->nr_chnls / 2; - sysblk_ppas[2].g.ch = geo->nr_chnls - 1; - break; - } - - return nr_rows; -} - -static void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, - struct ppa_addr *sysblk_ppas) -{ - memset(s, 0, sizeof(struct sysblk_scan)); - s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas); -} - -static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, - struct sysblk_scan *s) -{ - struct ppa_addr *sppa; - int i, blkid = 0; - - nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); - if (nr_blks < 0) - return nr_blks; - - for (i = 0; i < nr_blks; i++) { - if (blks[i] == NVM_BLK_T_HOST) - return -EEXIST; - - if (blks[i] != NVM_BLK_T_FREE) - continue; - - sppa = &s->ppas[scan_ppa_idx(s->row, blkid)]; - sppa->g.ch = ppa.g.ch; - sppa->g.lun = ppa.g.lun; - sppa->g.blk = i; - s->nr_ppas++; - blkid++; - - pr_debug("nvm: use (%u %u %u) as sysblk\n", - sppa->g.ch, sppa->g.lun, sppa->g.blk); - if (blkid > MAX_BLKS_PR_SYSBLK - 1) - return 0; - } - - pr_err("nvm: sysblk failed get sysblk\n"); - return -EINVAL; -} - -static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, - struct sysblk_scan *s) -{ - int i, nr_sysblk = 0; - - nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); - if (nr_blks < 0) - return nr_blks; - - for (i = 0; i < nr_blks; i++) { - if (blks[i] != NVM_BLK_T_HOST) - continue; - - if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) { - pr_err("nvm: too many host blks\n"); - return -EINVAL; - } - - ppa.g.blk = i; - - s->ppas[scan_ppa_idx(s->row, nr_sysblk)] = ppa; - s->nr_ppas++; - nr_sysblk++; - } - - return 0; -} - -static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s, - struct ppa_addr *ppas, int get_free) -{ - 
struct nvm_geo *geo = &dev->geo; - int i, nr_blks, ret = 0; - u8 *blks; - - s->nr_ppas = 0; - nr_blks = geo->blks_per_lun * geo->plane_mode; - - blks = kmalloc(nr_blks, GFP_KERNEL); - if (!blks) - return -ENOMEM; - - for (i = 0; i < s->nr_rows; i++) { - s->row = i; - - ret = nvm_get_bb_tbl(dev, ppas[i], blks); - if (ret) { - pr_err("nvm: failed bb tbl for ppa (%u %u)\n", - ppas[i].g.ch, - ppas[i].g.blk); - goto err_get; - } - - if (get_free) - ret = sysblk_get_free_blks(dev, ppas[i], blks, nr_blks, - s); - else - ret = sysblk_get_host_blks(dev, ppas[i], blks, nr_blks, - s); - - if (ret) - goto err_get; - } - -err_get: - kfree(blks); - return ret; -} - -/* - * scans a block for latest sysblk. - * Returns: - * 0 - newer sysblk not found. PPA is updated to latest page. - * 1 - newer sysblk found and stored in *cur. PPA is updated to - * next valid page. - * <0- error. - */ -static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa, - struct nvm_system_block *sblk) -{ - struct nvm_geo *geo = &dev->geo; - struct nvm_system_block *cur; - int pg, ret, found = 0; - - /* the full buffer for a flash page is allocated. 
Only the first of it - * contains the system block information - */ - cur = kmalloc(geo->pfpg_size, GFP_KERNEL); - if (!cur) - return -ENOMEM; - - /* perform linear scan through the block */ - for (pg = 0; pg < dev->lps_per_blk; pg++) { - ppa->g.pg = ppa_to_slc(dev, pg); - - ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE, - cur, geo->pfpg_size); - if (ret) { - if (ret == NVM_RSP_ERR_EMPTYPAGE) { - pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n", - ppa->g.ch, - ppa->g.lun, - ppa->g.blk, - ppa->g.pg); - break; - } - pr_err("nvm: read failed (%x) for ppa (%u %u %u %u)", - ret, - ppa->g.ch, - ppa->g.lun, - ppa->g.blk, - ppa->g.pg); - break; /* if we can't read a page, continue to the - * next blk - */ - } - - if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) { - pr_debug("nvm: scan break for ppa (%u %u %u %u)\n", - ppa->g.ch, - ppa->g.lun, - ppa->g.blk, - ppa->g.pg); - break; /* last valid page already found */ - } - - if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr)) - continue; - - memcpy(sblk, cur, sizeof(struct nvm_system_block)); - found = 1; - } - - kfree(cur); - - return found; -} - -static int nvm_sysblk_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, - int type) -{ - return nvm_set_bb_tbl(dev, s->ppas, s->nr_ppas, type); -} - -static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, - struct sysblk_scan *s) -{ - struct nvm_geo *geo = &dev->geo; - struct nvm_system_block nvmsb; - void *buf; - int i, sect, ret = 0; - struct ppa_addr *ppas; - - nvm_cpu_to_sysblk(&nvmsb, info); - - buf = kzalloc(geo->pfpg_size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - memcpy(buf, &nvmsb, sizeof(struct nvm_system_block)); - - ppas = kcalloc(geo->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL); - if (!ppas) { - ret = -ENOMEM; - goto err; - } - - /* Write and verify */ - for (i = 0; i < s->nr_rows; i++) { - ppas[0] = s->ppas[scan_ppa_idx(i, s->act_blk[i])]; - - pr_debug("nvm: writing sysblk to ppa (%u %u %u %u)\n", - 
ppas[0].g.ch, - ppas[0].g.lun, - ppas[0].g.blk, - ppas[0].g.pg); - - /* Expand to all sectors within a flash page */ - if (geo->sec_per_pg > 1) { - for (sect = 1; sect < geo->sec_per_pg; sect++) { - ppas[sect].ppa = ppas[0].ppa; - ppas[sect].g.sec = sect; - } - } - - ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PWRITE, - NVM_IO_SLC_MODE, buf, geo->pfpg_size); - if (ret) { - pr_err("nvm: sysblk failed program (%u %u %u)\n", - ppas[0].g.ch, - ppas[0].g.lun, - ppas[0].g.blk); - break; - } - - ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PREAD, - NVM_IO_SLC_MODE, buf, geo->pfpg_size); - if (ret) { - pr_err("nvm: sysblk failed read (%u %u %u)\n", - ppas[0].g.ch, - ppas[0].g.lun, - ppas[0].g.blk); - break; - } - - if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) { - pr_err("nvm: sysblk failed verify (%u %u %u)\n", - ppas[0].g.ch, - ppas[0].g.lun, - ppas[0].g.blk); - ret = -EINVAL; - break; - } - } - - kfree(ppas); -err: - kfree(buf); - - return ret; -} - -static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s) -{ - int i, ret; - unsigned long nxt_blk; - struct ppa_addr *ppa; - - for (i = 0; i < s->nr_rows; i++) { - nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK; - ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)]; - ppa->g.pg = ppa_to_slc(dev, 0); - - ret = nvm_erase_ppa(dev, ppa, 1, 0); - if (ret) - return ret; - - s->act_blk[i] = nxt_blk; - } - - return 0; -} - -int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) -{ - struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; - struct sysblk_scan s; - struct nvm_system_block *cur; - int i, j, found = 0; - int ret = -ENOMEM; - - /* - * 1. setup sysblk locations - * 2. get bad block list - * 3. filter on host-specific (type 3) - * 4. iterate through all and find the highest seq nr. - * 5. 
return superblock information - */ - - if (!dev->ops->get_bb_tbl) - return -EINVAL; - - nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); - - mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0); - if (ret) - goto err_sysblk; - - /* no sysblocks initialized */ - if (!s.nr_ppas) - goto err_sysblk; - - cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); - if (!cur) - goto err_sysblk; - - /* find the latest block across all sysblocks */ - for (i = 0; i < s.nr_rows; i++) { - for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { - struct ppa_addr ppa = s.ppas[scan_ppa_idx(i, j)]; - - ret = nvm_scan_block(dev, &ppa, cur); - if (ret > 0) - found = 1; - else if (ret < 0) - break; - } - } - - nvm_sysblk_to_cpu(info, cur); - - kfree(cur); -err_sysblk: - mutex_unlock(&dev->mlock); - - if (found) - return 1; - return ret; -} - -int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new) -{ - /* 1. for each latest superblock - * 2. if room - * a. write new flash page entry with the updated information - * 3. if no room - * a. find next available block on lun (linear search) - * if none, continue to next lun - * if none at all, report error. also report that it wasn't - * possible to write to all superblocks. - * c. write data to block. 
- */ - struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; - struct sysblk_scan s; - struct nvm_system_block *cur; - int i, j, ppaidx, found = 0; - int ret = -ENOMEM; - - if (!dev->ops->get_bb_tbl) - return -EINVAL; - - nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); - - mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0); - if (ret) - goto err_sysblk; - - cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); - if (!cur) - goto err_sysblk; - - /* Get the latest sysblk for each sysblk row */ - for (i = 0; i < s.nr_rows; i++) { - found = 0; - for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { - ppaidx = scan_ppa_idx(i, j); - ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur); - if (ret > 0) { - s.act_blk[i] = j; - found = 1; - } else if (ret < 0) - break; - } - } - - if (!found) { - pr_err("nvm: no valid sysblks found to update\n"); - ret = -EINVAL; - goto err_cur; - } - - /* - * All sysblocks found. Check that they have same page id in their flash - * blocks - */ - for (i = 1; i < s.nr_rows; i++) { - struct ppa_addr l = s.ppas[scan_ppa_idx(0, s.act_blk[0])]; - struct ppa_addr r = s.ppas[scan_ppa_idx(i, s.act_blk[i])]; - - if (l.g.pg != r.g.pg) { - pr_err("nvm: sysblks not on same page. Previous update failed.\n"); - ret = -EINVAL; - goto err_cur; - } - } - - /* - * Check that there haven't been another update to the seqnr since we - * began - */ - if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) { - pr_err("nvm: seq is not sequential\n"); - ret = -EINVAL; - goto err_cur; - } - - /* - * When all pages in a block has been written, a new block is selected - * and writing is performed on the new block. 
- */ - if (s.ppas[scan_ppa_idx(0, s.act_blk[0])].g.pg == - dev->lps_per_blk - 1) { - ret = nvm_prepare_new_sysblks(dev, &s); - if (ret) - goto err_cur; - } - - ret = nvm_write_and_verify(dev, new, &s); -err_cur: - kfree(cur); -err_sysblk: - mutex_unlock(&dev->mlock); - - return ret; -} - -int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; - struct sysblk_scan s; - int ret; - - /* - * 1. select master blocks and select first available blks - * 2. get bad block list - * 3. mark MAX_SYSBLKS block as host-based device allocated. - * 4. write and verify data to block - */ - - if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl) - return -EINVAL; - - if (!(geo->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) { - pr_err("nvm: memory does not support SLC access\n"); - return -EINVAL; - } - - /* Index all sysblocks and mark them as host-driven */ - nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); - - mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 1); - if (ret) - goto err_mark; - - ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_HOST); - if (ret) - goto err_mark; - - /* Write to the first block of each row */ - ret = nvm_write_and_verify(dev, info, &s); -err_mark: - mutex_unlock(&dev->mlock); - return ret; -} - -static int factory_nblks(int nblks) -{ - /* Round up to nearest BITS_PER_LONG */ - return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); -} - -static unsigned int factory_blk_offset(struct nvm_geo *geo, struct ppa_addr ppa) -{ - int nblks = factory_nblks(geo->blks_per_lun); - - return ((ppa.g.ch * geo->luns_per_chnl * nblks) + (ppa.g.lun * nblks)) / - BITS_PER_LONG; -} - -static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, - unsigned long *blk_bitmap, int flags) -{ - int i, lunoff; - - nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); - if (nr_blks < 0) - return nr_blks; - - lunoff = 
factory_blk_offset(&dev->geo, ppa); - - /* non-set bits correspond to the block must be erased */ - for (i = 0; i < nr_blks; i++) { - switch (blks[i]) { - case NVM_BLK_T_FREE: - if (flags & NVM_FACTORY_ERASE_ONLY_USER) - set_bit(i, &blk_bitmap[lunoff]); - break; - case NVM_BLK_T_HOST: - if (!(flags & NVM_FACTORY_RESET_HOST_BLKS)) - set_bit(i, &blk_bitmap[lunoff]); - break; - case NVM_BLK_T_GRWN_BAD: - if (!(flags & NVM_FACTORY_RESET_GRWN_BBLKS)) - set_bit(i, &blk_bitmap[lunoff]); - break; - default: - set_bit(i, &blk_bitmap[lunoff]); - break; - } - } - - return 0; -} - -static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, - int max_ppas, unsigned long *blk_bitmap) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - int ch, lun, blkid, idx, done = 0, ppa_cnt = 0; - unsigned long *offset; - - while (!done) { - done = 1; - nvm_for_each_lun_ppa(geo, ppa, ch, lun) { - idx = factory_blk_offset(geo, ppa); - offset = &blk_bitmap[idx]; - - blkid = find_first_zero_bit(offset, geo->blks_per_lun); - if (blkid >= geo->blks_per_lun) - continue; - set_bit(blkid, offset); - - ppa.g.blk = blkid; - pr_debug("nvm: erase ppa (%u %u %u)\n", - ppa.g.ch, - ppa.g.lun, - ppa.g.blk); - - erase_list[ppa_cnt] = ppa; - ppa_cnt++; - done = 0; - - if (ppa_cnt == max_ppas) - return ppa_cnt; - } - } - - return ppa_cnt; -} - -static int nvm_fact_select_blks(struct nvm_dev *dev, unsigned long *blk_bitmap, - int flags) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - int ch, lun, nr_blks, ret = 0; - u8 *blks; - - nr_blks = geo->blks_per_lun * geo->plane_mode; - blks = kmalloc(nr_blks, GFP_KERNEL); - if (!blks) - return -ENOMEM; - - nvm_for_each_lun_ppa(geo, ppa, ch, lun) { - ret = nvm_get_bb_tbl(dev, ppa, blks); - if (ret) - pr_err("nvm: failed bb tbl for ch%u lun%u\n", - ppa.g.ch, ppa.g.blk); - - ret = nvm_factory_blks(dev, ppa, blks, nr_blks, blk_bitmap, - flags); - if (ret) - break; - } - - kfree(blks); - return ret; -} - -int 
nvm_dev_factory(struct nvm_dev *dev, int flags) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppas; - int ppa_cnt, ret = -ENOMEM; - int max_ppas = dev->ops->max_phys_sect / geo->nr_planes; - struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; - struct sysblk_scan s; - unsigned long *blk_bitmap; - - blk_bitmap = kzalloc(factory_nblks(geo->blks_per_lun) * geo->nr_luns, - GFP_KERNEL); - if (!blk_bitmap) - return ret; - - ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL); - if (!ppas) - goto err_blks; - - /* create list of blks to be erased */ - ret = nvm_fact_select_blks(dev, blk_bitmap, flags); - if (ret) - goto err_ppas; - - /* continue to erase until list of blks until empty */ - while ((ppa_cnt = - nvm_fact_get_blks(dev, ppas, max_ppas, blk_bitmap)) > 0) - nvm_erase_ppa(dev, ppas, ppa_cnt, 0); - - /* mark host reserved blocks free */ - if (flags & NVM_FACTORY_RESET_HOST_BLKS) { - nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); - mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0); - if (!ret) - ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_FREE); - mutex_unlock(&dev->mlock); - } -err_ppas: - kfree(ppas); -err_blks: - kfree(blk_bitmap); - return ret; -} -EXPORT_SYMBOL(nvm_dev_factory); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 76d20875503c..01035e718c1c 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -666,7 +666,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.write_prio = 0; s->iop.error = 0; s->iop.flags = 0; - s->iop.flush_journal = (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0; + s->iop.flush_journal = op_is_flush(bio->bi_opf); s->iop.wq = bcache_wq; return s; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index e04c61e0839e..5b9cf56de8ef 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -787,8 +787,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) struct 
per_bio_data *pb = get_per_bio_data(bio, pb_data_size); spin_lock_irqsave(&cache->lock, flags); - if (cache->need_tick_bio && - !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) && + if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { pb->tick = true; cache->need_tick_bio = false; @@ -828,11 +827,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) return to_oblock(block_nr); } -static int bio_triggers_commit(struct cache *cache, struct bio *bio) -{ - return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); -} - /* * You must increment the deferred set whilst the prison cell is held. To * encourage this, we ask for 'cell' to be passed in. @@ -884,7 +878,7 @@ static void issue(struct cache *cache, struct bio *bio) { unsigned long flags; - if (!bio_triggers_commit(cache, bio)) { + if (!op_is_flush(bio->bi_opf)) { accounted_request(cache, bio); return; } @@ -1069,8 +1063,7 @@ static void dec_io_migrations(struct cache *cache) static bool discard_or_flush(struct bio *bio) { - return bio_op(bio) == REQ_OP_DISCARD || - bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); } static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index d1c05c12a9db..110982db4b48 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio) static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) { - return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && + return op_is_flush(bio->bi_opf) && dm_thin_changed_this_transaction(tc->td); } @@ -870,8 +870,7 @@ static void __inc_remap_and_issue_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD) + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) 
bio_list_add(&info->defer_bios, bio); else { inc_all_io_entry(info->tc->pool, bio); @@ -1716,9 +1715,8 @@ static void __remap_and_issue_shared_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if ((bio_data_dir(bio) == WRITE) || - (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD)) + if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) || + bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; @@ -2635,8 +2633,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD) { + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) { thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8a3c3e32a704..138c6fa00cd5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -784,6 +784,13 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_sg_io(ns, (void __user *)arg); #endif default: +#ifdef CONFIG_NVM + if (ns->ndev) + return nvme_nvm_ioctl(ns, cmd, arg); +#endif + if (is_sed_ioctl(cmd)) + return sed_ioctl(ns->ctrl->opal_dev, cmd, + (void __user *) arg); return -ENOTTY; } } @@ -1051,6 +1058,28 @@ static const struct pr_ops nvme_pr_ops = { .pr_clear = nvme_pr_clear, }; +#ifdef CONFIG_BLK_SED_OPAL +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send) +{ + struct nvme_ctrl *ctrl = data; + struct nvme_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + if (send) + cmd.common.opcode = nvme_admin_security_send; + else + cmd.common.opcode = nvme_admin_security_recv; + cmd.common.nsid = 0; + cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); + cmd.common.cdw10[1] = cpu_to_le32(len); + + return 
__nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, + ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0); +} +EXPORT_SYMBOL_GPL(nvme_sec_submit); +#endif /* CONFIG_BLK_SED_OPAL */ + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, @@ -1230,6 +1259,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + ctrl->oacs = le16_to_cpu(id->oacs); ctrl->vid = le16_to_cpu(id->vid); ctrl->oncs = le16_to_cpup(&id->oncs); atomic_set(&ctrl->abort_limit, id->acl + 1); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 588d4a34c083..21cac8523bd8 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -26,6 +26,8 @@ #include <linux/bitops.h> #include <linux/lightnvm.h> #include <linux/vmalloc.h> +#include <linux/sched/sysctl.h> +#include <uapi/linux/lightnvm.h> enum nvme_nvm_admin_opcode { nvme_nvm_admin_identity = 0xe2, @@ -248,50 +250,48 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) { struct nvme_nvm_id_group *src; struct nvm_id_group *dst; - int i, end; - - end = min_t(u32, 4, nvm_id->cgrps); - - for (i = 0; i < end; i++) { - src = &nvme_nvm_id->groups[i]; - dst = &nvm_id->groups[i]; - - dst->mtype = src->mtype; - dst->fmtype = src->fmtype; - dst->num_ch = src->num_ch; - dst->num_lun = src->num_lun; - dst->num_pln = src->num_pln; - - dst->num_pg = le16_to_cpu(src->num_pg); - dst->num_blk = le16_to_cpu(src->num_blk); - dst->fpg_sz = le16_to_cpu(src->fpg_sz); - dst->csecs = le16_to_cpu(src->csecs); - dst->sos = le16_to_cpu(src->sos); - - dst->trdt = le32_to_cpu(src->trdt); - dst->trdm = le32_to_cpu(src->trdm); - dst->tprt = le32_to_cpu(src->tprt); - dst->tprm = le32_to_cpu(src->tprm); - dst->tbet = le32_to_cpu(src->tbet); - dst->tbem = le32_to_cpu(src->tbem); - dst->mpos = le32_to_cpu(src->mpos); - dst->mccap = le32_to_cpu(src->mccap); - - dst->cpar = le16_to_cpu(src->cpar); - - if (dst->fmtype == NVM_ID_FMTYPE_MLC) { - 
memcpy(dst->lptbl.id, src->lptbl.id, 8); - dst->lptbl.mlc.num_pairs = - le16_to_cpu(src->lptbl.mlc.num_pairs); - - if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { - pr_err("nvm: number of MLC pairs not supported\n"); - return -EINVAL; - } - memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, - dst->lptbl.mlc.num_pairs); + if (nvme_nvm_id->cgrps != 1) + return -EINVAL; + + src = &nvme_nvm_id->groups[0]; + dst = &nvm_id->grp; + + dst->mtype = src->mtype; + dst->fmtype = src->fmtype; + dst->num_ch = src->num_ch; + dst->num_lun = src->num_lun; + dst->num_pln = src->num_pln; + + dst->num_pg = le16_to_cpu(src->num_pg); + dst->num_blk = le16_to_cpu(src->num_blk); + dst->fpg_sz = le16_to_cpu(src->fpg_sz); + dst->csecs = le16_to_cpu(src->csecs); + dst->sos = le16_to_cpu(src->sos); + + dst->trdt = le32_to_cpu(src->trdt); + dst->trdm = le32_to_cpu(src->trdm); + dst->tprt = le32_to_cpu(src->tprt); + dst->tprm = le32_to_cpu(src->tprm); + dst->tbet = le32_to_cpu(src->tbet); + dst->tbem = le32_to_cpu(src->tbem); + dst->mpos = le32_to_cpu(src->mpos); + dst->mccap = le32_to_cpu(src->mccap); + + dst->cpar = le16_to_cpu(src->cpar); + + if (dst->fmtype == NVM_ID_FMTYPE_MLC) { + memcpy(dst->lptbl.id, src->lptbl.id, 8); + dst->lptbl.mlc.num_pairs = + le16_to_cpu(src->lptbl.mlc.num_pairs); + + if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { + pr_err("nvm: number of MLC pairs not supported\n"); + return -EINVAL; } + + memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, + dst->lptbl.mlc.num_pairs); } return 0; @@ -321,7 +321,6 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) nvm_id->ver_id = nvme_nvm_id->ver_id; nvm_id->vmnt = nvme_nvm_id->vmnt; - nvm_id->cgrps = nvme_nvm_id->cgrps; nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap); nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom); memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf, @@ -372,7 +371,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, } /* Transform physical address to target 
address space */ - nvmdev->mt->part_to_tgt(nvmdev, entries, cmd_nlb); + nvm_part_to_tgt(nvmdev, entries, cmd_nlb); if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) { ret = -EINTR; @@ -485,7 +484,8 @@ static void nvme_nvm_end_io(struct request *rq, int error) struct nvm_rq *rqd = rq->end_io_data; rqd->ppa_status = nvme_req(rq)->result.u64; - nvm_end_io(rqd, error); + rqd->error = error; + nvm_end_io(rqd); kfree(nvme_req(rq)->cmd); blk_mq_free_request(rq); @@ -586,6 +586,224 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; +static void nvme_nvm_end_user_vio(struct request *rq, int error) +{ + struct completion *waiting = rq->end_io_data; + + complete(waiting); +} + +static int nvme_nvm_submit_user_cmd(struct request_queue *q, + struct nvme_ns *ns, + struct nvme_nvm_command *vcmd, + void __user *ubuf, unsigned int bufflen, + void __user *meta_buf, unsigned int meta_len, + void __user *ppa_buf, unsigned int ppa_len, + u32 *result, u64 *status, unsigned int timeout) +{ + bool write = nvme_is_write((struct nvme_command *)vcmd); + struct nvm_dev *dev = ns->ndev; + struct gendisk *disk = ns->disk; + struct request *rq; + struct bio *bio = NULL; + __le64 *ppa_list = NULL; + dma_addr_t ppa_dma; + __le64 *metadata = NULL; + dma_addr_t metadata_dma; + DECLARE_COMPLETION_ONSTACK(wait); + int ret; + + rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0, + NVME_QID_ANY); + if (IS_ERR(rq)) { + ret = -ENOMEM; + goto err_cmd; + } + + rq->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + + rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; + rq->end_io_data = &wait; + + if (ppa_buf && ppa_len) { + ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); + if (!ppa_list) { + ret = -ENOMEM; + goto err_rq; + } + if (copy_from_user(ppa_list, (void __user *)ppa_buf, + sizeof(u64) * (ppa_len + 1))) { + ret = -EFAULT; + goto err_ppa; + } + vcmd->ph_rw.spba = cpu_to_le64(ppa_dma); + } else { + vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf); + } + + if (ubuf && bufflen) { + ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL); + if (ret) + goto err_ppa; + bio = rq->bio; + + if (meta_buf && meta_len) { + metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, + &metadata_dma); + if (!metadata) { + ret = -ENOMEM; + goto err_map; + } + + if (write) { + if (copy_from_user(metadata, + (void __user *)meta_buf, + meta_len)) { + ret = -EFAULT; + goto err_meta; + } + } + vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); + } + + if (!disk) + goto submit; + + bio->bi_bdev = bdget_disk(disk, 0); + if (!bio->bi_bdev) { + ret = -ENODEV; + goto err_meta; + } + } + +submit: + blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_user_vio); + + wait_for_completion_io(&wait); + + ret = nvme_error_status(rq->errors); + if (result) + *result = rq->errors & 0x7ff; + if (status) + *status = le64_to_cpu(nvme_req(rq)->result.u64); + + if (metadata && !ret && !write) { + if (copy_to_user(meta_buf, (void *)metadata, meta_len)) + ret = -EFAULT; + } +err_meta: + if (meta_buf && meta_len) + dma_pool_free(dev->dma_pool, metadata, metadata_dma); +err_map: + if (bio) { + if (disk && bio->bi_bdev) + bdput(bio->bi_bdev); + blk_rq_unmap_user(bio); + } +err_ppa: + if (ppa_buf && ppa_len) + dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); +err_rq: + blk_mq_free_request(rq); +err_cmd: + return ret; +} + +static int nvme_nvm_submit_vio(struct nvme_ns *ns, + struct nvm_user_vio __user *uvio) +{ + struct nvm_user_vio vio; + struct nvme_nvm_command c; + unsigned int 
length; + int ret; + + if (copy_from_user(&vio, uvio, sizeof(vio))) + return -EFAULT; + if (vio.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.ph_rw.opcode = vio.opcode; + c.ph_rw.nsid = cpu_to_le32(ns->ns_id); + c.ph_rw.control = cpu_to_le16(vio.control); + c.ph_rw.length = cpu_to_le16(vio.nppas); + + length = (vio.nppas + 1) << ns->lba_shift; + + ret = nvme_nvm_submit_user_cmd(ns->queue, ns, &c, + (void __user *)(uintptr_t)vio.addr, length, + (void __user *)(uintptr_t)vio.metadata, + vio.metadata_len, + (void __user *)(uintptr_t)vio.ppa_list, vio.nppas, + &vio.result, &vio.status, 0); + + if (ret && copy_to_user(uvio, &vio, sizeof(vio))) + return -EFAULT; + + return ret; +} + +static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, + struct nvm_passthru_vio __user *uvcmd) +{ + struct nvm_passthru_vio vcmd; + struct nvme_nvm_command c; + struct request_queue *q; + unsigned int timeout = 0; + int ret; + + if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd))) + return -EFAULT; + if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN))) + return -EACCES; + if (vcmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = vcmd.opcode; + c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); + /* cdw11-12 */ + c.ph_rw.length = cpu_to_le16(vcmd.nppas); + c.ph_rw.control = cpu_to_le32(vcmd.control); + c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15); + + if (vcmd.timeout_ms) + timeout = msecs_to_jiffies(vcmd.timeout_ms); + + q = admin ? 
ns->ctrl->admin_q : ns->queue; + + ret = nvme_nvm_submit_user_cmd(q, ns, + (struct nvme_nvm_command *)&c, + (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len, + (void __user *)(uintptr_t)vcmd.metadata, + vcmd.metadata_len, + (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas, + &vcmd.result, &vcmd.status, timeout); + + if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd))) + return -EFAULT; + + return ret; +} + +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case NVME_NVM_IOCTL_ADMIN_VIO: + return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg); + case NVME_NVM_IOCTL_IO_VIO: + return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg); + case NVME_NVM_IOCTL_SUBMIT_VIO: + return nvme_nvm_submit_vio(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { struct request_queue *q = ns->queue; @@ -622,7 +840,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev, return 0; id = &ndev->identity; - grp = &id->groups[0]; + grp = &id->grp; attr = &dattr->attr; if (strcmp(attr->name, "version") == 0) { @@ -633,10 +851,9 @@ static ssize_t nvm_dev_attr_show(struct device *dev, return scnprintf(page, PAGE_SIZE, "%u\n", id->cap); } else if (strcmp(attr->name, "device_mode") == 0) { return scnprintf(page, PAGE_SIZE, "%u\n", id->dom); + /* kept for compatibility */ } else if (strcmp(attr->name, "media_manager") == 0) { - if (!ndev->mt) - return scnprintf(page, PAGE_SIZE, "%s\n", "none"); - return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name); + return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm"); } else if (strcmp(attr->name, "ppa_format") == 0) { return scnprintf(page, PAGE_SIZE, "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index aead6d08ed2c..14cfc6f7facb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -19,6 +19,7 @@ #include <linux/kref.h> #include 
<linux/blk-mq.h> #include <linux/lightnvm.h> +#include <linux/sed-opal.h> enum { /* @@ -125,6 +126,8 @@ struct nvme_ctrl { struct list_head node; struct ida ns_ida; + struct opal_dev *opal_dev; + char name[12]; char serial[20]; char model[40]; @@ -137,6 +140,7 @@ struct nvme_ctrl { u32 max_hw_sectors; u16 oncs; u16 vid; + u16 oacs; atomic_t abort_limit; u8 event_limit; u8 vwc; @@ -267,6 +271,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl); void nvme_queue_scan(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send); + #define NVME_NR_AERS 1 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); @@ -318,6 +325,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); int nvme_nvm_register_sysfs(struct nvme_ns *ns); void nvme_nvm_unregister_sysfs(struct nvme_ns *ns); +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); #else static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) @@ -335,6 +343,11 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i { return 0; } +static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} #endif /* CONFIG_NVM */ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3faefabf339c..d67d0d0a3bc0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -43,6 +43,7 @@ #include <linux/types.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <asm/unaligned.h> +#include <linux/sed-opal.h> #include "nvme.h" @@ -895,12 +896,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_HANDLED; } - iod->aborted = 1; - if 
(atomic_dec_return(&dev->ctrl.abort_limit) < 0) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } + iod->aborted = 1; memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; @@ -1178,6 +1178,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); + dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; dev->admin_tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->admin_tagset)) @@ -1738,6 +1739,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); + kfree(dev->ctrl.opal_dev); kfree(dev); } @@ -1754,6 +1756,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING)) @@ -1786,6 +1789,14 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + if ((dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) && !dev->ctrl.opal_dev) { + dev->ctrl.opal_dev = + init_opal_dev(&dev->ctrl, &nvme_sec_submit); + } + + if (was_suspend) + opal_unlock_from_suspend(dev->ctrl.opal_dev); + result = nvme_setup_io_queues(dev); if (result) goto out; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 94352e4df831..013bfe049a48 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -117,7 +117,7 @@ static unsigned int sr_check_events(struct cdrom_device_info *cdi, unsigned int clearing, int slot); static int sr_packet(struct cdrom_device_info *, struct packet_command *); -static struct cdrom_device_ops sr_dops = { +static const struct cdrom_device_ops sr_dops = { .open = sr_open, .release = sr_release, .drive_status = sr_drive_status, 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4a2ab5d99ff7..8e4df3d6c8cd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -22,6 +22,7 @@ struct blk_mq_hw_ctx { unsigned long flags; /* BLK_MQ_F_* flags */ + void *sched_data; struct request_queue *queue; struct blk_flush_queue *fq; @@ -35,6 +36,7 @@ struct blk_mq_hw_ctx { atomic_t wait_index; struct blk_mq_tags *tags; + struct blk_mq_tags *sched_tags; struct srcu_struct queue_rq_srcu; @@ -60,7 +62,7 @@ struct blk_mq_hw_ctx { struct blk_mq_tag_set { unsigned int *mq_map; - struct blk_mq_ops *ops; + const struct blk_mq_ops *ops; unsigned int nr_hw_queues; unsigned int queue_depth; /* max hw supported */ unsigned int reserved_tags; @@ -151,11 +153,13 @@ enum { BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_BLOCKING = 1 << 5, + BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, + BLK_MQ_S_SCHED_RESTART = 2, BLK_MQ_MAX_DEPTH = 10240, @@ -179,14 +183,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); -void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ + BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ }; struct request *blk_mq_alloc_request(struct request_queue *q, int rw, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 519ea2c9df61..37c9a43c5e78 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -221,6 +221,15 @@ static inline bool op_is_write(unsigned int op) } /* + * Check if the bio or request is one that needs 
special treatment in the + * flush state machine. + */ +static inline bool op_is_flush(unsigned int op) +{ + return op & (REQ_FUA | REQ_PREFLUSH); +} + +/* * Reads are always treated as synchronous, as are requests with the FUA or * PREFLUSH flag. Other operations may be marked as synchronous using the * REQ_SYNC flag. @@ -232,22 +241,29 @@ static inline bool op_is_sync(unsigned int op) } typedef unsigned int blk_qc_t; -#define BLK_QC_T_NONE -1U -#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_NONE -1U +#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { return cookie != BLK_QC_T_NONE; } -static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num) +static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num, + bool internal) { - return tag | (queue_num << BLK_QC_T_SHIFT); + blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT); + + if (internal) + ret |= BLK_QC_T_INTERNAL; + + return ret; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) { - return cookie >> BLK_QC_T_SHIFT; + return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT; } static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) @@ -255,6 +271,11 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +static inline bool blk_qc_t_is_internal(blk_qc_t cookie) +{ + return (cookie & BLK_QC_T_INTERNAL) != 0; +} + struct blk_issue_stat { u64 time; }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ca8e8fd1078..05675b1dfd20 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -154,6 +154,7 @@ struct request { /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ + int tag; sector_t __sector; /* sector cursor */ struct bio *bio; @@ -220,9 +221,10 @@ struct request { unsigned short ioprio; + int internal_tag; + void *special; /* opaque pointer 
available for LLD use */ - int tag; int errors; /* @@ -407,7 +409,7 @@ struct request_queue { dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; - struct blk_mq_ops *mq_ops; + const struct blk_mq_ops *mq_ops; unsigned int *mq_map; @@ -569,6 +571,11 @@ struct request_queue { struct list_head tag_set_list; struct bio_set *bio_split; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dir; + struct dentry *mq_debugfs_dir; +#endif + bool mq_sysfs_init_done; }; @@ -600,6 +607,7 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ +#define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -1620,6 +1628,25 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } +/* + * Check if the two bvecs from two bios can be merged to one segment. + * If yes, no need to check gap between the two bios since the 1st bio + * and the 1st bvec in the 2nd bio can be handled in one segment. 
+ */ +static inline bool bios_segs_mergeable(struct request_queue *q, + struct bio *prev, struct bio_vec *prev_last_bv, + struct bio_vec *next_first_bv) +{ + if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv)) + return false; + if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv)) + return false; + if (prev->bi_seg_back_size + next_first_bv->bv_len > + queue_max_segment_size(q)) + return false; + return true; +} + static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, struct bio *next) { @@ -1629,7 +1656,8 @@ static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, bio_get_last_bvec(prev, &pb); bio_get_first_bvec(next, &nb); - return __bvec_gap_to_prev(q, &pb, nb.bv_offset); + if (!bios_segs_mergeable(q, prev, &pb, &nb)) + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); } return false; diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 8609d577bb66..6e8f209a6dff 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -36,7 +36,7 @@ struct packet_command /* Uniform cdrom data structures for cdrom.c */ struct cdrom_device_info { - struct cdrom_device_ops *ops; /* link to device_ops */ + const struct cdrom_device_ops *ops; /* link to device_ops */ struct list_head list; /* linked list of all device_info */ struct gendisk *disk; /* matching block layer disk */ void *handle; /* driver-dependent data */ @@ -87,7 +87,6 @@ struct cdrom_device_ops { /* driver specifications */ const int capability; /* capability flags */ - int n_minors; /* number of active minor devices */ /* handle uniform packets for scsi type devices (scsi,atapi) */ int (*generic_packet) (struct cdrom_device_info *, struct packet_command *); @@ -123,6 +122,8 @@ extern int cdrom_mode_sense(struct cdrom_device_info *cdi, int page_code, int page_control); extern void init_cdrom_command(struct packet_command *cgc, void *buffer, int len, int type); +extern int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, + struct 
packet_command *cgc); /* The SCSI spec says there could be 256 slots. */ #define CDROM_MAX_SLOTS 256 diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b276e9ef0e0b..b5825c4f06f7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -77,6 +77,34 @@ struct elevator_ops elevator_registered_fn *elevator_registered_fn; }; +struct blk_mq_alloc_data; +struct blk_mq_hw_ctx; + +struct elevator_mq_ops { + int (*init_sched)(struct request_queue *, struct elevator_type *); + void (*exit_sched)(struct elevator_queue *); + + bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); + bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); + int (*request_merge)(struct request_queue *q, struct request **, struct bio *); + void (*request_merged)(struct request_queue *, struct request *, int); + void (*requests_merged)(struct request_queue *, struct request *, struct request *); + struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); + void (*put_request)(struct request *); + void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); + struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); + bool (*has_work)(struct blk_mq_hw_ctx *); + void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); + void (*started_request)(struct request *); + void (*requeue_request)(struct request *); + struct request *(*former_request)(struct request_queue *, struct request *); + struct request *(*next_request)(struct request_queue *, struct request *); + int (*get_rq_priv)(struct request_queue *, struct request *); + void (*put_rq_priv)(struct request_queue *, struct request *); + void (*init_icq)(struct io_cq *); + void (*exit_icq)(struct io_cq *); +}; + #define ELV_NAME_MAX (16) struct elv_fs_entry { @@ -94,12 +122,16 @@ struct elevator_type struct kmem_cache *icq_cache; /* fields provided by elevator implementation */ - struct elevator_ops ops; + union { + struct 
elevator_ops sq; + struct elevator_mq_ops mq; + } ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; struct module *elevator_owner; + bool uses_mq; /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */ @@ -123,6 +155,7 @@ struct elevator_queue struct kobject kobj; struct mutex sysfs_lock; unsigned int registered:1; + unsigned int uses_mq:1; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; @@ -139,6 +172,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *, extern void elv_merged_request(struct request_queue *, struct request *, int); extern void elv_bio_merged(struct request_queue *q, struct request *, struct bio *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7c273bbc5351..ca45e4a088a9 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -80,8 +80,6 @@ struct nvm_dev_ops { unsigned int max_phys_sect; }; - - #ifdef CONFIG_NVM #include <linux/blkdev.h> @@ -109,6 +107,7 @@ enum { NVM_RSP_ERR_FAILWRITE = 0x40ff, NVM_RSP_ERR_EMPTYPAGE = 0x42ff, NVM_RSP_ERR_FAILECC = 0x4281, + NVM_RSP_ERR_FAILCRC = 0x4004, NVM_RSP_WARN_HIGHECC = 0x4700, /* Device opcodes */ @@ -202,11 +201,10 @@ struct nvm_addr_format { struct nvm_id { u8 ver_id; u8 vmnt; - u8 cgrps; u32 cap; u32 dom; struct nvm_addr_format ppaf; - struct nvm_id_group groups[4]; + struct nvm_id_group grp; } __packed; struct nvm_target { @@ -216,10 +214,6 @@ struct nvm_target { struct gendisk *disk; }; -struct nvm_tgt_instance { - struct nvm_tgt_type *tt; -}; - #define ADDR_EMPTY (~0ULL) #define 
NVM_VERSION_MAJOR 1 @@ -230,7 +224,6 @@ struct nvm_rq; typedef void (nvm_end_io_fn)(struct nvm_rq *); struct nvm_rq { - struct nvm_tgt_instance *ins; struct nvm_tgt_dev *dev; struct bio *bio; @@ -254,6 +247,8 @@ struct nvm_rq { u64 ppa_status; /* ppa media status */ int error; + + void *private; }; static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) @@ -272,15 +267,6 @@ enum { NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; -/* system block cpu representation */ -struct nvm_sb_info { - unsigned long seqnr; - unsigned long erase_cnt; - unsigned int version; - char mmtype[NVM_MMTYPE_LEN]; - struct ppa_addr fs_ppa; -}; - /* Device generic information */ struct nvm_geo { int nr_chnls; @@ -308,6 +294,7 @@ struct nvm_geo { int sec_per_lun; }; +/* sub-device structure */ struct nvm_tgt_dev { /* Device information */ struct nvm_geo geo; @@ -329,17 +316,10 @@ struct nvm_dev { struct list_head devices; - /* Media manager */ - struct nvmm_type *mt; - void *mp; - - /* System blocks */ - struct nvm_sb_info sb; - /* Device information */ struct nvm_geo geo; - /* lower page table */ + /* lower page table */ int lps_per_blk; int *lptbl; @@ -359,6 +339,10 @@ struct nvm_dev { struct mutex mlock; spinlock_t lock; + + /* target management */ + struct list_head area_list; + struct list_head targets; }; static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, @@ -391,10 +375,10 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, return l; } -static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, - struct ppa_addr r) +static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr r) { - struct nvm_geo *geo = &dev->geo; + struct nvm_geo *geo = &tgt_dev->geo; struct ppa_addr l; l.ppa = ((u64)r.g.blk) << geo->ppaf.blk_offset; @@ -407,10 +391,10 @@ static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, return l; } -static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, - 
struct ppa_addr r) +static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr r) { - struct nvm_geo *geo = &dev->geo; + struct nvm_geo *geo = &tgt_dev->geo; struct ppa_addr l; l.ppa = 0; @@ -452,15 +436,12 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) (ppa1.g.blk == ppa2.g.blk)); } -static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) -{ - return dev->lptbl[slc_pg]; -} - typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); typedef void (nvm_tgt_exit_fn)(void *); +typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); +typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); struct nvm_tgt_type { const char *name; @@ -469,12 +450,15 @@ struct nvm_tgt_type { /* target entry points */ nvm_tgt_make_rq_fn *make_rq; nvm_tgt_capacity_fn *capacity; - nvm_end_io_fn *end_io; /* module-specific init/teardown */ nvm_tgt_init_fn *init; nvm_tgt_exit_fn *exit; + /* sysfs */ + nvm_tgt_sysfs_init_fn *sysfs_init; + nvm_tgt_sysfs_exit_fn *sysfs_exit; + /* For internal use */ struct list_head list; }; @@ -487,103 +471,29 @@ extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *); extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); -typedef int (nvmm_register_fn)(struct nvm_dev *); -typedef void (nvmm_unregister_fn)(struct nvm_dev *); - -typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); -typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); -typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *); -typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int); -typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); -typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); 
-typedef struct ppa_addr (nvmm_trans_ppa_fn)(struct nvm_tgt_dev *, - struct ppa_addr, int); -typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int); - -enum { - TRANS_TGT_TO_DEV = 0x0, - TRANS_DEV_TO_TGT = 0x1, -}; - -struct nvmm_type { - const char *name; - unsigned int version[3]; - - nvmm_register_fn *register_mgr; - nvmm_unregister_fn *unregister_mgr; - - nvmm_create_tgt_fn *create_tgt; - nvmm_remove_tgt_fn *remove_tgt; - - nvmm_submit_io_fn *submit_io; - nvmm_erase_blk_fn *erase_blk; - - nvmm_get_area_fn *get_area; - nvmm_put_area_fn *put_area; - - nvmm_trans_ppa_fn *trans_ppa; - nvmm_part_to_tgt_fn *part_to_tgt; - - struct list_head list; -}; - -extern int nvm_register_mgr(struct nvmm_type *); -extern void nvm_unregister_mgr(struct nvmm_type *); - extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); -extern int nvm_set_bb_tbl(struct nvm_dev *, struct ppa_addr *, int, int); extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); -extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); -extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int); extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); -extern void nvm_end_io(struct nvm_rq *, int); -extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, - void 
*, int); -extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int, - int, void *, int); +extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); -extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -/* sysblk.c */ -#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ - -/* system block on disk representation */ -struct nvm_system_block { - __be32 magic; /* magic signature */ - __be32 seqnr; /* sequence number */ - __be32 erase_cnt; /* erase count */ - __be16 version; /* version number */ - u8 mmtype[NVM_MMTYPE_LEN]; /* media manager name */ - __be64 fs_ppa; /* PPA for media manager - * superblock */ -}; - -extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); -extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); -extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); - extern int nvm_dev_factory(struct nvm_dev *, int flags); -#define nvm_for_each_lun_ppa(geo, ppa, chid, lunid) \ - for ((chid) = 0, (ppa).ppa = 0; (chid) < (geo)->nr_chnls; \ - (chid)++, (ppa).g.ch = (chid)) \ - for ((lunid) = 0; (lunid) < (geo)->luns_per_chnl; \ - (lunid)++, (ppa).g.lun = (lunid)) +extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); #else /* CONFIG_NVM */ struct nvm_dev_ops; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3d1c6f1b15c9..00eac863a9c7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -244,6 +244,7 @@ enum { NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, + NVME_CTRL_OACS_SEC_SUPP = 1 << 0, }; struct nvme_lbaf { diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index f017fd6e69c4..d4e0a204c118 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -259,6 +259,26 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) unsigned int 
sbitmap_weight(const struct sbitmap *sb); /** + * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. + * @sb: Bitmap to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The format may change at any time. + */ +void sbitmap_show(struct sbitmap *sb, struct seq_file *m); + +/** + * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct + * seq_file. + * @sb: Bitmap to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The output isn't guaranteed to be internally + * consistent. + */ +void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); + +/** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. * @sbq: Bitmap queue to initialize. @@ -370,4 +390,14 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); +/** + * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct + * seq_file. + * @sbq: Bitmap queue to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The format may change at any time. + */ +void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); + #endif /* __LINUX_SCALE_BITMAP_H */ diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h new file mode 100644 index 000000000000..deee23d012e7 --- /dev/null +++ b/include/linux/sed-opal.h @@ -0,0 +1,70 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli <rafael.antognolli@intel.com> + * Scott Bauer <scott.bauer@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef LINUX_OPAL_H +#define LINUX_OPAL_H + +#include <uapi/linux/sed-opal.h> +#include <linux/kernel.h> + +struct opal_dev; + +typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer, + size_t len, bool send); + +#ifdef CONFIG_BLK_SED_OPAL +bool opal_unlock_from_suspend(struct opal_dev *dev); +struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); + +static inline bool is_sed_ioctl(unsigned int cmd) +{ + switch (cmd) { + case IOC_OPAL_SAVE: + case IOC_OPAL_LOCK_UNLOCK: + case IOC_OPAL_TAKE_OWNERSHIP: + case IOC_OPAL_ACTIVATE_LSP: + case IOC_OPAL_SET_PW: + case IOC_OPAL_ACTIVATE_USR: + case IOC_OPAL_REVERT_TPR: + case IOC_OPAL_LR_SETUP: + case IOC_OPAL_ADD_USR_TO_LR: + case IOC_OPAL_ENABLE_DISABLE_MBR: + case IOC_OPAL_ERASE_LR: + case IOC_OPAL_SECURE_ERASE_LR: + return true; + } + return false; +} +#else +static inline bool is_sed_ioctl(unsigned int cmd) +{ + return false; +} + +static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd, + void __user *ioctl_ptr) +{ + return 0; +} +static inline bool opal_unlock_from_suspend(struct opal_dev *dev) +{ + return false; +} +#define init_opal_dev(data, send_recv) NULL +#endif /* CONFIG_BLK_SED_OPAL */ +#endif /* LINUX_OPAL_H */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 774a43128a7a..fd19f36b3129 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -122,6 +122,44 @@ struct nvm_ioctl_dev_factory { __u32 flags; }; +struct nvm_user_vio { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nppas; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 ppa_list; + __u32 metadata_len; + __u32 
data_len; + __u64 status; + __u32 result; + __u32 rsvd3[3]; +}; + +struct nvm_passthru_vio { + __u8 opcode; + __u8 flags; + __u8 rsvd[2]; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u64 ppa_list; + __u16 nppas; + __u16 control; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u64 status; + __u32 result; + __u32 timeout_ms; +}; + /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ enum { /* top level cmds */ @@ -137,6 +175,11 @@ enum { /* Factory reset device */ NVM_DEV_FACTORY_CMD, + + /* Vector user I/O */ + NVM_DEV_VIO_ADMIN_CMD = 0x41, + NVM_DEV_VIO_CMD = 0x42, + NVM_DEV_VIO_USER_CMD = 0x43, }; #define NVM_IOCTL 'L' /* 0x4c */ @@ -154,6 +197,13 @@ enum { #define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ struct nvm_ioctl_dev_factory) +#define NVME_NVM_IOCTL_IO_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_USER_CMD, \ + struct nvm_passthru_vio) +#define NVME_NVM_IOCTL_ADMIN_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_ADMIN_CMD,\ + struct nvm_passthru_vio) +#define NVME_NVM_IOCTL_SUBMIT_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_CMD,\ + struct nvm_user_vio) + #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCHLEVEL 0 diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h new file mode 100644 index 000000000000..c72e0735532d --- /dev/null +++ b/include/uapi/linux/sed-opal.h @@ -0,0 +1,119 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli <rafael.antognolli@intel.com> + * Scott Bauer <scott.bauer@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + */ + +#ifndef _UAPI_SED_OPAL_H +#define _UAPI_SED_OPAL_H + +#include <linux/types.h> + +#define OPAL_KEY_MAX 256 +#define OPAL_MAX_LRS 9 + +enum opal_mbr { + OPAL_MBR_ENABLE = 0x0, + OPAL_MBR_DISABLE = 0x01, +}; + +enum opal_user { + OPAL_ADMIN1 = 0x0, + OPAL_USER1 = 0x01, + OPAL_USER2 = 0x02, + OPAL_USER3 = 0x03, + OPAL_USER4 = 0x04, + OPAL_USER5 = 0x05, + OPAL_USER6 = 0x06, + OPAL_USER7 = 0x07, + OPAL_USER8 = 0x08, + OPAL_USER9 = 0x09, +}; + +enum opal_lock_state { + OPAL_RO = 0x01, /* 0001 */ + OPAL_RW = 0x02, /* 0010 */ + OPAL_LK = 0x04, /* 0100 */ +}; + +struct opal_key { + __u8 lr; + __u8 key_len; + __u8 __align[6]; + __u8 key[OPAL_KEY_MAX]; +}; + +struct opal_lr_act { + struct opal_key key; + __u32 sum; + __u8 num_lrs; + __u8 lr[OPAL_MAX_LRS]; + __u8 align[2]; /* Align to 8 byte boundary */ +}; + +struct opal_session_info { + __u32 sum; + __u32 who; + struct opal_key opal_key; +}; + +struct opal_user_lr_setup { + __u64 range_start; + __u64 range_length; + __u32 RLE; /* Read Lock enabled */ + __u32 WLE; /* Write Lock Enabled */ + struct opal_session_info session; +}; + +struct opal_lock_unlock { + struct opal_session_info session; + __u32 l_state; + __u8 __align[4]; +}; + +struct opal_new_pw { + struct opal_session_info session; + + /* When we're not operating in sum, and we first set + * passwords we need to set them via ADMIN authority. + * After passwords are changed, we can set them via, + * User authorities. + * Because of this restriction we need to know about + * Two different users. One in 'session' which we will use + * to start the session and new_userr_pw as the user we're + * chaning the pw for. 
+ */ + struct opal_session_info new_user_pw; +}; + +struct opal_mbr_data { + struct opal_key key; + __u8 enable_disable; + __u8 __align[7]; +}; + +#define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) +#define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) +#define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) +#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_lr_act) +#define IOC_OPAL_SET_PW _IOW('p', 224, struct opal_new_pw) +#define IOC_OPAL_ACTIVATE_USR _IOW('p', 225, struct opal_session_info) +#define IOC_OPAL_REVERT_TPR _IOW('p', 226, struct opal_key) +#define IOC_OPAL_LR_SETUP _IOW('p', 227, struct opal_user_lr_setup) +#define IOC_OPAL_ADD_USR_TO_LR _IOW('p', 228, struct opal_lock_unlock) +#define IOC_OPAL_ENABLE_DISABLE_MBR _IOW('p', 229, struct opal_mbr_data) +#define IOC_OPAL_ERASE_LR _IOW('p', 230, struct opal_session_info) +#define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) + +#endif /* _UAPI_SED_OPAL_H */ diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 2cecf05c82fd..55e11c4b2f3b 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -17,6 +17,7 @@ #include <linux/random.h> #include <linux/sbitmap.h> +#include <linux/seq_file.h> int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node) @@ -180,6 +181,62 @@ unsigned int sbitmap_weight(const struct sbitmap *sb) } EXPORT_SYMBOL_GPL(sbitmap_weight); +void sbitmap_show(struct sbitmap *sb, struct seq_file *m) +{ + seq_printf(m, "depth=%u\n", sb->depth); + seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); + seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); + seq_printf(m, "map_nr=%u\n", sb->map_nr); +} +EXPORT_SYMBOL_GPL(sbitmap_show); + +static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte) +{ + if ((offset & 0xf) == 0) { + if (offset != 0) + seq_putc(m, '\n'); + seq_printf(m, "%08x:", offset); + } + if ((offset & 0x1) == 0) + seq_putc(m, ' '); + seq_printf(m, "%02x", byte); +} + +void 
sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) +{ + u8 byte = 0; + unsigned int byte_bits = 0; + unsigned int offset = 0; + int i; + + for (i = 0; i < sb->map_nr; i++) { + unsigned long word = READ_ONCE(sb->map[i].word); + unsigned int word_bits = READ_ONCE(sb->map[i].depth); + + while (word_bits > 0) { + unsigned int bits = min(8 - byte_bits, word_bits); + + byte |= (word & (BIT(bits) - 1)) << byte_bits; + byte_bits += bits; + if (byte_bits == 8) { + emit_byte(m, offset, byte); + byte = 0; + byte_bits = 0; + offset++; + } + word >>= bits; + word_bits -= bits; + } + } + if (byte_bits) { + emit_byte(m, offset, byte); + offset++; + } + if (offset) + seq_putc(m, '\n'); +} +EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); + static unsigned int sbq_calc_wake_batch(unsigned int depth) { unsigned int wake_batch; @@ -239,7 +296,19 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { - sbq->wake_batch = sbq_calc_wake_batch(depth); + unsigned int wake_batch = sbq_calc_wake_batch(depth); + int i; + + if (sbq->wake_batch != wake_batch) { + WRITE_ONCE(sbq->wake_batch, wake_batch); + /* + * Pairs with the memory barrier in sbq_wake_up() to ensure that + * the batch size is updated before the wait counts. + */ + smp_mb__before_atomic(); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) + atomic_set(&sbq->ws[i].wait_cnt, 1); + } sbitmap_resize(&sbq->sb, depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); @@ -297,20 +366,39 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) static void sbq_wake_up(struct sbitmap_queue *sbq) { struct sbq_wait_state *ws; + unsigned int wake_batch; int wait_cnt; - /* Ensure that the wait list checks occur after clear_bit(). */ - smp_mb(); + /* + * Pairs with the memory barrier in set_current_state() to ensure the + * proper ordering of clear_bit()/waitqueue_active() in the waker and + * test_and_set_bit()/prepare_to_wait()/finish_wait() in the waiter. 
See + * the comment on waitqueue_active(). This is __after_atomic because we + * just did clear_bit() in the caller. + */ + smp_mb__after_atomic(); ws = sbq_wake_ptr(sbq); if (!ws) return; wait_cnt = atomic_dec_return(&ws->wait_cnt); - if (unlikely(wait_cnt < 0)) - wait_cnt = atomic_inc_return(&ws->wait_cnt); - if (wait_cnt == 0) { - atomic_add(sbq->wake_batch, &ws->wait_cnt); + if (wait_cnt <= 0) { + wake_batch = READ_ONCE(sbq->wake_batch); + /* + * Pairs with the memory barrier in sbitmap_queue_resize() to + * ensure that we see the batch size update before the wait + * count is reset. + */ + smp_mb__before_atomic(); + /* + * If there are concurrent callers to sbq_wake_up(), the last + * one to decrement the wait count below zero will bump it back + * up. If there is a concurrent resize, the count reset will + * either cause the cmpxchg to fail or overwrite after the + * cmpxchg. + */ + atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch); sbq_index_atomic_inc(&sbq->wake_index); wake_up(&ws->wait); } @@ -331,7 +419,8 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) int i, wake_index; /* - * Make sure all changes prior to this are visible from other CPUs. + * Pairs with the memory barrier in set_current_state() like in + * sbq_wake_up(). 
*/ smp_mb(); wake_index = atomic_read(&sbq->wake_index); @@ -345,3 +434,37 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) } } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); + +void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) +{ + bool first; + int i; + + sbitmap_show(&sbq->sb, m); + + seq_puts(m, "alloc_hint={"); + first = true; + for_each_possible_cpu(i) { + if (!first) + seq_puts(m, ", "); + first = false; + seq_printf(m, "%u", *per_cpu_ptr(sbq->alloc_hint, i)); + } + seq_puts(m, "}\n"); + + seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); + seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); + + seq_puts(m, "ws={\n"); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[i]; + + seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n", + atomic_read(&ws->wait_cnt), + waitqueue_active(&ws->wait) ? "active" : "inactive"); + } + seq_puts(m, "}\n"); + + seq_printf(m, "round_robin=%d\n", sbq->round_robin); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_show); |