From e4043dcf30811f5db15181168e2aac172514302a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 9 Apr 2014 10:18:23 -0600
Subject: blk-mq: ensure that hardware queues are always run on the mapped CPUs

Instead of providing soft mappings with no guarantees on hardware
queues always being run on the right CPU, switch to a hard mapping
guarantee that ensure that we always run the hardware queue on
(one of, if more) the mapped CPU.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-mq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/blk-mq.h')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0120451545d8..b6ee48740458 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -19,6 +19,7 @@ struct blk_mq_hw_ctx {
 
 	unsigned long		state;		/* BLK_MQ_S_* flags */
 	struct delayed_work	delayed_work;
+	cpumask_var_t		cpumask;
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
 
-- 
cgit v1.2.3


From e9b267d91f6ddbc694cb40aa962b0b2cec03971d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 15 Apr 2014 13:59:10 -0600
Subject: blk-mq: add ->init_request and ->exit_request methods

The current blk_mq_init_commands/blk_mq_free_commands interface has a
two problems:

 1) Because only the constructor is passed to blk_mq_init_commands there
    is no easy way to clean up when a comman initialization failed.  The
    current code simply leaks the allocations done in the constructor.

 2) There is no good place to call blk_mq_free_commands: before
    blk_cleanup_queue there is no guarantee that all outstanding
    commands have completed, so we can't free them yet.  After
    blk_cleanup_queue the queue has usually been freed.  This can be
    worked around by grabbing an unconditional reference before calling
    blk_cleanup_queue and dropping it after blk_mq_free_commands is
    done, although that's not exatly pretty and driver writers are
    guaranteed to get it wrong sooner or later.

Both issues are easily fixed by making the request constructor and
destructor normal blk_mq_ops methods.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c             | 105 ++++++++++++++-------------------------------
 drivers/block/virtio_blk.c |  23 +++++-----
 include/linux/blk-mq.h     |  14 +++++-
 3 files changed, 55 insertions(+), 87 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index e644feec068c..48d2d8495f5e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1031,74 +1031,20 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	blk_mq_put_ctx(ctx);
 }
 
-static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
-				   int (*init)(void *, struct blk_mq_hw_ctx *,
-					struct request *, unsigned int),
-				   void *data)
+static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx, void *driver_data)
 {
-	unsigned int i;
-	int ret = 0;
-
-	for (i = 0; i < hctx->queue_depth; i++) {
-		struct request *rq = hctx->rqs[i];
-
-		ret = init(data, hctx, rq, i);
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-
-int blk_mq_init_commands(struct request_queue *q,
-			 int (*init)(void *, struct blk_mq_hw_ctx *,
-					struct request *, unsigned int),
-			 void *data)
-{
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-	int ret = 0;
-
-	queue_for_each_hw_ctx(q, hctx, i) {
-		ret = blk_mq_init_hw_commands(hctx, init, data);
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL(blk_mq_init_commands);
-
-static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx,
-				    void (*free)(void *, struct blk_mq_hw_ctx *,
-					struct request *, unsigned int),
-				    void *data)
-{
-	unsigned int i;
+	struct page *page;
 
-	for (i = 0; i < hctx->queue_depth; i++) {
-		struct request *rq = hctx->rqs[i];
+	if (hctx->rqs && hctx->queue->mq_ops->exit_request) {
+		int i;
 
-		free(data, hctx, rq, i);
+		for (i = 0; i < hctx->queue_depth; i++) {
+			if (!hctx->rqs[i])
+				continue;
+			hctx->queue->mq_ops->exit_request(driver_data, hctx,
+							  hctx->rqs[i], i);
+		}
 	}
-}
-
-void blk_mq_free_commands(struct request_queue *q,
-			  void (*free)(void *, struct blk_mq_hw_ctx *,
-					struct request *, unsigned int),
-			  void *data)
-{
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		blk_mq_free_hw_commands(hctx, free, data);
-}
-EXPORT_SYMBOL(blk_mq_free_commands);
-
-static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
-{
-	struct page *page;
 
 	while (!list_empty(&hctx->page_list)) {
 		page = list_first_entry(&hctx->page_list, struct page, lru);
@@ -1123,10 +1069,12 @@ static size_t order_to_size(unsigned int order)
 }
 
 static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
-			      unsigned int reserved_tags, int node)
+		struct blk_mq_reg *reg, void *driver_data, int node)
 {
+	unsigned int reserved_tags = reg->reserved_tags;
 	unsigned int i, j, entries_per_page, max_order = 4;
 	size_t rq_size, left;
+	int error;
 
 	INIT_LIST_HEAD(&hctx->page_list);
 
@@ -1175,14 +1123,23 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 		for (j = 0; j < to_do; j++) {
 			hctx->rqs[i] = p;
 			blk_rq_init(hctx->queue, hctx->rqs[i]);
+			if (reg->ops->init_request) {
+				error = reg->ops->init_request(driver_data,
+						hctx, hctx->rqs[i], i);
+				if (error)
+					goto err_rq_map;
+			}
+
 			p += rq_size;
 			i++;
 		}
 	}
 
-	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
+	if (i < (reserved_tags + BLK_MQ_TAG_MIN)) {
+		error = -ENOMEM;
 		goto err_rq_map;
-	else if (i != hctx->queue_depth) {
+	}
+	if (i != hctx->queue_depth) {
 		hctx->queue_depth = i;
 		pr_warn("%s: queue depth set to %u because of low memory\n",
 					__func__, i);
@@ -1190,12 +1147,14 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 
 	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
 	if (!hctx->tags) {
-err_rq_map:
-		blk_mq_free_rq_map(hctx);
-		return -ENOMEM;
+		error = -ENOMEM;
+		goto err_rq_map;
 	}
 
 	return 0;
+err_rq_map:
+	blk_mq_free_rq_map(hctx, driver_data);
+	return error;
 }
 
 static int blk_mq_init_hw_queues(struct request_queue *q,
@@ -1228,7 +1187,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 						blk_mq_hctx_notify, hctx);
 		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
 
-		if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
+		if (blk_mq_init_rq_map(hctx, reg, driver_data, node))
 			break;
 
 		/*
@@ -1268,7 +1227,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 			reg->ops->exit_hctx(hctx, j);
 
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
-		blk_mq_free_rq_map(hctx);
+		blk_mq_free_rq_map(hctx, driver_data);
 		kfree(hctx->ctxs);
 	}
 
@@ -1455,7 +1414,7 @@ void blk_mq_free_queue(struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
-		blk_mq_free_rq_map(hctx);
+		blk_mq_free_rq_map(hctx, q->queuedata);
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)
 			q->mq_ops->exit_hctx(hctx, i);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c7d02bc9d945..d06206abd340 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -480,11 +480,22 @@ static const struct device_attribute dev_attr_cache_type_rw =
 	__ATTR(cache_type, S_IRUGO|S_IWUSR,
 	       virtblk_cache_type_show, virtblk_cache_type_store);
 
+static int virtblk_init_request(void *data, struct blk_mq_hw_ctx *hctx,
+		struct request *rq, unsigned int nr)
+{
+	struct virtio_blk *vblk = data;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
+
+	sg_init_table(vbr->sg, vblk->sg_elems);
+	return 0;
+}
+
 static struct blk_mq_ops virtio_mq_ops = {
 	.queue_rq	= virtio_queue_rq,
 	.map_queue	= blk_mq_map_queue,
 	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
 	.free_hctx	= blk_mq_free_single_hw_queue,
+	.init_request	= virtblk_init_request,
 	.complete	= virtblk_request_done,
 };
 
@@ -497,16 +508,6 @@ static struct blk_mq_reg virtio_mq_reg = {
 };
 module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444);
 
-static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
-			     struct request *rq, unsigned int nr)
-{
-	struct virtio_blk *vblk = data;
-	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
-
-	sg_init_table(vbr->sg, vblk->sg_elems);
-	return 0;
-}
-
 static int virtblk_probe(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk;
@@ -577,8 +578,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 		goto out_put_disk;
 	}
 
-	blk_mq_init_commands(q, virtblk_init_vbr, vblk);
-
 	q->queuedata = vblk;
 
 	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b6ee48740458..29c1a6e83814 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -67,6 +67,10 @@ typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
 typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (init_request_fn)(void *, struct blk_mq_hw_ctx *,
+		struct request *, unsigned int);
+typedef void (exit_request_fn)(void *, struct blk_mq_hw_ctx *,
+		struct request *, unsigned int);
 
 struct blk_mq_ops {
 	/*
@@ -99,6 +103,14 @@ struct blk_mq_ops {
 	 */
 	init_hctx_fn		*init_hctx;
 	exit_hctx_fn		*exit_hctx;
+
+	/*
+	 * Called for every command allocated by the block layer to allow
+	 * the driver to set up driver specific data.
+	 * Ditto for exit/teardown.
+	 */
+	init_request_fn		*init_request;
+	exit_request_fn		*exit_request;
 };
 
 enum {
@@ -118,8 +130,6 @@ enum {
 struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
 int blk_mq_register_disk(struct gendisk *);
 void blk_mq_unregister_disk(struct gendisk *);
-int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
-void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-- 
cgit v1.2.3


From 24d2f90309b23f2cfe016b2aebc5f0d6e01c57fd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 15 Apr 2014 14:14:00 -0600
Subject: blk-mq: split out tag initialization, support shared tags

Add a new blk_mq_tag_set structure that gets set up before we initialize
the queue.  A single blk_mq_tag_set structure can be shared by multiple
queues.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Modular export of blk_mq_{alloc,free}_tagset added by me.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-cpumap.c      |   6 +-
 block/blk-mq-tag.c         |  14 ---
 block/blk-mq-tag.h         |  19 +++-
 block/blk-mq.c             | 244 +++++++++++++++++++++++++--------------------
 block/blk-mq.h             |   5 +-
 drivers/block/null_blk.c   |  92 ++++++++++-------
 drivers/block/virtio_blk.c |  48 +++++----
 include/linux/blk-mq.h     |  34 +++----
 8 files changed, 262 insertions(+), 200 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 097921329619..5d0f93cf358c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -80,17 +80,17 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
 	return 0;
 }
 
-unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
+unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
 {
 	unsigned int *map;
 
 	/* If cpus are offline, map them to first hctx */
 	map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
-				reg->numa_node);
+				set->numa_node);
 	if (!map)
 		return NULL;
 
-	if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
+	if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
 		return map;
 
 	kfree(map);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83ae96c51a27..7a799c46c32d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,25 +1,11 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/percpu_ida.h>
 
 #include <linux/blk-mq.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-/*
- * Per tagged queue (tag address space) map
- */
-struct blk_mq_tags {
-	unsigned int nr_tags;
-	unsigned int nr_reserved_tags;
-	unsigned int nr_batch_move;
-	unsigned int nr_max_cache;
-
-	struct percpu_ida free_tags;
-	struct percpu_ida reserved_tags;
-};
-
 void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
 {
 	int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 947ba2c6148e..b602e3fa66ea 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -1,7 +1,24 @@
 #ifndef INT_BLK_MQ_TAG_H
 #define INT_BLK_MQ_TAG_H
 
-struct blk_mq_tags;
+#include <linux/percpu_ida.h>
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+	unsigned int nr_tags;
+	unsigned int nr_reserved_tags;
+	unsigned int nr_batch_move;
+	unsigned int nr_max_cache;
+
+	struct percpu_ida free_tags;
+	struct percpu_ida reserved_tags;
+
+	struct request **rqs;
+	struct list_head page_list;
+};
+
 
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2a5a0fed10a3..9180052d42cc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -81,7 +81,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
 
 	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
 	if (tag != BLK_MQ_TAG_FAIL) {
-		rq = hctx->rqs[tag];
+		rq = hctx->tags->rqs[tag];
 		blk_rq_init(hctx->queue, rq);
 		rq->tag = tag;
 
@@ -404,6 +404,12 @@ static void blk_mq_requeue_request(struct request *rq)
 		rq->nr_phys_segments--;
 }
 
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
+{
+	return tags->rqs[tag];
+}
+EXPORT_SYMBOL(blk_mq_tag_to_rq);
+
 struct blk_mq_timeout_data {
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long *next;
@@ -425,12 +431,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
 	do {
 		struct request *rq;
 
-		tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
-		if (tag >= hctx->queue_depth)
+		tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
+		if (tag >= hctx->tags->nr_tags)
 			break;
 
-		rq = hctx->rqs[tag++];
-
+		rq = blk_mq_tag_to_rq(hctx->tags, tag++);
+		if (rq->q != hctx->queue)
+			continue;
 		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 			continue;
 
@@ -969,11 +976,11 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
 }
 EXPORT_SYMBOL(blk_mq_map_queue);
 
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
 						   unsigned int hctx_index)
 {
 	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
-				GFP_KERNEL | __GFP_ZERO, reg->numa_node);
+				GFP_KERNEL | __GFP_ZERO, set->numa_node);
 }
 EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
 
@@ -1030,31 +1037,31 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	blk_mq_put_ctx(ctx);
 }
 
-static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx, void *driver_data)
+static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
+		struct blk_mq_tags *tags, unsigned int hctx_idx)
 {
 	struct page *page;
 
-	if (hctx->rqs && hctx->queue->mq_ops->exit_request) {
+	if (tags->rqs && set->ops->exit_request) {
 		int i;
 
-		for (i = 0; i < hctx->queue_depth; i++) {
-			if (!hctx->rqs[i])
+		for (i = 0; i < tags->nr_tags; i++) {
+			if (!tags->rqs[i])
 				continue;
-			hctx->queue->mq_ops->exit_request(driver_data, hctx,
-							  hctx->rqs[i], i);
+			set->ops->exit_request(set->driver_data, tags->rqs[i],
+						hctx_idx, i);
 		}
 	}
 
-	while (!list_empty(&hctx->page_list)) {
-		page = list_first_entry(&hctx->page_list, struct page, lru);
+	while (!list_empty(&tags->page_list)) {
+		page = list_first_entry(&tags->page_list, struct page, lru);
 		list_del_init(&page->lru);
 		__free_pages(page, page->private);
 	}
 
-	kfree(hctx->rqs);
+	kfree(tags->rqs);
 
-	if (hctx->tags)
-		blk_mq_free_tags(hctx->tags);
+	blk_mq_free_tags(tags);
 }
 
 static size_t order_to_size(unsigned int order)
@@ -1067,30 +1074,36 @@ static size_t order_to_size(unsigned int order)
 	return ret;
 }
 
-static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
-		struct blk_mq_reg *reg, void *driver_data, int node)
+static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+		unsigned int hctx_idx)
 {
-	unsigned int reserved_tags = reg->reserved_tags;
+	struct blk_mq_tags *tags;
 	unsigned int i, j, entries_per_page, max_order = 4;
 	size_t rq_size, left;
-	int error;
 
-	INIT_LIST_HEAD(&hctx->page_list);
+	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+				set->numa_node);
+	if (!tags)
+		return NULL;
 
-	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
-					GFP_KERNEL, node);
-	if (!hctx->rqs)
-		return -ENOMEM;
+	INIT_LIST_HEAD(&tags->page_list);
+
+	tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
+					GFP_KERNEL, set->numa_node);
+	if (!tags->rqs) {
+		blk_mq_free_tags(tags);
+		return NULL;
+	}
 
 	/*
 	 * rq_size is the size of the request plus driver payload, rounded
 	 * to the cacheline size
 	 */
-	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
+	rq_size = round_up(sizeof(struct request) + set->cmd_size,
 				cache_line_size());
-	left = rq_size * hctx->queue_depth;
+	left = rq_size * set->queue_depth;
 
-	for (i = 0; i < hctx->queue_depth;) {
+	for (i = 0; i < set->queue_depth; ) {
 		int this_order = max_order;
 		struct page *page;
 		int to_do;
@@ -1100,7 +1113,8 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 			this_order--;
 
 		do {
-			page = alloc_pages_node(node, GFP_KERNEL, this_order);
+			page = alloc_pages_node(set->numa_node, GFP_KERNEL,
+						this_order);
 			if (page)
 				break;
 			if (!this_order--)
@@ -1110,22 +1124,22 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 		} while (1);
 
 		if (!page)
-			break;
+			goto fail;
 
 		page->private = this_order;
-		list_add_tail(&page->lru, &hctx->page_list);
+		list_add_tail(&page->lru, &tags->page_list);
 
 		p = page_address(page);
 		entries_per_page = order_to_size(this_order) / rq_size;
-		to_do = min(entries_per_page, hctx->queue_depth - i);
+		to_do = min(entries_per_page, set->queue_depth - i);
 		left -= to_do * rq_size;
 		for (j = 0; j < to_do; j++) {
-			hctx->rqs[i] = p;
-			if (reg->ops->init_request) {
-				error = reg->ops->init_request(driver_data,
-						hctx, hctx->rqs[i], i);
-				if (error)
-					goto err_rq_map;
+			tags->rqs[i] = p;
+			if (set->ops->init_request) {
+				if (set->ops->init_request(set->driver_data,
+						tags->rqs[i], hctx_idx, i,
+						set->numa_node))
+					goto fail;
 			}
 
 			p += rq_size;
@@ -1133,30 +1147,16 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 		}
 	}
 
-	if (i < (reserved_tags + BLK_MQ_TAG_MIN)) {
-		error = -ENOMEM;
-		goto err_rq_map;
-	}
-	if (i != hctx->queue_depth) {
-		hctx->queue_depth = i;
-		pr_warn("%s: queue depth set to %u because of low memory\n",
-					__func__, i);
-	}
+	return tags;
 
-	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
-	if (!hctx->tags) {
-		error = -ENOMEM;
-		goto err_rq_map;
-	}
-
-	return 0;
-err_rq_map:
-	blk_mq_free_rq_map(hctx, driver_data);
-	return error;
+fail:
+	pr_warn("%s: failed to allocate requests\n", __func__);
+	blk_mq_free_rq_map(set, tags, hctx_idx);
+	return NULL;
 }
 
 static int blk_mq_init_hw_queues(struct request_queue *q,
-				 struct blk_mq_reg *reg, void *driver_data)
+		struct blk_mq_tag_set *set)
 {
 	struct blk_mq_hw_ctx *hctx;
 	unsigned int i, j;
@@ -1170,23 +1170,21 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 
 		node = hctx->numa_node;
 		if (node == NUMA_NO_NODE)
-			node = hctx->numa_node = reg->numa_node;
+			node = hctx->numa_node = set->numa_node;
 
 		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
 		spin_lock_init(&hctx->lock);
 		INIT_LIST_HEAD(&hctx->dispatch);
 		hctx->queue = q;
 		hctx->queue_num = i;
-		hctx->flags = reg->flags;
-		hctx->queue_depth = reg->queue_depth;
-		hctx->cmd_size = reg->cmd_size;
+		hctx->flags = set->flags;
+		hctx->cmd_size = set->cmd_size;
 
 		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
 						blk_mq_hctx_notify, hctx);
 		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
 
-		if (blk_mq_init_rq_map(hctx, reg, driver_data, node))
-			break;
+		hctx->tags = set->tags[i];
 
 		/*
 		 * Allocate space for all possible cpus to avoid allocation in
@@ -1206,8 +1204,8 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		hctx->nr_ctx_map = num_maps;
 		hctx->nr_ctx = 0;
 
-		if (reg->ops->init_hctx &&
-		    reg->ops->init_hctx(hctx, driver_data, i))
+		if (set->ops->init_hctx &&
+		    set->ops->init_hctx(hctx, set->driver_data, i))
 			break;
 	}
 
@@ -1221,11 +1219,10 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		if (i == j)
 			break;
 
-		if (reg->ops->exit_hctx)
-			reg->ops->exit_hctx(hctx, j);
+		if (set->ops->exit_hctx)
+			set->ops->exit_hctx(hctx, j);
 
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
-		blk_mq_free_rq_map(hctx, driver_data);
 		kfree(hctx->ctxs);
 	}
 
@@ -1290,41 +1287,25 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	}
 }
 
-struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
-					void *driver_data)
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
 	struct blk_mq_hw_ctx **hctxs;
 	struct blk_mq_ctx *ctx;
 	struct request_queue *q;
 	int i;
 
-	if (!reg->nr_hw_queues ||
-	    !reg->ops->queue_rq || !reg->ops->map_queue ||
-	    !reg->ops->alloc_hctx || !reg->ops->free_hctx)
-		return ERR_PTR(-EINVAL);
-
-	if (!reg->queue_depth)
-		reg->queue_depth = BLK_MQ_MAX_DEPTH;
-	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
-		pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
-		reg->queue_depth = BLK_MQ_MAX_DEPTH;
-	}
-
-	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
-		return ERR_PTR(-EINVAL);
-
 	ctx = alloc_percpu(struct blk_mq_ctx);
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 
-	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
-			reg->numa_node);
+	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
+			set->numa_node);
 
 	if (!hctxs)
 		goto err_percpu;
 
-	for (i = 0; i < reg->nr_hw_queues; i++) {
-		hctxs[i] = reg->ops->alloc_hctx(reg, i);
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		hctxs[i] = set->ops->alloc_hctx(set, i);
 		if (!hctxs[i])
 			goto err_hctxs;
 
@@ -1335,11 +1316,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
 		hctxs[i]->queue_num = i;
 	}
 
-	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
+	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
 	if (!q)
 		goto err_hctxs;
 
-	q->mq_map = blk_mq_make_queue_map(reg);
+	q->mq_map = blk_mq_make_queue_map(set);
 	if (!q->mq_map)
 		goto err_map;
 
@@ -1347,33 +1328,34 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
 	blk_queue_rq_timeout(q, 30000);
 
 	q->nr_queues = nr_cpu_ids;
-	q->nr_hw_queues = reg->nr_hw_queues;
+	q->nr_hw_queues = set->nr_hw_queues;
 
 	q->queue_ctx = ctx;
 	q->queue_hw_ctx = hctxs;
 
-	q->mq_ops = reg->ops;
+	q->mq_ops = set->ops;
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
 	q->sg_reserved_size = INT_MAX;
 
 	blk_queue_make_request(q, blk_mq_make_request);
-	blk_queue_rq_timed_out(q, reg->ops->timeout);
-	if (reg->timeout)
-		blk_queue_rq_timeout(q, reg->timeout);
+	blk_queue_rq_timed_out(q, set->ops->timeout);
+	if (set->timeout)
+		blk_queue_rq_timeout(q, set->timeout);
 
-	if (reg->ops->complete)
-		blk_queue_softirq_done(q, reg->ops->complete);
+	if (set->ops->complete)
+		blk_queue_softirq_done(q, set->ops->complete);
 
 	blk_mq_init_flush(q);
-	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
+	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
-	q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
-				cache_line_size()), GFP_KERNEL);
+	q->flush_rq = kzalloc(round_up(sizeof(struct request) +
+				set->cmd_size, cache_line_size()),
+				GFP_KERNEL);
 	if (!q->flush_rq)
 		goto err_hw;
 
-	if (blk_mq_init_hw_queues(q, reg, driver_data))
+	if (blk_mq_init_hw_queues(q, set))
 		goto err_flush_rq;
 
 	blk_mq_map_swqueue(q);
@@ -1391,11 +1373,11 @@ err_hw:
 err_map:
 	blk_cleanup_queue(q);
 err_hctxs:
-	for (i = 0; i < reg->nr_hw_queues; i++) {
+	for (i = 0; i < set->nr_hw_queues; i++) {
 		if (!hctxs[i])
 			break;
 		free_cpumask_var(hctxs[i]->cpumask);
-		reg->ops->free_hctx(hctxs[i], i);
+		set->ops->free_hctx(hctxs[i], i);
 	}
 	kfree(hctxs);
 err_percpu:
@@ -1412,7 +1394,6 @@ void blk_mq_free_queue(struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
-		blk_mq_free_rq_map(hctx, q->queuedata);
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)
 			q->mq_ops->exit_hctx(hctx, i);
@@ -1473,6 +1454,53 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
+{
+	int i;
+
+	if (!set->nr_hw_queues)
+		return -EINVAL;
+	if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
+		return -EINVAL;
+	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
+		return -EINVAL;
+
+	if (!set->nr_hw_queues ||
+	    !set->ops->queue_rq || !set->ops->map_queue ||
+	    !set->ops->alloc_hctx || !set->ops->free_hctx)
+		return -EINVAL;
+
+
+	set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags),
+				 GFP_KERNEL, set->numa_node);
+	if (!set->tags)
+		goto out;
+
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		set->tags[i] = blk_mq_init_rq_map(set, i);
+		if (!set->tags[i])
+			goto out_unwind;
+	}
+
+	return 0;
+
+out_unwind:
+	while (--i >= 0)
+		blk_mq_free_rq_map(set, set->tags[i], i);
+out:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(blk_mq_alloc_tag_set);
+
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
+{
+	int i;
+
+	for (i = 0; i < set->nr_hw_queues; i++)
+		blk_mq_free_rq_map(set, set->tags[i], i);
+}
+EXPORT_SYMBOL(blk_mq_free_tag_set);
+
 void blk_mq_disable_hotplug(void)
 {
 	mutex_lock(&all_q_mutex);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7964dadb7d64..5fa14f19f752 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
 #ifndef INT_BLK_MQ_H
 #define INT_BLK_MQ_H
 
+struct blk_mq_tag_set;
+
 struct blk_mq_ctx {
 	struct {
 		spinlock_t		lock;
@@ -46,8 +48,7 @@ void blk_mq_disable_hotplug(void);
 /*
  * CPU -> queue mappings
  */
-struct blk_mq_reg;
-extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
+extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
 extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
 
 void blk_mq_add_timer(struct request *rq);
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 71df69d90900..8e7e3a0b0d24 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -32,6 +32,7 @@ struct nullb {
 	unsigned int index;
 	struct request_queue *q;
 	struct gendisk *disk;
+	struct blk_mq_tag_set tag_set;
 	struct hrtimer timer;
 	unsigned int queue_depth;
 	spinlock_t lock;
@@ -320,10 +321,11 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
+static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_tag_set *set,
+		unsigned int hctx_index)
 {
-	int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes);
-	int tip = (reg->nr_hw_queues % nr_online_nodes);
+	int b_size = DIV_ROUND_UP(set->nr_hw_queues, nr_online_nodes);
+	int tip = (set->nr_hw_queues % nr_online_nodes);
 	int node = 0, i, n;
 
 	/*
@@ -338,7 +340,7 @@ static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned in
 
 			tip--;
 			if (!tip)
-				b_size = reg->nr_hw_queues / nr_online_nodes;
+				b_size = set->nr_hw_queues / nr_online_nodes;
 		}
 	}
 
@@ -387,13 +389,17 @@ static struct blk_mq_ops null_mq_ops = {
 	.map_queue      = blk_mq_map_queue,
 	.init_hctx	= null_init_hctx,
 	.complete	= null_softirq_done_fn,
+	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
+	.free_hctx	= blk_mq_free_single_hw_queue,
 };
 
-static struct blk_mq_reg null_mq_reg = {
-	.ops		= &null_mq_ops,
-	.queue_depth	= 64,
-	.cmd_size	= sizeof(struct nullb_cmd),
-	.flags		= BLK_MQ_F_SHOULD_MERGE,
+static struct blk_mq_ops null_mq_ops_pernode = {
+	.queue_rq       = null_queue_rq,
+	.map_queue      = blk_mq_map_queue,
+	.init_hctx	= null_init_hctx,
+	.complete	= null_softirq_done_fn,
+	.alloc_hctx	= null_alloc_hctx,
+	.free_hctx	= null_free_hctx,
 };
 
 static void null_del_dev(struct nullb *nullb)
@@ -402,6 +408,8 @@ static void null_del_dev(struct nullb *nullb)
 
 	del_gendisk(nullb->disk);
 	blk_cleanup_queue(nullb->q);
+	if (queue_mode == NULL_Q_MQ)
+		blk_mq_free_tag_set(&nullb->tag_set);
 	put_disk(nullb->disk);
 	kfree(nullb);
 }
@@ -506,7 +514,7 @@ static int null_add_dev(void)
 
 	nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
 	if (!nullb)
-		return -ENOMEM;
+		goto out;
 
 	spin_lock_init(&nullb->lock);
 
@@ -514,49 +522,47 @@ static int null_add_dev(void)
 		submit_queues = nr_online_nodes;
 
 	if (setup_queues(nullb))
-		goto err;
+		goto out_free_nullb;
 
 	if (queue_mode == NULL_Q_MQ) {
-		null_mq_reg.numa_node = home_node;
-		null_mq_reg.queue_depth = hw_queue_depth;
-		null_mq_reg.nr_hw_queues = submit_queues;
-
-		if (use_per_node_hctx) {
-			null_mq_reg.ops->alloc_hctx = null_alloc_hctx;
-			null_mq_reg.ops->free_hctx = null_free_hctx;
-		} else {
-			null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue;
-			null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue;
-		}
-
-		nullb->q = blk_mq_init_queue(&null_mq_reg, nullb);
+		if (use_per_node_hctx)
+			nullb->tag_set.ops = &null_mq_ops_pernode;
+		else
+			nullb->tag_set.ops = &null_mq_ops;
+		nullb->tag_set.nr_hw_queues = submit_queues;
+		nullb->tag_set.queue_depth = hw_queue_depth;
+		nullb->tag_set.numa_node = home_node;
+		nullb->tag_set.cmd_size	= sizeof(struct nullb_cmd);
+		nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+		nullb->tag_set.driver_data = nullb;
+
+		if (blk_mq_alloc_tag_set(&nullb->tag_set))
+			goto out_cleanup_queues;
+
+		nullb->q = blk_mq_init_queue(&nullb->tag_set);
+		if (!nullb->q)
+			goto out_cleanup_tags;
 	} else if (queue_mode == NULL_Q_BIO) {
 		nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+		if (!nullb->q)
+			goto out_cleanup_queues;
 		blk_queue_make_request(nullb->q, null_queue_bio);
 		init_driver_queues(nullb);
 	} else {
 		nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+		if (!nullb->q)
+			goto out_cleanup_queues;
 		blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
-		if (nullb->q)
-			blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+		blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
 		init_driver_queues(nullb);
 	}
 
-	if (!nullb->q)
-		goto queue_fail;
-
 	nullb->q->queuedata = nullb;
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
 
 	disk = nullb->disk = alloc_disk_node(1, home_node);
-	if (!disk) {
-queue_fail:
-		blk_cleanup_queue(nullb->q);
-		cleanup_queues(nullb);
-err:
-		kfree(nullb);
-		return -ENOMEM;
-	}
+	if (!disk)
+		goto out_cleanup_blk_queue;
 
 	mutex_lock(&lock);
 	list_add_tail(&nullb->list, &nullb_list);
@@ -579,6 +585,18 @@ err:
 	sprintf(disk->disk_name, "nullb%d", nullb->index);
 	add_disk(disk);
 	return 0;
+
+out_cleanup_blk_queue:
+	blk_cleanup_queue(nullb->q);
+out_cleanup_tags:
+	if (queue_mode == NULL_Q_MQ)
+		blk_mq_free_tag_set(&nullb->tag_set);
+out_cleanup_queues:
+	cleanup_queues(nullb);
+out_free_nullb:
+	kfree(nullb);
+out:
+	return -ENOMEM;
 }
 
 static int __init null_init(void)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index d06206abd340..f909a8821e65 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -30,6 +30,9 @@ struct virtio_blk
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
 
+	/* Block layer tags. */
+	struct blk_mq_tag_set tag_set;
+
 	/* Process context for config space updates */
 	struct work_struct config_work;
 
@@ -480,8 +483,9 @@ static const struct device_attribute dev_attr_cache_type_rw =
 	__ATTR(cache_type, S_IRUGO|S_IWUSR,
 	       virtblk_cache_type_show, virtblk_cache_type_store);
 
-static int virtblk_init_request(void *data, struct blk_mq_hw_ctx *hctx,
-		struct request *rq, unsigned int nr)
+static int virtblk_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
 {
 	struct virtio_blk *vblk = data;
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
@@ -495,18 +499,12 @@ static struct blk_mq_ops virtio_mq_ops = {
 	.map_queue	= blk_mq_map_queue,
 	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
 	.free_hctx	= blk_mq_free_single_hw_queue,
-	.init_request	= virtblk_init_request,
 	.complete	= virtblk_request_done,
+	.init_request	= virtblk_init_request,
 };
 
-static struct blk_mq_reg virtio_mq_reg = {
-	.ops		= &virtio_mq_ops,
-	.nr_hw_queues	= 1,
-	.queue_depth	= 0, /* Set in virtblk_probe */
-	.numa_node	= NUMA_NO_NODE,
-	.flags		= BLK_MQ_F_SHOULD_MERGE,
-};
-module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444);
+static unsigned int virtblk_queue_depth;
+module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
 
 static int virtblk_probe(struct virtio_device *vdev)
 {
@@ -562,20 +560,32 @@ static int virtblk_probe(struct virtio_device *vdev)
 	}
 
 	/* Default queue sizing is to fill the ring. */
-	if (!virtio_mq_reg.queue_depth) {
-		virtio_mq_reg.queue_depth = vblk->vq->num_free;
+	if (!virtblk_queue_depth) {
+		virtblk_queue_depth = vblk->vq->num_free;
 		/* ... but without indirect descs, we use 2 descs per req */
 		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
-			virtio_mq_reg.queue_depth /= 2;
+			virtblk_queue_depth /= 2;
 	}
-	virtio_mq_reg.cmd_size =
+
+	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
+	vblk->tag_set.ops = &virtio_mq_ops;
+	vblk->tag_set.nr_hw_queues = 1;
+	vblk->tag_set.queue_depth = virtblk_queue_depth;
+	vblk->tag_set.numa_node = NUMA_NO_NODE;
+	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	vblk->tag_set.cmd_size =
 		sizeof(struct virtblk_req) +
 		sizeof(struct scatterlist) * sg_elems;
+	vblk->tag_set.driver_data = vblk;
 
-	q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk);
+	err = blk_mq_alloc_tag_set(&vblk->tag_set);
+	if (err)
+		goto out_put_disk;
+
+	q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
 	if (!q) {
 		err = -ENOMEM;
-		goto out_put_disk;
+		goto out_free_tags;
 	}
 
 	q->queuedata = vblk;
@@ -678,6 +688,8 @@ static int virtblk_probe(struct virtio_device *vdev)
 out_del_disk:
 	del_gendisk(vblk->disk);
 	blk_cleanup_queue(vblk->disk->queue);
+out_free_tags:
+	blk_mq_free_tag_set(&vblk->tag_set);
 out_put_disk:
 	put_disk(vblk->disk);
 out_free_vq:
@@ -704,6 +716,8 @@ static void virtblk_remove(struct virtio_device *vdev)
 	del_gendisk(vblk->disk);
 	blk_cleanup_queue(vblk->disk->queue);
 
+	blk_mq_free_tag_set(&vblk->tag_set);
+
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 29c1a6e83814..a4ea0ce83b07 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -33,8 +33,6 @@ struct blk_mq_hw_ctx {
 	unsigned int 		nr_ctx_map;
 	unsigned long		*ctx_map;
 
-	struct request		**rqs;
-	struct list_head	page_list;
 	struct blk_mq_tags	*tags;
 
 	unsigned long		queued;
@@ -42,7 +40,6 @@ struct blk_mq_hw_ctx {
 #define BLK_MQ_MAX_DISPATCH_ORDER	10
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
-	unsigned int		queue_depth;
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */
 
@@ -50,7 +47,7 @@ struct blk_mq_hw_ctx {
 	struct kobject		kobj;
 };
 
-struct blk_mq_reg {
+struct blk_mq_tag_set {
 	struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;
 	unsigned int		queue_depth;
@@ -59,18 +56,22 @@ struct blk_mq_reg {
 	int			numa_node;
 	unsigned int		timeout;
 	unsigned int		flags;		/* BLK_MQ_F_* */
+	void			*driver_data;
+
+	struct blk_mq_tags	**tags;
 };
 
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
-typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
+typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_tag_set *,
+		unsigned int);
 typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
-typedef int (init_request_fn)(void *, struct blk_mq_hw_ctx *,
-		struct request *, unsigned int);
-typedef void (exit_request_fn)(void *, struct blk_mq_hw_ctx *,
-		struct request *, unsigned int);
+typedef int (init_request_fn)(void *, struct request *, unsigned int,
+		unsigned int, unsigned int);
+typedef void (exit_request_fn)(void *, struct request *, unsigned int,
+		unsigned int);
 
 struct blk_mq_ops {
 	/*
@@ -127,10 +128,13 @@ enum {
 	BLK_MQ_MAX_DEPTH	= 2048,
 };
 
-struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 int blk_mq_register_disk(struct gendisk *);
 void blk_mq_unregister_disk(struct gendisk *);
 
+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
+
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
 void blk_mq_insert_request(struct request *, bool, bool, bool);
@@ -139,10 +143,10 @@ void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
-struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int);
 void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 
 bool blk_mq_end_io_partial(struct request *rq, int error,
@@ -173,12 +177,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 	return (void *) rq + sizeof(*rq);
 }
 
-static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
-					       unsigned int tag)
-{
-	return hctx->rqs[tag];
-}
-
 #define queue_for_each_hw_ctx(q, hctx, i)				\
 	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
 	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
-- 
cgit v1.2.3


From 63151a449ebaef062ffac5b302206565ff5ef62e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 16 Apr 2014 09:44:52 +0200
Subject: blk-mq: allow drivers to hook into I/O completion

Split out the bottom half of blk_mq_end_io so that drivers can perform
work when they know a request has been completed, but before it has been
freed.  This also obsoletes blk_mq_end_io_partial as drivers can now
pass any value to blk_update_request directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 16 ++++++++++------
 include/linux/blk-mq.h |  9 ++-------
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b59a8d027dff..86d66e0e900c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -294,20 +294,24 @@ void blk_mq_clone_flush_request(struct request *flush_rq,
 		hctx->cmd_size);
 }
 
-bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes)
+inline void __blk_mq_end_io(struct request *rq, int error)
 {
-	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
-		return true;
-
 	blk_account_io_done(rq);
 
 	if (rq->end_io)
 		rq->end_io(rq, error);
 	else
 		blk_mq_free_request(rq);
-	return false;
 }
-EXPORT_SYMBOL(blk_mq_end_io_partial);
+EXPORT_SYMBOL(__blk_mq_end_io);
+
+void blk_mq_end_io(struct request *rq, int error)
+{
+	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
+		BUG();
+	__blk_mq_end_io(rq, error);
+}
+EXPORT_SYMBOL(blk_mq_end_io);
 
 static void __blk_mq_complete_request_remote(void *data)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a4ea0ce83b07..a81b474b794f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -149,13 +149,8 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_ind
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int);
 void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 
-bool blk_mq_end_io_partial(struct request *rq, int error,
-		unsigned int nr_bytes);
-static inline void blk_mq_end_io(struct request *rq, int error)
-{
-	bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq));
-	BUG_ON(!done);
-}
+void blk_mq_end_io(struct request *rq, int error);
+void __blk_mq_end_io(struct request *rq, int error);
 
 void blk_mq_complete_request(struct request *rq);
 
-- 
cgit v1.2.3


From 1b4a325858f695a9b5041313602d34b36f463724 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 16 Apr 2014 09:44:54 +0200
Subject: blk-mq: add async parameter to blk_mq_start_stopped_hw_queues

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c             | 4 ++--
 drivers/block/virtio_blk.c | 4 ++--
 include/linux/blk-mq.h     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 963a82109386..da3808823e44 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -700,7 +700,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
-void blk_mq_start_stopped_hw_queues(struct request_queue *q)
+void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
@@ -711,7 +711,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q)
 
 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 		preempt_disable();
-		blk_mq_run_hw_queue(hctx, true);
+		blk_mq_run_hw_queue(hctx, async);
 		preempt_enable();
 	}
 }
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f909a8821e65..7a51f065edcd 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -151,7 +151,7 @@ static void virtblk_done(struct virtqueue *vq)
 
 	/* In case queue is stopped waiting for more buffers. */
 	if (req_done)
-		blk_mq_start_stopped_hw_queues(vblk->disk->queue);
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
 }
 
 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
@@ -762,7 +762,7 @@ static int virtblk_restore(struct virtio_device *vdev)
 	vblk->config_enable = true;
 	ret = init_vq(vdev->priv);
 	if (!ret)
-		blk_mq_start_stopped_hw_queues(vblk->disk->queue);
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
 
 	return ret;
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a81b474b794f..9ecfab96d8c9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -157,7 +157,7 @@ void blk_mq_complete_request(struct request *rq);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
-void blk_mq_start_stopped_hw_queues(struct request_queue *q);
+void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 
 /*
  * Driver command data is immediately after the request. So subtract request
-- 
cgit v1.2.3


From 70f4db639c5b2479e08657392cbf3ba3cceea11c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 16 Apr 2014 10:48:08 -0600
Subject: blk-mq: add blk_mq_delay_queue

Add a blk-mq equivalent to blk_delay_queue so that the scsi layer can ask
to be kicked again after a delay.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Modified by me to kill the unnecessary preempt disable/enable
in the delayed workqueue handler.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  6 ++++--
 block/blk-mq.c         | 45 +++++++++++++++++++++++++++++++++++++++------
 include/linux/blk-mq.h |  4 +++-
 3 files changed, 46 insertions(+), 9 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index ae6227fd07aa..90b6e63b8769 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -251,8 +251,10 @@ void blk_sync_queue(struct request_queue *q)
 		struct blk_mq_hw_ctx *hctx;
 		int i;
 
-		queue_for_each_hw_ctx(q, hctx, i)
-			cancel_delayed_work_sync(&hctx->delayed_work);
+		queue_for_each_hw_ctx(q, hctx, i) {
+			cancel_delayed_work_sync(&hctx->run_work);
+			cancel_delayed_work_sync(&hctx->delay_work);
+		}
 	} else {
 		cancel_delayed_work_sync(&q->delay_work);
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index da3808823e44..0cf52dddfa6b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -640,7 +640,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
 		__blk_mq_run_hw_queue(hctx);
 	else if (hctx->queue->nr_hw_queues == 1)
-		kblockd_schedule_delayed_work(&hctx->delayed_work, 0);
+		kblockd_schedule_delayed_work(&hctx->run_work, 0);
 	else {
 		unsigned int cpu;
 
@@ -651,7 +651,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 		 * just queue on the first CPU.
 		 */
 		cpu = cpumask_first(hctx->cpumask);
-		kblockd_schedule_delayed_work_on(cpu, &hctx->delayed_work, 0);
+		kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
 	}
 }
 
@@ -675,7 +675,8 @@ EXPORT_SYMBOL(blk_mq_run_queues);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-	cancel_delayed_work(&hctx->delayed_work);
+	cancel_delayed_work(&hctx->run_work);
+	cancel_delayed_work(&hctx->delay_work);
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -717,15 +718,46 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
 
-static void blk_mq_work_fn(struct work_struct *work)
+static void blk_mq_run_work_fn(struct work_struct *work)
 {
 	struct blk_mq_hw_ctx *hctx;
 
-	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
+	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
 
 	__blk_mq_run_hw_queue(hctx);
 }
 
+static void blk_mq_delay_work_fn(struct work_struct *work)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
+
+	if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
+		__blk_mq_run_hw_queue(hctx);
+}
+
+void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
+{
+	unsigned long tmo = msecs_to_jiffies(msecs);
+
+	if (hctx->queue->nr_hw_queues == 1)
+		kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
+	else {
+		unsigned int cpu;
+
+		/*
+		 * It'd be great if the workqueue API had a way to pass
+		 * in a mask and had some smarts for more clever placement
+		 * than the first CPU. Or we could round-robin here. For now,
+		 * just queue on the first CPU.
+		 */
+		cpu = cpumask_first(hctx->cpumask);
+		kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
+	}
+}
+EXPORT_SYMBOL(blk_mq_delay_queue);
+
 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 				    struct request *rq, bool at_head)
 {
@@ -1179,7 +1211,8 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		if (node == NUMA_NO_NODE)
 			node = hctx->numa_node = set->numa_node;
 
-		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
+		INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
+		INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
 		spin_lock_init(&hctx->lock);
 		INIT_LIST_HEAD(&hctx->dispatch);
 		hctx->queue = q;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9ecfab96d8c9..ae868e77bc2f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -18,7 +18,8 @@ struct blk_mq_hw_ctx {
 	} ____cacheline_aligned_in_smp;
 
 	unsigned long		state;		/* BLK_MQ_S_* flags */
-	struct delayed_work	delayed_work;
+	struct delayed_work	run_work;
+	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
@@ -158,6 +159,7 @@ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
+void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 
 /*
  * Driver command data is immediately after the request. So subtract request
-- 
cgit v1.2.3


From 2f268556567ebeb3538f99b9bdad177581439dcb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 16 Apr 2014 09:44:56 +0200
Subject: blk-mq: add blk_mq_start_hw_queues

Add a helper to unconditionally kick contexts of a queue.  This will
be needed by the SCSI layer to provide fair queueing between multiple
devices on a single host.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 11 +++++++++++
 include/linux/blk-mq.h |  1 +
 2 files changed, 12 insertions(+)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0cf52dddfa6b..543bbc08a261 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -701,6 +701,17 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
+void blk_mq_start_hw_queues(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_start_hw_queue(hctx);
+}
+EXPORT_SYMBOL(blk_mq_start_hw_queues);
+
+
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 {
 	struct blk_mq_hw_ctx *hctx;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ae868e77bc2f..391377e53367 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -158,6 +158,7 @@ void blk_mq_complete_request(struct request *rq);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
+void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 
-- 
cgit v1.2.3


From ed0791b2f83cec4e77d88c4e9baabcebf9254a78 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 16 Apr 2014 09:44:57 +0200
Subject: blk-mq: add blk_mq_requeue_request

This allows to requeue a request that has been accepted by ->queue_rq
earlier.  This is needed by the SCSI layer in various error conditions.

The existing internal blk_mq_requeue_request is renamed to
__blk_mq_requeue_request as it is a lower level building block for this
funtionality.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 18 ++++++++++++++++--
 include/linux/blk-mq.h |  2 ++
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 543bbc08a261..ee225cc312b8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -400,7 +400,7 @@ static void blk_mq_start_request(struct request *rq, bool last)
 		rq->cmd_flags |= REQ_END;
 }
 
-static void blk_mq_requeue_request(struct request *rq)
+static void __blk_mq_requeue_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
@@ -413,6 +413,20 @@ static void blk_mq_requeue_request(struct request *rq)
 		rq->nr_phys_segments--;
 }
 
+void blk_mq_requeue_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+
+	__blk_mq_requeue_request(rq);
+	blk_clear_rq_complete(rq);
+
+	trace_block_rq_requeue(q, rq);
+
+	BUG_ON(blk_queued_rq(rq));
+	blk_mq_insert_request(rq, true, true, false);
+}
+EXPORT_SYMBOL(blk_mq_requeue_request);
+
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	return tags->rqs[tag];
@@ -602,7 +616,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 			 * time
 			 */
 			list_add(&rq->queuelist, &rq_list);
-			blk_mq_requeue_request(rq);
+			__blk_mq_requeue_request(rq);
 			break;
 		default:
 			pr_err("blk-mq: bad return on queue: %d\n", ret);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 391377e53367..ab469d525894 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -153,6 +153,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 void blk_mq_end_io(struct request *rq, int error);
 void __blk_mq_end_io(struct request *rq, int error);
 
+void blk_mq_requeue_request(struct request *rq);
+
 void blk_mq_complete_request(struct request *rq);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
-- 
cgit v1.2.3


From 38535201633077cbaf8b32886b5e3005b36c9024 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 25 Apr 2014 02:32:53 -0700
Subject: blk-mq: respect rq_affinity

The blk-mq code is using it's own version of the I/O completion affinity
tunables, which causes a few issues:

 - the rq_affinity sysfs file doesn't work for blk-mq devices, even if it
   still is present, thus breaking existing tuning setups.
 - the rq_affinity = 1 mode, which is the defauly for legacy request based
   drivers isn't implemented at all.
 - blk-mq drivers don't implement any completion affinity with the default
   flag settings.

This patches removes the blk-mq ipi_redirect flag and sysfs file, as well
as the internal BLK_MQ_F_SHOULD_IPI flag and replaces it with code that
respects the queue-wide rq_affinity flags and also implements the
rq_affinity = 1 mode.

This means I/O completion affinity can now only be tuned block-queue wide
instead of per context, which seems more sensible to me anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c   | 42 ------------------------------------------
 block/blk-mq.c         |  8 ++++++--
 block/blk-mq.h         |  1 -
 include/linux/blk-mq.h |  1 -
 4 files changed, 6 insertions(+), 46 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 9176a6984857..8145b5b25b4b 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -203,42 +203,6 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
-	ssize_t ret;
-
-	spin_lock(&hctx->lock);
-	ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
-	spin_unlock(&hctx->lock);
-
-	return ret;
-}
-
-static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
-					 const char *page, size_t len)
-{
-	struct blk_mq_ctx *ctx;
-	unsigned long ret;
-	unsigned int i;
-
-	if (kstrtoul(page, 10, &ret)) {
-		pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
-		return -EINVAL;
-	}
-
-	spin_lock(&hctx->lock);
-	if (ret)
-		hctx->flags |= BLK_MQ_F_SHOULD_IPI;
-	else
-		hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
-	spin_unlock(&hctx->lock);
-
-	hctx_for_each_ctx(hctx, ctx, i)
-		ctx->ipi_redirect = !!ret;
-
-	return len;
-}
-
 static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
 	return blk_mq_tag_sysfs_show(hctx->tags, page);
@@ -307,11 +271,6 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
 	.attr = {.name = "pending", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_rq_list_show,
 };
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
-	.attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
-	.show = blk_mq_hw_sysfs_ipi_show,
-	.store = blk_mq_hw_sysfs_ipi_store,
-};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
 	.attr = {.name = "tags", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_tags_show,
@@ -326,7 +285,6 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_run.attr,
 	&blk_mq_hw_sysfs_dispatched.attr,
 	&blk_mq_hw_sysfs_pending.attr,
-	&blk_mq_hw_sysfs_ipi.attr,
 	&blk_mq_hw_sysfs_tags.attr,
 	&blk_mq_hw_sysfs_cpus.attr,
 	NULL,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a84112c94e74..f2e92eb92803 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -326,15 +326,19 @@ static void __blk_mq_complete_request_remote(void *data)
 void __blk_mq_complete_request(struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	bool shared = false;
 	int cpu;
 
-	if (!ctx->ipi_redirect) {
+	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
 		rq->q->softirq_done_fn(rq);
 		return;
 	}
 
 	cpu = get_cpu();
-	if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
+	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
+		shared = cpus_share_cache(cpu, ctx->cpu);
+
+	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 		rq->csd.func = __blk_mq_complete_request_remote;
 		rq->csd.info = rq;
 		rq->csd.flags = 0;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b41a784de50d..1ae364ceaf8b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -11,7 +11,6 @@ struct blk_mq_ctx {
 
 	unsigned int		cpu;
 	unsigned int		index_hw;
-	unsigned int		ipi_redirect;
 
 	/* incremented at dispatch time */
 	unsigned long		rq_dispatched[2];
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ab469d525894..3b561d651a02 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -122,7 +122,6 @@ enum {
 
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_SHOULD_SORT	= 1 << 1,
-	BLK_MQ_F_SHOULD_IPI	= 1 << 2,
 
 	BLK_MQ_S_STOPPED	= 0,
 
-- 
cgit v1.2.3


From 506e931f92defdc60c1dc4aa2ff4a19a5dcd8618 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 7 May 2014 10:26:44 -0600
Subject: blk-mq: add basic round-robin of what CPU to queue workqueue work on

Right now we just pick the first CPU in the mask, but that can
easily overload that one. Add some basic batching and round-robin
all the entries in the mask instead.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 45 +++++++++++++++++++++++++++++++--------------
 include/linux/blk-mq.h |  4 ++++
 2 files changed, 35 insertions(+), 14 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0d379830a278..2410e0cb7aef 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -670,6 +670,30 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	}
 }
 
+/*
+ * It'd be great if the workqueue API had a way to pass
+ * in a mask and had some smarts for more clever placement.
+ * For now we just round-robin here, switching for every
+ * BLK_MQ_CPU_WORK_BATCH queued items.
+ */
+static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
+{
+	int cpu = hctx->next_cpu;
+
+	if (--hctx->next_cpu_batch <= 0) {
+		int next_cpu;
+
+		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+		if (next_cpu >= nr_cpu_ids)
+			next_cpu = cpumask_first(hctx->cpumask);
+
+		hctx->next_cpu = next_cpu;
+		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+	}
+
+	return cpu;
+}
+
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
@@ -682,13 +706,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	else {
 		unsigned int cpu;
 
-		/*
-		 * It'd be great if the workqueue API had a way to pass
-		 * in a mask and had some smarts for more clever placement
-		 * than the first CPU. Or we could round-robin here. For now,
-		 * just queue on the first CPU.
-		 */
-		cpu = cpumask_first(hctx->cpumask);
+		cpu = blk_mq_hctx_next_cpu(hctx);
 		kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
 	}
 }
@@ -795,13 +813,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 	else {
 		unsigned int cpu;
 
-		/*
-		 * It'd be great if the workqueue API had a way to pass
-		 * in a mask and had some smarts for more clever placement
-		 * than the first CPU. Or we could round-robin here. For now,
-		 * just queue on the first CPU.
-		 */
-		cpu = cpumask_first(hctx->cpumask);
+		cpu = blk_mq_hctx_next_cpu(hctx);
 		kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
 	}
 }
@@ -1378,6 +1390,11 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		hctx->next_cpu = cpumask_first(hctx->cpumask);
+		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+	}
 }
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3b561d651a02..5bd677e2dcb7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -21,6 +21,8 @@ struct blk_mq_hw_ctx {
 	struct delayed_work	run_work;
 	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
+	int			next_cpu;
+	int			next_cpu_batch;
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
 
@@ -126,6 +128,8 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 
 	BLK_MQ_MAX_DEPTH	= 2048,
+
+	BLK_MQ_CPU_WORK_BATCH	= 8,
 };
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
-- 
cgit v1.2.3


From 4bb659b156996f2993dc16fad71fec9ee070153c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 9 May 2014 09:36:49 -0600
Subject: blk-mq: implement new and more efficient tagging scheme

blk-mq currently uses percpu_ida for tag allocation. But that only
works well if the ratio between tag space and number of CPUs is
sufficiently high. For most devices and systems, that is not the
case. The end result if that we either only utilize the tag space
partially, or we end up attempting to fully exhaust it and run
into lots of lock contention with stealing between CPUs. This is
not optimal.

This new tagging scheme is a hybrid bitmap allocator. It uses
two tricks to both be SMP friendly and allow full exhaustion
of the space:

1) We cache the last allocated (or freed) tag on a per blk-mq
   software context basis. This allows us to limit the space
   we have to search. The key element here is not caching it
   in the shared tag structure, otherwise we end up dirtying
   more shared cache lines on each allocate/free operation.

2) The tag space is split into cache line sized groups, and
   each context will start off randomly in that space. Even up
   to full utilization of the space, this divides the tag users
   efficiently into cache line groups, avoiding dirtying the same
   one both between allocators and between allocator and freeer.

This scheme shows drastically better behaviour, both on small
tag spaces but on large ones as well. It has been tested extensively
to show better performance for all the cases blk-mq cares about.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c     | 415 ++++++++++++++++++++++++++++++++++++++++---------
 block/blk-mq-tag.h     |  42 ++++-
 block/blk-mq.c         |  23 ++-
 block/blk-mq.h         |   4 +-
 include/linux/blk-mq.h |   6 +-
 5 files changed, 391 insertions(+), 99 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 1f43d6ee956f..467f3a20b355 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,64 +1,257 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/random.h>
 
 #include <linux/blk-mq.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-void blk_mq_wait_for_tags(struct blk_mq_tags *tags, bool reserved)
+void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx,
+			  bool reserved)
 {
-	int tag = blk_mq_get_tag(tags, __GFP_WAIT, reserved);
-	blk_mq_put_tag(tags, tag);
+	int tag, zero = 0;
+
+	tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved);
+	blk_mq_put_tag(tags, tag, &zero);
+}
+
+static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
+{
+	int i;
+
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_mq_bitmap *bm = &bt->map[i];
+		int ret;
+
+		ret = find_first_zero_bit(&bm->word, bm->depth);
+		if (ret < bm->depth)
+			return true;
+	}
+
+	return false;
 }
 
 bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 {
-	return !tags ||
-		percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
+	if (!tags)
+		return true;
+
+	return bt_has_free_tags(&tags->bitmap_tags);
+}
+
+static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag)
+{
+	int tag, org_last_tag, end;
+
+	org_last_tag = last_tag = TAG_TO_BIT(last_tag);
+	end = bm->depth;
+	do {
+restart:
+		tag = find_next_zero_bit(&bm->word, end, last_tag);
+		if (unlikely(tag >= end)) {
+			/*
+			 * We started with an offset, start from 0 to
+			 * exhaust the map.
+			 */
+			if (org_last_tag && last_tag) {
+				end = last_tag;
+				last_tag = 0;
+				goto restart;
+			}
+			return -1;
+		}
+		last_tag = tag + 1;
+	} while (test_and_set_bit_lock(tag, &bm->word));
+
+	return tag;
+}
+
+/*
+ * Straight forward bitmap tag implementation, where each bit is a tag
+ * (cleared == free, and set == busy). The small twist is using per-cpu
+ * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
+ * contexts. This enables us to drastically limit the space searched,
+ * without dirtying an extra shared cacheline like we would if we stored
+ * the cache value inside the shared blk_mq_bitmap_tags structure. On top
+ * of that, each word of tags is in a separate cacheline. This means that
+ * multiple users will tend to stick to different cachelines, at least
+ * until the map is exhausted.
+ */
+static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache)
+{
+	unsigned int last_tag, org_last_tag;
+	int index, i, tag;
+
+	last_tag = org_last_tag = *tag_cache;
+	index = TAG_TO_INDEX(last_tag);
+
+	for (i = 0; i < bt->map_nr; i++) {
+		tag = __bt_get_word(&bt->map[index], last_tag);
+		if (tag != -1) {
+			tag += index * BITS_PER_LONG;
+			goto done;
+		}
+
+		last_tag = 0;
+		if (++index >= bt->map_nr)
+			index = 0;
+	}
+
+	*tag_cache = 0;
+	return -1;
+
+	/*
+	 * Only update the cache from the allocation path, if we ended
+	 * up using the specific cached tag.
+	 */
+done:
+	if (tag == org_last_tag) {
+		last_tag = tag + 1;
+		if (last_tag >= bt->depth - 1)
+			last_tag = 0;
+
+		*tag_cache = last_tag;
+	}
+
+	return tag;
+}
+
+static inline void bt_index_inc(unsigned int *index)
+{
+	*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
+}
+
+static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
+					 struct blk_mq_hw_ctx *hctx)
+{
+	struct bt_wait_state *bs;
+
+	if (!hctx)
+		return &bt->bs[0];
+
+	bs = &bt->bs[hctx->wait_index];
+	bt_index_inc(&hctx->wait_index);
+	return bs;
 }
 
-static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
+static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
+		  unsigned int *last_tag, gfp_t gfp)
 {
+	struct bt_wait_state *bs;
+	DEFINE_WAIT(wait);
 	int tag;
 
-	tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ?
-			       TASK_UNINTERRUPTIBLE : TASK_RUNNING);
-	if (tag < 0)
-		return BLK_MQ_TAG_FAIL;
-	return tag + tags->nr_reserved_tags;
+	tag = __bt_get(bt, last_tag);
+	if (tag != -1)
+		return tag;
+
+	if (!(gfp & __GFP_WAIT))
+		return -1;
+
+	bs = bt_wait_ptr(bt, hctx);
+	do {
+		bool was_empty;
+
+		was_empty = list_empty(&wait.task_list);
+		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+		tag = __bt_get(bt, last_tag);
+		if (tag != -1)
+			break;
+
+		if (was_empty)
+			atomic_set(&bs->wait_cnt, bt->wake_cnt);
+
+		io_schedule();
+	} while (1);
+
+	finish_wait(&bs->wait, &wait);
+	return tag;
+}
+
+static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags,
+				     struct blk_mq_hw_ctx *hctx,
+				     unsigned int *last_tag, gfp_t gfp)
+{
+	int tag;
+
+	tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp);
+	if (tag >= 0)
+		return tag + tags->nr_reserved_tags;
+
+	return BLK_MQ_TAG_FAIL;
 }
 
 static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
 					      gfp_t gfp)
 {
-	int tag;
+	int tag, zero = 0;
 
 	if (unlikely(!tags->nr_reserved_tags)) {
 		WARN_ON_ONCE(1);
 		return BLK_MQ_TAG_FAIL;
 	}
 
-	tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ?
-			       TASK_UNINTERRUPTIBLE : TASK_RUNNING);
+	tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp);
 	if (tag < 0)
 		return BLK_MQ_TAG_FAIL;
+
 	return tag;
 }
 
-unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved)
+unsigned int blk_mq_get_tag(struct blk_mq_tags *tags,
+			    struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
+			    gfp_t gfp, bool reserved)
 {
 	if (!reserved)
-		return __blk_mq_get_tag(tags, gfp);
+		return __blk_mq_get_tag(tags, hctx, last_tag, gfp);
 
 	return __blk_mq_get_reserved_tag(tags, gfp);
 }
 
+static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
+{
+	int i, wake_index;
+
+	wake_index = bt->wake_index;
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
+		struct bt_wait_state *bs = &bt->bs[wake_index];
+
+		if (waitqueue_active(&bs->wait)) {
+			if (wake_index != bt->wake_index)
+				bt->wake_index = wake_index;
+
+			return bs;
+		}
+
+		bt_index_inc(&wake_index);
+	}
+
+	return NULL;
+}
+
+static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
+{
+	const int index = TAG_TO_INDEX(tag);
+	struct bt_wait_state *bs;
+
+	clear_bit(TAG_TO_BIT(tag), &bt->map[index].word);
+
+	bs = bt_wake_ptr(bt);
+	if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
+		smp_mb__after_clear_bit();
+		atomic_set(&bs->wait_cnt, bt->wake_cnt);
+		bt_index_inc(&bt->wake_index);
+		wake_up(&bs->wait);
+	}
+}
+
 static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
 {
 	BUG_ON(tag >= tags->nr_tags);
 
-	percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
+	bt_clear_tag(&tags->bitmap_tags, tag);
 }
 
 static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
@@ -66,22 +259,41 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
 {
 	BUG_ON(tag >= tags->nr_reserved_tags);
 
-	percpu_ida_free(&tags->reserved_tags, tag);
+	bt_clear_tag(&tags->breserved_tags, tag);
 }
 
-void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
+void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag,
+		    unsigned int *last_tag)
 {
-	if (tag >= tags->nr_reserved_tags)
-		__blk_mq_put_tag(tags, tag);
-	else
+	if (tag >= tags->nr_reserved_tags) {
+		const int real_tag = tag - tags->nr_reserved_tags;
+
+		__blk_mq_put_tag(tags, real_tag);
+		*last_tag = real_tag;
+	} else
 		__blk_mq_put_reserved_tag(tags, tag);
 }
 
-static int __blk_mq_tag_iter(unsigned id, void *data)
+static void bt_for_each_free(struct blk_mq_bitmap_tags *bt,
+			     unsigned long *free_map, unsigned int off)
 {
-	unsigned long *tag_map = data;
-	__set_bit(id, tag_map);
-	return 0;
+	int i;
+
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_mq_bitmap *bm = &bt->map[i];
+		int bit = 0;
+
+		do {
+			bit = find_next_zero_bit(&bm->word, bm->depth, bit);
+			if (bit >= bm->depth)
+				break;
+
+			__set_bit(bit + off, free_map);
+			bit++;
+		} while (1);
+
+		off += BITS_PER_LONG;
+	}
 }
 
 void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
@@ -95,21 +307,98 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
 	if (!tag_map)
 		return;
 
-	percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
+	bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags);
 	if (tags->nr_reserved_tags)
-		percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
-			tag_map);
+		bt_for_each_free(&tags->breserved_tags, tag_map, 0);
 
 	fn(data, tag_map);
 	kfree(tag_map);
 }
 
+static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
+{
+	unsigned int i, used;
+
+	for (i = 0, used = 0; i < bt->map_nr; i++) {
+		struct blk_mq_bitmap *bm = &bt->map[i];
+
+		used += bitmap_weight(&bm->word, bm->depth);
+	}
+
+	return bt->depth - used;
+}
+
+static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
+			int node, bool reserved)
+{
+	int i;
+
+	/*
+	 * Depth can be zero for reserved tags, that's not a failure
+	 * condition.
+	 */
+	if (depth) {
+		int nr, i, map_depth;
+
+		nr = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
+		bt->map = kzalloc_node(nr * sizeof(struct blk_mq_bitmap),
+						GFP_KERNEL, node);
+		if (!bt->map)
+			return -ENOMEM;
+
+		bt->map_nr = nr;
+		map_depth = depth;
+		for (i = 0; i < nr; i++) {
+			bt->map[i].depth = min(map_depth, BITS_PER_LONG);
+			map_depth -= BITS_PER_LONG;
+		}
+	}
+
+	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
+	if (!bt->bs) {
+		kfree(bt->map);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < BT_WAIT_QUEUES; i++)
+		init_waitqueue_head(&bt->bs[i].wait);
+
+	bt->wake_cnt = BT_WAIT_BATCH;
+	if (bt->wake_cnt > depth / 4)
+		bt->wake_cnt = max(1U, depth / 4);
+
+	bt->depth = depth;
+	return 0;
+}
+
+static void bt_free(struct blk_mq_bitmap_tags *bt)
+{
+	kfree(bt->map);
+	kfree(bt->bs);
+}
+
+static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
+						   int node)
+{
+	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+
+	if (bt_alloc(&tags->bitmap_tags, depth, node, false))
+		goto enomem;
+	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
+		goto enomem;
+
+	return tags;
+enomem:
+	bt_free(&tags->bitmap_tags);
+	kfree(tags);
+	return NULL;
+}
+
 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 				     unsigned int reserved_tags, int node)
 {
 	unsigned int nr_tags, nr_cache;
 	struct blk_mq_tags *tags;
-	int ret;
 
 	if (total_tags > BLK_MQ_TAG_MAX) {
 		pr_err("blk-mq: tag depth too large\n");
@@ -121,72 +410,46 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 		return NULL;
 
 	nr_tags = total_tags - reserved_tags;
-	nr_cache = nr_tags / num_possible_cpus();
-
-	if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
-		nr_cache = BLK_MQ_TAG_CACHE_MIN;
-	else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
-		nr_cache = BLK_MQ_TAG_CACHE_MAX;
+	nr_cache = nr_tags / num_online_cpus();
 
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
-	tags->nr_max_cache = nr_cache;
-	tags->nr_batch_move = max(1u, nr_cache / 2);
-
-	ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
-				tags->nr_reserved_tags,
-				tags->nr_max_cache,
-				tags->nr_batch_move);
-	if (ret)
-		goto err_free_tags;
-
-	if (reserved_tags) {
-		/*
-		 * With max_cahe and batch set to 1, the allocator fallbacks to
-		 * no cached. It's fine reserved tags allocation is slow.
-		 */
-		ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
-				1, 1);
-		if (ret)
-			goto err_reserved_tags;
-	}
 
-	return tags;
-
-err_reserved_tags:
-	percpu_ida_destroy(&tags->free_tags);
-err_free_tags:
-	kfree(tags);
-	return NULL;
+	return blk_mq_init_bitmap_tags(tags, node);
 }
 
 void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
-	percpu_ida_destroy(&tags->free_tags);
-	percpu_ida_destroy(&tags->reserved_tags);
+	bt_free(&tags->bitmap_tags);
+	bt_free(&tags->breserved_tags);
 	kfree(tags);
 }
 
+void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
+{
+	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+
+	if (depth > 1)
+		*tag = prandom_u32() % (depth - 1);
+	else
+		*tag = 0;
+}
+
 ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 {
 	char *orig_page = page;
-	unsigned int cpu;
+	unsigned int free, res;
 
 	if (!tags)
 		return 0;
 
-	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u,"
-			" max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
-			tags->nr_batch_move, tags->nr_max_cache);
+	page += sprintf(page, "nr_tags=%u, reserved_tags=%u\n",
+			tags->nr_tags, tags->nr_reserved_tags);
 
-	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n",
-			percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
-			percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
+	free = bt_unused_tags(&tags->bitmap_tags);
+	res = bt_unused_tags(&tags->breserved_tags);
 
-	for_each_possible_cpu(cpu) {
-		page += sprintf(page, "  cpu%02u: nr_free=%u\n", cpu,
-				percpu_ida_free_tags(&tags->free_tags, cpu));
-	}
+	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
 
 	return page - orig_page;
 }
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index c8e0645ea331..06d4a2f0f7a0 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -1,7 +1,34 @@
 #ifndef INT_BLK_MQ_TAG_H
 #define INT_BLK_MQ_TAG_H
 
-#include <linux/percpu_ida.h>
+enum {
+	BT_WAIT_QUEUES	= 8,
+	BT_WAIT_BATCH	= 8,
+};
+
+struct bt_wait_state {
+	atomic_t wait_cnt;
+	wait_queue_head_t wait;
+} ____cacheline_aligned_in_smp;
+
+#define TAG_TO_INDEX(tag)	((tag) / BITS_PER_LONG)
+#define TAG_TO_BIT(tag)		((tag) & (BITS_PER_LONG - 1))
+
+struct blk_mq_bitmap {
+	unsigned long word;
+	unsigned long depth;
+} ____cacheline_aligned_in_smp;
+
+struct blk_mq_bitmap_tags {
+	unsigned int depth;
+	unsigned int wake_cnt;
+
+	struct blk_mq_bitmap *map;
+	unsigned int map_nr;
+
+	unsigned int wake_index;
+	struct bt_wait_state *bs;
+};
 
 /*
  * Tag address space map.
@@ -9,11 +36,9 @@
 struct blk_mq_tags {
 	unsigned int nr_tags;
 	unsigned int nr_reserved_tags;
-	unsigned int nr_batch_move;
-	unsigned int nr_max_cache;
 
-	struct percpu_ida free_tags;
-	struct percpu_ida reserved_tags;
+	struct blk_mq_bitmap_tags bitmap_tags;
+	struct blk_mq_bitmap_tags breserved_tags;
 
 	struct request **rqs;
 	struct list_head page_list;
@@ -23,12 +48,13 @@ struct blk_mq_tags {
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
-extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved);
-extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, bool reserved);
-extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
+extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
+extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved);
+extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag);
 extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
+extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
 
 enum {
 	BLK_MQ_TAG_CACHE_MIN	= 1,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 492f49f96459..9f07a266f7ab 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -74,12 +74,13 @@ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
 }
 
 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
+					      struct blk_mq_ctx *ctx,
 					      gfp_t gfp, bool reserved)
 {
 	struct request *rq;
 	unsigned int tag;
 
-	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
+	tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved);
 	if (tag != BLK_MQ_TAG_FAIL) {
 		rq = hctx->tags->rqs[tag];
 		rq->tag = tag;
@@ -246,7 +247,8 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
-		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
+		rq = __blk_mq_alloc_request(hctx, ctx, gfp & ~__GFP_WAIT,
+						reserved);
 		if (rq) {
 			blk_mq_rq_ctx_init(q, ctx, rq, rw);
 			break;
@@ -260,7 +262,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 			break;
 		}
 
-		blk_mq_wait_for_tags(hctx->tags, reserved);
+		blk_mq_wait_for_tags(hctx->tags, hctx, reserved);
 	} while (1);
 
 	return rq;
@@ -278,6 +280,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
 		blk_mq_put_ctx(rq->mq_ctx);
 	return rq;
 }
+EXPORT_SYMBOL(blk_mq_alloc_request);
 
 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
 					      gfp_t gfp)
@@ -301,7 +304,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	struct request_queue *q = rq->q;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	blk_mq_put_tag(hctx->tags, tag);
+	blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag);
 	blk_mq_queue_exit(q);
 }
 
@@ -677,11 +680,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 			queued++;
 			continue;
 		case BLK_MQ_RQ_QUEUE_BUSY:
-			/*
-			 * FIXME: we should have a mechanism to stop the queue
-			 * like blk_stop_queue, otherwise we will waste cpu
-			 * time
-			 */
 			list_add(&rq->queuelist, &rq_list);
 			__blk_mq_requeue_request(rq);
 			break;
@@ -873,6 +871,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 		list_add(&rq->queuelist, &ctx->rq_list);
 	else
 		list_add_tail(&rq->queuelist, &ctx->rq_list);
+
 	blk_mq_hctx_mark_pending(hctx, ctx);
 
 	/*
@@ -1046,7 +1045,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	if (is_sync)
 		rw |= REQ_SYNC;
 	trace_block_getrq(q, bio, rw);
-	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
+	rq = __blk_mq_alloc_request(hctx, ctx, GFP_ATOMIC, false);
 	if (likely(rq))
 		blk_mq_rq_ctx_init(q, ctx, rq, rw);
 	else {
@@ -1130,8 +1129,8 @@ EXPORT_SYMBOL(blk_mq_map_queue);
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
 						   unsigned int hctx_index)
 {
-	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
-				GFP_KERNEL | __GFP_ZERO, set->numa_node);
+	return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL,
+				set->numa_node);
 }
 EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1ae364ceaf8b..97cfab9c092f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -12,6 +12,8 @@ struct blk_mq_ctx {
 	unsigned int		cpu;
 	unsigned int		index_hw;
 
+	unsigned int		last_tag ____cacheline_aligned_in_smp;
+
 	/* incremented at dispatch time */
 	unsigned long		rq_dispatched[2];
 	unsigned long		rq_merged;
@@ -21,7 +23,7 @@ struct blk_mq_ctx {
 
 	struct request_queue	*queue;
 	struct kobject		kobj;
-};
+} ____cacheline_aligned_in_smp;
 
 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5bd677e2dcb7..f83d15f6e1c1 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -31,10 +31,12 @@ struct blk_mq_hw_ctx {
 
 	void			*driver_data;
 
-	unsigned int		nr_ctx;
-	struct blk_mq_ctx	**ctxs;
 	unsigned int 		nr_ctx_map;
 	unsigned long		*ctx_map;
+	unsigned int		nr_ctx;
+	struct blk_mq_ctx	**ctxs;
+
+	unsigned int		wait_index;
 
 	struct blk_mq_tags	*tags;
 
-- 
cgit v1.2.3


From 0d2602ca30e410e84e8bdf05c84ed5688e0a5a44 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 13 May 2014 15:10:52 -0600
Subject: blk-mq: improve support for shared tags maps

This adds support for active queue tracking, meaning that the
blk-mq tagging maintains a count of active users of a tag set.
This allows us to maintain a notion of fairness between users,
so that we can distribute the tag depth evenly without starving
some users while allowing others to try unfair deep queues.

If sharing of a tag set is detected, each hardware queue will
track the depth of its own queue. And if this exceeds the total
depth divided by the number of active queues, the user is actively
throttled down.

The active queue count is done lazily to avoid bouncing that data
between submitter and completer. Each hardware queue gets marked
active when it allocates its first tag, and gets marked inactive
when 1) the last tag is cleared, and 2) the queue timeout grace
period has passed.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c      |  10 +++++
 block/blk-mq-tag.c        | 112 +++++++++++++++++++++++++++++++++++++++-------
 block/blk-mq-tag.h        |  27 +++++++++--
 block/blk-mq.c            |  85 ++++++++++++++++++++++++++++++++---
 block/blk-timeout.c       |  13 +++++-
 block/blk.h               |   4 ++
 include/linux/blk-mq.h    |   7 +++
 include/linux/blk_types.h |   2 +
 include/linux/blkdev.h    |   3 ++
 9 files changed, 236 insertions(+), 27 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 8145b5b25b4b..99a60a829e69 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -208,6 +208,11 @@ static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
 	return blk_mq_tag_sysfs_show(hctx->tags, page);
 }
 
+static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+	return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
+}
+
 static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
 	unsigned int i, first = 1;
@@ -267,6 +272,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
 	.attr = {.name = "dispatched", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_dispatched_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
+	.attr = {.name = "active", .mode = S_IRUGO },
+	.show = blk_mq_hw_sysfs_active_show,
+};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
 	.attr = {.name = "pending", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_rq_list_show,
@@ -287,6 +296,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_pending.attr,
 	&blk_mq_hw_sysfs_tags.attr,
 	&blk_mq_hw_sysfs_cpus.attr,
+	&blk_mq_hw_sysfs_active.attr,
 	NULL,
 };
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 8d526a3e02f6..c80086c9c064 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -7,13 +7,12 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx,
-			  bool reserved)
+void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved)
 {
 	int tag, zero = 0;
 
-	tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved);
-	blk_mq_put_tag(tags, tag, &zero);
+	tag = blk_mq_get_tag(hctx, &zero, __GFP_WAIT, reserved);
+	blk_mq_put_tag(hctx, tag, &zero);
 }
 
 static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
@@ -40,6 +39,84 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 	return bt_has_free_tags(&tags->bitmap_tags);
 }
 
+static inline void bt_index_inc(unsigned int *index)
+{
+	*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
+}
+
+/*
+ * If a previously inactive queue goes active, bump the active user count.
+ */
+bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
+	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		atomic_inc(&hctx->tags->active_queues);
+
+	return true;
+}
+
+/*
+ * If a previously busy queue goes inactive, potential waiters could now
+ * be allowed to queue. Wake them up and check.
+ */
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+	struct blk_mq_bitmap_tags *bt;
+	int i, wake_index;
+
+	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return;
+
+	atomic_dec(&tags->active_queues);
+
+	/*
+	 * Will only throttle depth on non-reserved tags
+	 */
+	bt = &tags->bitmap_tags;
+	wake_index = bt->wake_index;
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
+		struct bt_wait_state *bs = &bt->bs[wake_index];
+
+		if (waitqueue_active(&bs->wait))
+			wake_up(&bs->wait);
+
+		bt_index_inc(&wake_index);
+	}
+}
+
+/*
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
+ */
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+				  struct blk_mq_bitmap_tags *bt)
+{
+	unsigned int depth, users;
+
+	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return true;
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return true;
+
+	/*
+	 * Don't try dividing an ant
+	 */
+	if (bt->depth == 1)
+		return true;
+
+	users = atomic_read(&hctx->tags->active_queues);
+	if (!users)
+		return true;
+
+	/*
+	 * Allow at least some tags
+	 */
+	depth = max((bt->depth + users - 1) / users, 4U);
+	return atomic_read(&hctx->nr_active) < depth;
+}
+
 static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag)
 {
 	int tag, org_last_tag, end;
@@ -78,11 +155,15 @@ restart:
  * multiple users will tend to stick to different cachelines, at least
  * until the map is exhausted.
  */
-static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache)
+static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
+		    unsigned int *tag_cache)
 {
 	unsigned int last_tag, org_last_tag;
 	int index, i, tag;
 
+	if (!hctx_may_queue(hctx, bt))
+		return -1;
+
 	last_tag = org_last_tag = *tag_cache;
 	index = TAG_TO_INDEX(bt, last_tag);
 
@@ -117,11 +198,6 @@ done:
 	return tag;
 }
 
-static inline void bt_index_inc(unsigned int *index)
-{
-	*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
-}
-
 static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
 					 struct blk_mq_hw_ctx *hctx)
 {
@@ -142,7 +218,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
 	DEFINE_WAIT(wait);
 	int tag;
 
-	tag = __bt_get(bt, last_tag);
+	tag = __bt_get(hctx, bt, last_tag);
 	if (tag != -1)
 		return tag;
 
@@ -156,7 +232,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
 		was_empty = list_empty(&wait.task_list);
 		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __bt_get(bt, last_tag);
+		tag = __bt_get(hctx, bt, last_tag);
 		if (tag != -1)
 			break;
 
@@ -200,14 +276,13 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
 	return tag;
 }
 
-unsigned int blk_mq_get_tag(struct blk_mq_tags *tags,
-			    struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
+unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
 			    gfp_t gfp, bool reserved)
 {
 	if (!reserved)
-		return __blk_mq_get_tag(tags, hctx, last_tag, gfp);
+		return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
 
-	return __blk_mq_get_reserved_tag(tags, gfp);
+	return __blk_mq_get_reserved_tag(hctx->tags, gfp);
 }
 
 static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
@@ -265,9 +340,11 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
 	bt_clear_tag(&tags->breserved_tags, tag);
 }
 
-void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag,
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
 		    unsigned int *last_tag)
 {
+	struct blk_mq_tags *tags = hctx->tags;
+
 	if (tag >= tags->nr_reserved_tags) {
 		const int real_tag = tag - tags->nr_reserved_tags;
 
@@ -465,6 +542,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 	res = bt_unused_tags(&tags->breserved_tags);
 
 	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
+	page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
 
 	return page - orig_page;
 }
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7aa9f0665489..0f5ec8b50ef3 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -38,6 +38,8 @@ struct blk_mq_tags {
 	unsigned int nr_tags;
 	unsigned int nr_reserved_tags;
 
+	atomic_t active_queues;
+
 	struct blk_mq_bitmap_tags bitmap_tags;
 	struct blk_mq_bitmap_tags breserved_tags;
 
@@ -49,9 +51,9 @@ struct blk_mq_tags {
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
-extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
-extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved);
-extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag);
+extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
+extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
 extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
@@ -68,4 +70,23 @@ enum {
 	BLK_MQ_TAG_MAX		= BLK_MQ_TAG_FAIL - 1,
 };
 
+extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+
+static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return false;
+
+	return __blk_mq_tag_busy(hctx);
+}
+
+static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return;
+
+	__blk_mq_tag_idle(hctx);
+}
+
 #endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9f07a266f7ab..3c4f1fceef8e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -80,9 +80,16 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
 	struct request *rq;
 	unsigned int tag;
 
-	tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved);
+	tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
 	if (tag != BLK_MQ_TAG_FAIL) {
 		rq = hctx->tags->rqs[tag];
+
+		rq->cmd_flags = 0;
+		if (blk_mq_tag_busy(hctx)) {
+			rq->cmd_flags = REQ_MQ_INFLIGHT;
+			atomic_inc(&hctx->nr_active);
+		}
+
 		rq->tag = tag;
 		return rq;
 	}
@@ -190,7 +197,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = q;
 	rq->mq_ctx = ctx;
-	rq->cmd_flags = rw_flags;
+	rq->cmd_flags |= rw_flags;
 	rq->cmd_type = 0;
 	/* do not touch atomic flags, it needs atomic ops against the timer */
 	rq->cpu = -1;
@@ -262,7 +269,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 			break;
 		}
 
-		blk_mq_wait_for_tags(hctx->tags, hctx, reserved);
+		blk_mq_wait_for_tags(hctx, reserved);
 	} while (1);
 
 	return rq;
@@ -303,8 +310,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
 
+	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
+		atomic_dec(&hctx->nr_active);
+
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag);
+	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
 	blk_mq_queue_exit(q);
 }
 
@@ -571,8 +581,13 @@ static void blk_mq_rq_timer(unsigned long data)
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
 
-	if (next_set)
-		mod_timer(&q->timeout, round_jiffies_up(next));
+	if (next_set) {
+		next = blk_rq_timeout(round_jiffies_up(next));
+		mod_timer(&q->timeout, next);
+	} else {
+		queue_for_each_hw_ctx(q, hctx, i)
+			blk_mq_tag_idle(hctx);
+	}
 }
 
 /*
@@ -1439,6 +1454,56 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	}
 }
 
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q;
+	bool shared;
+	int i;
+
+	if (set->tag_list.next == set->tag_list.prev)
+		shared = false;
+	else
+		shared = true;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_freeze_queue(q);
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (shared)
+				hctx->flags |= BLK_MQ_F_TAG_SHARED;
+			else
+				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+		}
+		blk_mq_unfreeze_queue(q);
+	}
+}
+
+static void blk_mq_del_queue_tag_set(struct request_queue *q)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	blk_mq_freeze_queue(q);
+
+	mutex_lock(&set->tag_list_lock);
+	list_del_init(&q->tag_set_list);
+	blk_mq_update_tag_set_depth(set);
+	mutex_unlock(&set->tag_list_lock);
+
+	blk_mq_unfreeze_queue(q);
+}
+
+static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
+				     struct request_queue *q)
+{
+	q->tag_set = set;
+
+	mutex_lock(&set->tag_list_lock);
+	list_add_tail(&q->tag_set_list, &set->tag_list);
+	blk_mq_update_tag_set_depth(set);
+	mutex_unlock(&set->tag_list_lock);
+}
+
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
 	struct blk_mq_hw_ctx **hctxs;
@@ -1464,6 +1529,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 		if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
 			goto err_hctxs;
 
+		atomic_set(&hctxs[i]->nr_active, 0);
 		hctxs[i]->numa_node = NUMA_NO_NODE;
 		hctxs[i]->queue_num = i;
 	}
@@ -1516,6 +1582,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	list_add_tail(&q->all_q_node, &all_q_list);
 	mutex_unlock(&all_q_mutex);
 
+	blk_mq_add_queue_tag_set(set, q);
+
 	return q;
 
 err_flush_rq:
@@ -1543,6 +1611,8 @@ void blk_mq_free_queue(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
+	blk_mq_del_queue_tag_set(q);
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
@@ -1635,6 +1705,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 			goto out_unwind;
 	}
 
+	mutex_init(&set->tag_list_lock);
+	INIT_LIST_HEAD(&set->tag_list);
+
 	return 0;
 
 out_unwind:
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 448745683d28..43e8b515806f 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -166,6 +166,17 @@ void blk_abort_request(struct request *req)
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
 
+unsigned long blk_rq_timeout(unsigned long timeout)
+{
+	unsigned long maxt;
+
+	maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
+	if (time_after(timeout, maxt))
+		timeout = maxt;
+
+	return timeout;
+}
+
 /**
  * blk_add_timer - Start timeout timer for a single request
  * @req:	request that is about to start running.
@@ -200,7 +211,7 @@ void blk_add_timer(struct request *req)
 	 * than an existing one, modify the timer. Round up to next nearest
 	 * second.
 	 */
-	expiry = round_jiffies_up(req->deadline);
+	expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
 
 	if (!timer_pending(&q->timeout) ||
 	    time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk.h b/block/blk.h
index 79be2cbce7fd..95cab70000e3 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -9,6 +9,9 @@
 /* Number of requests a "batching" process may submit */
 #define BLK_BATCH_REQ	32
 
+/* Max future timer expiry for timeouts */
+#define BLK_MAX_TIMEOUT		(5 * HZ)
+
 extern struct kmem_cache *blk_requestq_cachep;
 extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
@@ -37,6 +40,7 @@ bool __blk_end_bidi_request(struct request *rq, int error,
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
 			  unsigned int *next_set);
+unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
 void blk_delete_timer(struct request *);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f83d15f6e1c1..379f88d5c44d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -48,6 +48,8 @@ struct blk_mq_hw_ctx {
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */
 
+	atomic_t		nr_active;
+
 	struct blk_mq_cpu_notifier	cpu_notifier;
 	struct kobject		kobj;
 };
@@ -64,6 +66,9 @@ struct blk_mq_tag_set {
 	void			*driver_data;
 
 	struct blk_mq_tags	**tags;
+
+	struct mutex		tag_list_lock;
+	struct list_head	tag_list;
 };
 
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
@@ -126,8 +131,10 @@ enum {
 
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_SHOULD_SORT	= 1 << 1,
+	BLK_MQ_F_TAG_SHARED	= 1 << 2,
 
 	BLK_MQ_S_STOPPED	= 0,
+	BLK_MQ_S_TAG_ACTIVE	= 1,
 
 	BLK_MQ_MAX_DEPTH	= 2048,
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index aa0eaa2d0bd8..d8e4cea23a25 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -190,6 +190,7 @@ enum rq_flag_bits {
 	__REQ_PM,		/* runtime pm request */
 	__REQ_END,		/* last of chain of requests */
 	__REQ_HASHED,		/* on IO scheduler merge hash */
+	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -243,5 +244,6 @@ enum rq_flag_bits {
 #define REQ_PM			(1ULL << __REQ_PM)
 #define REQ_END			(1ULL << __REQ_END)
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
+#define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 94b27210641b..6bc011a09e82 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -481,6 +481,9 @@ struct request_queue {
 	wait_queue_head_t	mq_freeze_wq;
 	struct percpu_counter	mq_usage_counter;
 	struct list_head	all_q_node;
+
+	struct blk_mq_tag_set	*tag_set;
+	struct list_head	tag_set_list;
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
-- 
cgit v1.2.3


From 1429d7c9467e1e3de0b0ff91d7e4d67c1a92f8a3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 19 May 2014 09:23:55 -0600
Subject: blk-mq: switch ctx pending map to the sparser blk_align_bitmap

Each hardware queue has a bitmap of software queues with pending
requests. When new IO is queued on a software queue, the bit is
set, and when IO is pruned on a hardware queue run, the bit is
cleared. This causes a lot of traffic. Switch this from the regular
BITS_PER_LONG bitmap to a sparser layout, similarly to what was
done for blk-mq tagging.

20% performance increase was observed for single threaded IO, and
about 15% performanc increase on multiple threads driving the
same device.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 119 +++++++++++++++++++++++++++++++++++++------------
 include/linux/blk-mq.h |  10 ++++-
 2 files changed, 99 insertions(+), 30 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 526feee31bff..e862c4408427 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -56,21 +56,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
 	unsigned int i;
 
-	for (i = 0; i < hctx->nr_ctx_map; i++)
-		if (hctx->ctx_map[i])
+	for (i = 0; i < hctx->ctx_map.map_size; i++)
+		if (hctx->ctx_map.map[i].word)
 			return true;
 
 	return false;
 }
 
+static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
+					      struct blk_mq_ctx *ctx)
+{
+	return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
+}
+
+#define CTX_TO_BIT(hctx, ctx)	\
+	((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
+
 /*
  * Mark this ctx as having pending work in this hardware queue
  */
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
 				     struct blk_mq_ctx *ctx)
 {
-	if (!test_bit(ctx->index_hw, hctx->ctx_map))
-		set_bit(ctx->index_hw, hctx->ctx_map);
+	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
+
+	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
+		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+}
+
+static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
+				      struct blk_mq_ctx *ctx)
+{
+	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
+
+	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
@@ -614,6 +633,40 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 	return false;
 }
 
+/*
+ * Process software queues that have been marked busy, splicing them
+ * to the for-dispatch
+ */
+static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+{
+	struct blk_mq_ctx *ctx;
+	int i;
+
+	for (i = 0; i < hctx->ctx_map.map_size; i++) {
+		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
+		unsigned int off, bit;
+
+		if (!bm->word)
+			continue;
+
+		bit = 0;
+		off = i * hctx->ctx_map.bits_per_word;
+		do {
+			bit = find_next_bit(&bm->word, bm->depth, bit);
+			if (bit >= bm->depth)
+				break;
+
+			ctx = hctx->ctxs[bit + off];
+			clear_bit(bit, &bm->word);
+			spin_lock(&ctx->lock);
+			list_splice_tail_init(&ctx->rq_list, list);
+			spin_unlock(&ctx->lock);
+
+			bit++;
+		} while (1);
+	}
+}
+
 /*
  * Run this hardware queue, pulling any software queues mapped to it in.
  * Note that this function currently has various problems around ordering
@@ -623,10 +676,9 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
-	struct blk_mq_ctx *ctx;
 	struct request *rq;
 	LIST_HEAD(rq_list);
-	int bit, queued;
+	int queued;
 
 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
 
@@ -638,14 +690,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	/*
 	 * Touch any software queue that has pending entries.
 	 */
-	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
-		clear_bit(bit, hctx->ctx_map);
-		ctx = hctx->ctxs[bit];
-
-		spin_lock(&ctx->lock);
-		list_splice_tail_init(&ctx->rq_list, &rq_list);
-		spin_unlock(&ctx->lock);
-	}
+	flush_busy_ctxs(hctx, &rq_list);
 
 	/*
 	 * If we have previous entries on our dispatch list, grab them
@@ -658,14 +703,10 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		spin_unlock(&hctx->lock);
 	}
 
-	/*
-	 * Delete and return all entries from our dispatch list
-	 */
-	queued = 0;
-
 	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
+	queued = 0;
 	while (!list_empty(&rq_list)) {
 		int ret;
 
@@ -1158,7 +1199,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	spin_lock(&ctx->lock);
 	if (!list_empty(&ctx->rq_list)) {
 		list_splice_init(&ctx->rq_list, &tmp);
-		clear_bit(ctx->index_hw, hctx->ctx_map);
+		blk_mq_hctx_clear_pending(hctx, ctx);
 	}
 	spin_unlock(&ctx->lock);
 
@@ -1298,6 +1339,34 @@ fail:
 	return NULL;
 }
 
+static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
+{
+	kfree(bitmap->map);
+}
+
+static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
+{
+	unsigned int bpw = 8, total, num_maps, i;
+
+	bitmap->bits_per_word = bpw;
+
+	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
+	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
+					GFP_KERNEL, node);
+	if (!bitmap->map)
+		return -ENOMEM;
+
+	bitmap->map_size = num_maps;
+
+	total = nr_cpu_ids;
+	for (i = 0; i < num_maps; i++) {
+		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
+		total -= bitmap->map[i].depth;
+	}
+
+	return 0;
+}
+
 static int blk_mq_init_hw_queues(struct request_queue *q,
 		struct blk_mq_tag_set *set)
 {
@@ -1308,7 +1377,6 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 	 * Initialize hardware queues
 	 */
 	queue_for_each_hw_ctx(q, hctx, i) {
-		unsigned int num_maps;
 		int node;
 
 		node = hctx->numa_node;
@@ -1339,13 +1407,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		if (!hctx->ctxs)
 			break;
 
-		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
-		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
-						GFP_KERNEL, node);
-		if (!hctx->ctx_map)
+		if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
 			break;
 
-		hctx->nr_ctx_map = num_maps;
 		hctx->nr_ctx = 0;
 
 		if (set->ops->init_hctx &&
@@ -1368,7 +1432,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		kfree(hctx->ctxs);
-		kfree(hctx->ctx_map);
+		blk_mq_free_bitmap(&hctx->ctx_map);
 	}
 
 	return 1;
@@ -1542,7 +1606,6 @@ void blk_mq_free_queue(struct request_queue *q)
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f83d15f6e1c1..952e558ee598 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -11,6 +11,12 @@ struct blk_mq_cpu_notifier {
 	void (*notify)(void *data, unsigned long action, unsigned int cpu);
 };
 
+struct blk_mq_ctxmap {
+	unsigned int map_size;
+	unsigned int bits_per_word;
+	struct blk_align_bitmap *map;
+};
+
 struct blk_mq_hw_ctx {
 	struct {
 		spinlock_t		lock;
@@ -31,8 +37,8 @@ struct blk_mq_hw_ctx {
 
 	void			*driver_data;
 
-	unsigned int 		nr_ctx_map;
-	unsigned long		*ctx_map;
+	struct blk_mq_ctxmap	ctx_map;
+
 	unsigned int		nr_ctx;
 	struct blk_mq_ctx	**ctxs;
 
-- 
cgit v1.2.3


From e3a2b3f931f59d5284abd13faf8bded726884ffd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 20 May 2014 11:49:02 -0600
Subject: blk-mq: allow changing of queue depth through sysfs

For request_fn based devices, the block layer exports a 'nr_requests'
file through sysfs to allow adjusting of queue depth on the fly.
Currently this returns -EINVAL for blk-mq, since it's not wired up.
Wire this up for blk-mq, so that it now also always dynamic
adjustments of the allowed queue depth for any given block device
managed by blk-mq.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 41 ++++++++++++++++++++++++++
 block/blk-mq-tag.c     | 80 +++++++++++++++++++++++++++++++++++---------------
 block/blk-mq-tag.h     |  1 +
 block/blk-mq.c         | 22 ++++++++++++++
 block/blk-mq.h         |  1 +
 block/blk-sysfs.c      | 45 ++++++----------------------
 block/blk.h            |  2 ++
 include/linux/blk-mq.h |  2 +-
 8 files changed, 134 insertions(+), 60 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index a6bd3e702201..fe81e19099a1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -848,6 +848,47 @@ static void freed_request(struct request_list *rl, unsigned int flags)
 		__freed_request(rl, sync ^ 1);
 }
 
+int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
+{
+	struct request_list *rl;
+
+	spin_lock_irq(q->queue_lock);
+	q->nr_requests = nr;
+	blk_queue_congestion_threshold(q);
+
+	/* congestion isn't cgroup aware and follows root blkcg for now */
+	rl = &q->root_rl;
+
+	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
+		blk_set_queue_congested(q, BLK_RW_SYNC);
+	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
+		blk_clear_queue_congested(q, BLK_RW_SYNC);
+
+	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
+		blk_set_queue_congested(q, BLK_RW_ASYNC);
+	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
+		blk_clear_queue_congested(q, BLK_RW_ASYNC);
+
+	blk_queue_for_each_rl(rl, q) {
+		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+			blk_set_rl_full(rl, BLK_RW_SYNC);
+		} else {
+			blk_clear_rl_full(rl, BLK_RW_SYNC);
+			wake_up(&rl->wait[BLK_RW_SYNC]);
+		}
+
+		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+			blk_set_rl_full(rl, BLK_RW_ASYNC);
+		} else {
+			blk_clear_rl_full(rl, BLK_RW_ASYNC);
+			wake_up(&rl->wait[BLK_RW_ASYNC]);
+		}
+	}
+
+	spin_unlock_irq(q->queue_lock);
+	return 0;
+}
+
 /*
  * Determine if elevator data should be initialized when allocating the
  * request associated with @bio.
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e6b3fbae9862..f6dea968b710 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -57,23 +57,13 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 }
 
 /*
- * If a previously busy queue goes inactive, potential waiters could now
- * be allowed to queue. Wake them up and check.
+ * Wakeup all potentially sleeping on normal (non-reserved) tags
  */
-void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
 {
-	struct blk_mq_tags *tags = hctx->tags;
 	struct blk_mq_bitmap_tags *bt;
 	int i, wake_index;
 
-	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
-		return;
-
-	atomic_dec(&tags->active_queues);
-
-	/*
-	 * Will only throttle depth on non-reserved tags
-	 */
 	bt = &tags->bitmap_tags;
 	wake_index = bt->wake_index;
 	for (i = 0; i < BT_WAIT_QUEUES; i++) {
@@ -86,6 +76,22 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	}
 }
 
+/*
+ * If a previously busy queue goes inactive, potential waiters could now
+ * be allowed to queue. Wake them up and check.
+ */
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+
+	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return;
+
+	atomic_dec(&tags->active_queues);
+
+	blk_mq_tag_wakeup_all(tags);
+}
+
 /*
  * For shared tag users, we track the number of currently active users
  * and attempt to provide a fair share of the tag depth for each of them.
@@ -408,6 +414,28 @@ static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
 	return bt->depth - used;
 }
 
+static void bt_update_count(struct blk_mq_bitmap_tags *bt,
+			    unsigned int depth)
+{
+	unsigned int tags_per_word = 1U << bt->bits_per_word;
+	unsigned int map_depth = depth;
+
+	if (depth) {
+		int i;
+
+		for (i = 0; i < bt->map_nr; i++) {
+			bt->map[i].depth = min(map_depth, tags_per_word);
+			map_depth -= bt->map[i].depth;
+		}
+	}
+
+	bt->wake_cnt = BT_WAIT_BATCH;
+	if (bt->wake_cnt > depth / 4)
+		bt->wake_cnt = max(1U, depth / 4);
+
+	bt->depth = depth;
+}
+
 static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
 			int node, bool reserved)
 {
@@ -420,7 +448,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
 	 * condition.
 	 */
 	if (depth) {
-		unsigned int nr, i, map_depth, tags_per_word;
+		unsigned int nr, tags_per_word;
 
 		tags_per_word = (1 << bt->bits_per_word);
 
@@ -444,11 +472,6 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
 			return -ENOMEM;
 
 		bt->map_nr = nr;
-		map_depth = depth;
-		for (i = 0; i < nr; i++) {
-			bt->map[i].depth = min(map_depth, tags_per_word);
-			map_depth -= tags_per_word;
-		}
 	}
 
 	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
@@ -460,11 +483,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
 	for (i = 0; i < BT_WAIT_QUEUES; i++)
 		init_waitqueue_head(&bt->bs[i].wait);
 
-	bt->wake_cnt = BT_WAIT_BATCH;
-	if (bt->wake_cnt > depth / 4)
-		bt->wake_cnt = max(1U, depth / 4);
-
-	bt->depth = depth;
+	bt_update_count(bt, depth);
 	return 0;
 }
 
@@ -525,6 +544,21 @@ void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
 	*tag = prandom_u32() % depth;
 }
 
+int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
+{
+	tdepth -= tags->nr_reserved_tags;
+	if (tdepth > tags->nr_tags)
+		return -EINVAL;
+
+	/*
+	 * Don't need (or can't) update reserved tags here, they remain
+	 * static and should never need resizing.
+	 */
+	bt_update_count(&tags->bitmap_tags, tdepth);
+	blk_mq_tag_wakeup_all(tags);
+	return 0;
+}
+
 ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 {
 	char *orig_page = page;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index e144f68ec45f..e7ff5ceeeb97 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -55,6 +55,7 @@ extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
 extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
+extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
 
 enum {
 	BLK_MQ_TAG_CACHE_MIN	= 1,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0fbef7e9bef1..7b71ab1b1536 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1789,6 +1789,28 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_free_tag_set);
 
+int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+	struct blk_mq_hw_ctx *hctx;
+	int i, ret;
+
+	if (!set || nr > set->queue_depth)
+		return -EINVAL;
+
+	ret = 0;
+	queue_for_each_hw_ctx(q, hctx, i) {
+		ret = blk_mq_tag_update_depth(hctx->tags, nr);
+		if (ret)
+			break;
+	}
+
+	if (!ret)
+		q->nr_requests = nr;
+
+	return ret;
+}
+
 void blk_mq_disable_hotplug(void)
 {
 	mutex_lock(&all_q_mutex);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 5e5a378962b7..7db4fe4bd002 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -32,6 +32,7 @@ void blk_mq_drain_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 void blk_mq_clone_flush_request(struct request *flush_rq,
 		struct request *orig_rq);
+int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 
 /*
  * CPU hotplug helpers
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7500f876dae4..4d6811ac13fd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
 static ssize_t
 queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
-	struct request_list *rl;
 	unsigned long nr;
-	int ret;
+	int ret, err;
 
-	if (!q->request_fn)
+	if (!q->request_fn && !q->mq_ops)
 		return -EINVAL;
 
 	ret = queue_var_store(&nr, page, count);
@@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 	if (nr < BLKDEV_MIN_RQ)
 		nr = BLKDEV_MIN_RQ;
 
-	spin_lock_irq(q->queue_lock);
-	q->nr_requests = nr;
-	blk_queue_congestion_threshold(q);
-
-	/* congestion isn't cgroup aware and follows root blkcg for now */
-	rl = &q->root_rl;
-
-	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_SYNC);
-	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_SYNC);
-
-	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_ASYNC);
-	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_ASYNC);
-
-	blk_queue_for_each_rl(rl, q) {
-		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
-			blk_set_rl_full(rl, BLK_RW_SYNC);
-		} else {
-			blk_clear_rl_full(rl, BLK_RW_SYNC);
-			wake_up(&rl->wait[BLK_RW_SYNC]);
-		}
-
-		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
-			blk_set_rl_full(rl, BLK_RW_ASYNC);
-		} else {
-			blk_clear_rl_full(rl, BLK_RW_ASYNC);
-			wake_up(&rl->wait[BLK_RW_ASYNC]);
-		}
-	}
+	if (q->request_fn)
+		err = blk_update_nr_requests(q, nr);
+	else
+		err = blk_mq_update_nr_requests(q, nr);
+
+	if (err)
+		return err;
 
-	spin_unlock_irq(q->queue_lock);
 	return ret;
 }
 
diff --git a/block/blk.h b/block/blk.h
index 95cab70000e3..45385e9abf6f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -188,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 	return q->nr_congestion_off;
 }
 
+extern int blk_update_nr_requests(struct request_queue *, unsigned int);
+
 /*
  * Contribute to IO statistics IFF:
  *
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a06ca7b5ea05..f45424453338 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -63,7 +63,7 @@ struct blk_mq_hw_ctx {
 struct blk_mq_tag_set {
 	struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;
-	unsigned int		queue_depth;
+	unsigned int		queue_depth;	/* max hw supported */
 	unsigned int		reserved_tags;
 	unsigned int		cmd_size;	/* per-request extra data */
 	int			numa_node;
-- 
cgit v1.2.3


From e814e71ba4a6e1d7509b0f4b1928365ea650cace Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 21 May 2014 13:59:08 -0600
Subject: blk-mq: allow the hctx cpu hotplug notifier to return errors

Prepare this for the next patch which adds more smarts in the
plugging logic, so that we can save some memory.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-cpu.c     | 12 ++++++++----
 block/blk-mq.c         |  9 +++++----
 block/blk-mq.h         |  2 +-
 include/linux/blk-mq.h |  2 +-
 4 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
index 136ef8643bba..d2c253f71b86 100644
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@@ -18,14 +18,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
 {
 	unsigned int cpu = (unsigned long) hcpu;
 	struct blk_mq_cpu_notifier *notify;
+	int ret = NOTIFY_OK;
 
 	raw_spin_lock(&blk_mq_cpu_notify_lock);
 
-	list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
-		notify->notify(notify->data, action, cpu);
+	list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
+		ret = notify->notify(notify->data, action, cpu);
+		if (ret != NOTIFY_OK)
+			break;
+	}
 
 	raw_spin_unlock(&blk_mq_cpu_notify_lock);
-	return NOTIFY_OK;
+	return ret;
 }
 
 void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
@@ -45,7 +49,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
 }
 
 void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
-			      void (*fn)(void *, unsigned long, unsigned int),
+			      int (*fn)(void *, unsigned long, unsigned int),
 			      void *data)
 {
 	notifier->notify = fn;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ef7ed5e95d6d..5a3683fc5bdb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1196,8 +1196,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
 }
 EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
 
-static void blk_mq_hctx_notify(void *data, unsigned long action,
-			       unsigned int cpu)
+static int blk_mq_hctx_notify(void *data, unsigned long action,
+			      unsigned int cpu)
 {
 	struct blk_mq_hw_ctx *hctx = data;
 	struct request_queue *q = hctx->queue;
@@ -1205,7 +1205,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	LIST_HEAD(tmp);
 
 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
-		return;
+		return NOTIFY_OK;
 
 	/*
 	 * Move ctx entries to new CPU, if this one is going away.
@@ -1220,7 +1220,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	spin_unlock(&ctx->lock);
 
 	if (list_empty(&tmp))
-		return;
+		return NOTIFY_OK;
 
 	ctx = blk_mq_get_ctx(q);
 	spin_lock(&ctx->lock);
@@ -1240,6 +1240,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 
 	blk_mq_run_hw_queue(hctx, true);
 	blk_mq_put_ctx(ctx);
+	return NOTIFY_OK;
 }
 
 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7db4fe4bd002..491dbd4e93f5 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -39,7 +39,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
  */
 struct blk_mq_cpu_notifier;
 void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
-			      void (*fn)(void *, unsigned long, unsigned int),
+			      int (*fn)(void *, unsigned long, unsigned int),
 			      void *data);
 void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
 void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f45424453338..4d2800567aad 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -8,7 +8,7 @@ struct blk_mq_tags;
 struct blk_mq_cpu_notifier {
 	struct list_head list;
 	void *data;
-	void (*notify)(void *data, unsigned long action, unsigned int cpu);
+	int (*notify)(void *data, unsigned long action, unsigned int cpu);
 };
 
 struct blk_mq_ctxmap {
-- 
cgit v1.2.3


From edf866b3805c5651bf7d035b72dc0190cb6ff4a7 Mon Sep 17 00:00:00 2001
From: Sam Bradshaw <sbradshaw@micron.com>
Date: Fri, 23 May 2014 13:30:16 -0600
Subject: blk-mq: export blk_mq_tag_busy_iter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export the blk-mq in-flight tag iterator for driver consumption.
This is particularly useful in exception paths or SRSI where
in-flight IOs need to be cancelled and/or reissued. The NVMe driver
conversion will use this.

Signed-off-by: Sam Bradshaw <sbradshaw@micron.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c     | 1 +
 block/blk-mq-tag.h     | 1 -
 include/linux/blk-mq.h | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index f6dea968b710..05e2baf4fa0d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -400,6 +400,7 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
 	fn(data, tag_map);
 	kfree(tag_map);
 }
+EXPORT_SYMBOL(blk_mq_tag_busy_iter);
 
 static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
 {
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index e7ff5ceeeb97..2e5e6872d089 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -51,7 +51,6 @@ extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
 extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved);
 extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
-extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
 extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4d2800567aad..f76bb18350af 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -181,6 +181,7 @@ void blk_mq_stop_hw_queues(struct request_queue *q);
 void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
+void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
 
 /*
  * Driver command data is immediately after the request. So subtract request
-- 
cgit v1.2.3


From f14bbe77a96bb979dc539d8308ee18a9363a544f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 27 May 2014 12:06:53 -0600
Subject: blk-mq: pass in suggested NUMA node to ->alloc_hctx()

Drivers currently have to figure this out on their own, and they
are missing information to do it properly. The ones that did
attempt to do it, do it wrong.

So just pass in the suggested node directly to the alloc
function.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-cpumap.c    | 16 ++++++++++++++++
 block/blk-mq.c           | 26 +++++++++++++++-----------
 block/blk-mq.h           |  1 +
 drivers/block/null_blk.c | 35 +++--------------------------------
 include/linux/blk-mq.h   |  4 ++--
 5 files changed, 37 insertions(+), 45 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 5d0f93cf358c..0daacb927be1 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -96,3 +96,19 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
 	kfree(map);
 	return NULL;
 }
+
+/*
+ * We have no quick way of doing reverse lookups. This is only used at
+ * queue init time, so runtime isn't important.
+ */
+int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		if (index == mq_map[i])
+			return cpu_to_node(i);
+	}
+
+	return NUMA_NO_NODE;
+}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e8b5f74dc1a1..30bad930e661 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1297,10 +1297,10 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
 EXPORT_SYMBOL(blk_mq_map_queue);
 
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
-						   unsigned int hctx_index)
+						   unsigned int hctx_index,
+						   int node)
 {
-	return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL,
-				set->numa_node);
+	return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node);
 }
 EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
 
@@ -1752,6 +1752,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	struct blk_mq_hw_ctx **hctxs;
 	struct blk_mq_ctx *ctx;
 	struct request_queue *q;
+	unsigned int *map;
 	int i;
 
 	ctx = alloc_percpu(struct blk_mq_ctx);
@@ -1764,8 +1765,14 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (!hctxs)
 		goto err_percpu;
 
+	map = blk_mq_make_queue_map(set);
+	if (!map)
+		goto err_map;
+
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		hctxs[i] = set->ops->alloc_hctx(set, i);
+		int node = blk_mq_hw_queue_to_node(map, i);
+
+		hctxs[i] = set->ops->alloc_hctx(set, i, node);
 		if (!hctxs[i])
 			goto err_hctxs;
 
@@ -1773,7 +1780,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 			goto err_hctxs;
 
 		atomic_set(&hctxs[i]->nr_active, 0);
-		hctxs[i]->numa_node = NUMA_NO_NODE;
+		hctxs[i]->numa_node = node;
 		hctxs[i]->queue_num = i;
 	}
 
@@ -1784,15 +1791,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (percpu_counter_init(&q->mq_usage_counter, 0))
 		goto err_map;
 
-	q->mq_map = blk_mq_make_queue_map(set);
-	if (!q->mq_map)
-		goto err_map;
-
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
 	blk_queue_rq_timeout(q, 30000);
 
 	q->nr_queues = nr_cpu_ids;
 	q->nr_hw_queues = set->nr_hw_queues;
+	q->mq_map = map;
 
 	q->queue_ctx = ctx;
 	q->queue_hw_ctx = hctxs;
@@ -1844,16 +1848,16 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 err_flush_rq:
 	kfree(q->flush_rq);
 err_hw:
-	kfree(q->mq_map);
-err_map:
 	blk_cleanup_queue(q);
 err_hctxs:
+	kfree(map);
 	for (i = 0; i < set->nr_hw_queues; i++) {
 		if (!hctxs[i])
 			break;
 		free_cpumask_var(hctxs[i]->cpumask);
 		set->ops->free_hctx(hctxs[i], i);
 	}
+err_map:
 	kfree(hctxs);
 err_percpu:
 	free_percpu(ctx);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 491dbd4e93f5..ff5e6bf0f691 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -52,6 +52,7 @@ void blk_mq_disable_hotplug(void);
  */
 extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
 extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
+extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
 
 /*
  * Basic implementation of sparser bitmap, allowing the user to spread
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 8e7e3a0b0d24..4d33c8c25fbf 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -322,39 +322,10 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 }
 
 static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_tag_set *set,
-		unsigned int hctx_index)
+					     unsigned int hctx_index,
+					     int node)
 {
-	int b_size = DIV_ROUND_UP(set->nr_hw_queues, nr_online_nodes);
-	int tip = (set->nr_hw_queues % nr_online_nodes);
-	int node = 0, i, n;
-
-	/*
-	 * Split submit queues evenly wrt to the number of nodes. If uneven,
-	 * fill the first buckets with one extra, until the rest is filled with
-	 * no extra.
-	 */
-	for (i = 0, n = 1; i < hctx_index; i++, n++) {
-		if (n % b_size == 0) {
-			n = 0;
-			node++;
-
-			tip--;
-			if (!tip)
-				b_size = set->nr_hw_queues / nr_online_nodes;
-		}
-	}
-
-	/*
-	 * A node might not be online, therefore map the relative node id to the
-	 * real node id.
-	 */
-	for_each_online_node(n) {
-		if (!node)
-			break;
-		node--;
-	}
-
-	return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n);
+	return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node);
 }
 
 static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f76bb18350af..afeb93496907 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -80,7 +80,7 @@ struct blk_mq_tag_set {
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
 typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_tag_set *,
-		unsigned int);
+		unsigned int, int);
 typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
@@ -165,7 +165,7 @@ struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, g
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int);
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
 void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 
 void blk_mq_end_io(struct request *rq, int error);
-- 
cgit v1.2.3


From 95f096849932fe5eaa7bfec887530cf556744a76 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 27 May 2014 17:46:48 -0600
Subject: blk-mq: allow non-softirq completions

Right now we export two ways of completing a request:

1) blk_mq_complete_request(). This uses an IPI (if needed) and
   completes through q->softirq_done_fn(). It also works with
   timeouts.

2) blk_mq_end_io(). This completes inline, and ignores any timeout
   state of the request.

Let blk_mq_complete_request() handle non-softirq_done_fn completions
as well, by just completing inline. If a driver has enough completion
ports to place completions correctly, it need not define a
mq_ops->complete() and we can avoid an indirect function call by
doing the completion inline.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 12 +++++++++---
 include/linux/blk-mq.h |  4 ++++
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 30bad930e661..010b878d53b3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -434,10 +434,16 @@ void __blk_mq_complete_request(struct request *rq)
  **/
 void blk_mq_complete_request(struct request *rq)
 {
-	if (unlikely(blk_should_fake_timeout(rq->q)))
+	struct request_queue *q = rq->q;
+
+	if (unlikely(blk_should_fake_timeout(q)))
 		return;
-	if (!blk_mark_rq_complete(rq))
-		__blk_mq_complete_request(rq);
+	if (!blk_mark_rq_complete(rq)) {
+		if (q->softirq_done_fn)
+			__blk_mq_complete_request(rq);
+		else
+			blk_mq_end_io(rq, rq->errors);
+	}
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index afeb93496907..1dfeb1529a61 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -173,6 +173,10 @@ void __blk_mq_end_io(struct request *rq, int error);
 
 void blk_mq_requeue_request(struct request *rq);
 
+/*
+ * Complete request through potential IPI for right placement. Driver must
+ * have defined a mq_ops->complete() hook for this.
+ */
 void blk_mq_complete_request(struct request *rq);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
-- 
cgit v1.2.3


From 7738dac4f697ffbd0ed4c4aeb69a714ef9d876da Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 28 May 2014 08:06:34 -0600
Subject: blk-mq: remove stale comment for blk_mq_complete_request()

It works for both IPI and local completions as of commit
95f096849932.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-mq.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1dfeb1529a61..5b171fbe95c5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -172,11 +172,6 @@ void blk_mq_end_io(struct request *rq, int error);
 void __blk_mq_end_io(struct request *rq, int error);
 
 void blk_mq_requeue_request(struct request *rq);
-
-/*
- * Complete request through potential IPI for right placement. Driver must
- * have defined a mq_ops->complete() hook for this.
- */
 void blk_mq_complete_request(struct request *rq);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
-- 
cgit v1.2.3


From 6fca6a611c27f1f0d90fbe1cc3c229dbf8c09e48 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 28 May 2014 08:08:02 -0600
Subject: blk-mq: add helper to insert requests from irq context

Both the cache flush state machine and the SCSI midlayer want to submit
requests from irq context, and the current per-request requeue_work
unfortunately causes corruption due to sharing with the csd field for
flushes.  Replace them with a per-request_queue list of requests to
be requeued.

Based on an earlier test by Ming Lei.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Ming Lei <tom.leiming@gmail.com>
Tested-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-flush.c      | 16 ++++---------
 block/blk-mq.c         | 64 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blk-mq.h |  2 ++
 include/linux/blkdev.h |  5 +++-
 4 files changed, 73 insertions(+), 14 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index ec7a224d6733..ef608b35d9be 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq)
 	blk_clear_rq_complete(rq);
 }
 
-static void mq_flush_run(struct work_struct *work)
-{
-	struct request *rq;
-
-	rq = container_of(work, struct request, requeue_work);
-
-	memset(&rq->csd, 0, sizeof(rq->csd));
-	blk_mq_insert_request(rq, false, true, false);
-}
-
 static bool blk_flush_queue_rq(struct request *rq, bool add_front)
 {
 	if (rq->q->mq_ops) {
-		INIT_WORK(&rq->requeue_work, mq_flush_run);
-		kblockd_schedule_work(&rq->requeue_work);
+		struct request_queue *q = rq->q;
+
+		blk_mq_add_to_requeue_list(rq, add_front);
+		blk_mq_kick_requeue_list(q);
 		return false;
 	} else {
 		if (add_front)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 010b878d53b3..67066ecc79c0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -516,10 +516,68 @@ void blk_mq_requeue_request(struct request *rq)
 	blk_clear_rq_complete(rq);
 
 	BUG_ON(blk_queued_rq(rq));
-	blk_mq_insert_request(rq, true, true, false);
+	blk_mq_add_to_requeue_list(rq, true);
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
 
+static void blk_mq_requeue_work(struct work_struct *work)
+{
+	struct request_queue *q =
+		container_of(work, struct request_queue, requeue_work);
+	LIST_HEAD(rq_list);
+	struct request *rq, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->requeue_lock, flags);
+	list_splice_init(&q->requeue_list, &rq_list);
+	spin_unlock_irqrestore(&q->requeue_lock, flags);
+
+	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
+		if (!(rq->cmd_flags & REQ_SOFTBARRIER))
+			continue;
+
+		rq->cmd_flags &= ~REQ_SOFTBARRIER;
+		list_del_init(&rq->queuelist);
+		blk_mq_insert_request(rq, true, false, false);
+	}
+
+	while (!list_empty(&rq_list)) {
+		rq = list_entry(rq_list.next, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		blk_mq_insert_request(rq, false, false, false);
+	}
+
+	blk_mq_run_queues(q, false);
+}
+
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
+{
+	struct request_queue *q = rq->q;
+	unsigned long flags;
+
+	/*
+	 * We abuse this flag that is otherwise used by the I/O scheduler to
+	 * request head insertation from the workqueue.
+	 */
+	BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
+
+	spin_lock_irqsave(&q->requeue_lock, flags);
+	if (at_head) {
+		rq->cmd_flags |= REQ_SOFTBARRIER;
+		list_add(&rq->queuelist, &q->requeue_list);
+	} else {
+		list_add_tail(&rq->queuelist, &q->requeue_list);
+	}
+	spin_unlock_irqrestore(&q->requeue_lock, flags);
+}
+EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
+
+void blk_mq_kick_requeue_list(struct request_queue *q)
+{
+	kblockd_schedule_work(&q->requeue_work);
+}
+EXPORT_SYMBOL(blk_mq_kick_requeue_list);
+
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	return tags->rqs[tag];
@@ -1812,6 +1870,10 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
 	q->sg_reserved_size = INT_MAX;
 
+	INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
+	INIT_LIST_HEAD(&q->requeue_list);
+	spin_lock_init(&q->requeue_lock);
+
 	if (q->nr_hw_queues > 1)
 		blk_queue_make_request(q, blk_mq_make_request);
 	else
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5b171fbe95c5..b9a74a386dbc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -172,6 +172,8 @@ void blk_mq_end_io(struct request *rq, int error);
 void __blk_mq_end_io(struct request *rq, int error);
 
 void blk_mq_requeue_request(struct request *rq);
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
+void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_complete_request(struct request *rq);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6bc011a09e82..913f1c2d3be0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -99,7 +99,6 @@ struct request {
 	struct list_head queuelist;
 	union {
 		struct call_single_data csd;
-		struct work_struct requeue_work;
 		unsigned long fifo_time;
 	};
 
@@ -463,6 +462,10 @@ struct request_queue {
 	struct request		*flush_rq;
 	spinlock_t		mq_flush_lock;
 
+	struct list_head	requeue_list;
+	spinlock_t		requeue_lock;
+	struct work_struct	requeue_work;
+
 	struct mutex		sysfs_lock;
 
 	int			bypass_depth;
-- 
cgit v1.2.3


From 4ce01dd1a07d9cf3eaf44fbf4ea9a61b11badccc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 27 May 2014 20:59:46 +0200
Subject: blk-mq: merge blk_mq_alloc_reserved_request into blk_mq_alloc_request

Instead of having two almost identical copies of the same code just let
the callers pass in the reserved flag directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  2 +-
 block/blk-mq.c         | 20 +++-----------------
 include/linux/blk-mq.h |  4 ++--
 3 files changed, 6 insertions(+), 20 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index 29d5fbafd94a..d87be5b4e554 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1173,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
 	if (q->mq_ops)
-		return blk_mq_alloc_request(q, rw, gfp_mask);
+		return blk_mq_alloc_request(q, rw, gfp_mask, false);
 	else
 		return blk_old_get_request(q, rw, gfp_mask);
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 67066ecc79c0..63d581d72a70 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -294,35 +294,21 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 	return rq;
 }
 
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
+		bool reserved)
 {
 	struct request *rq;
 
 	if (blk_mq_queue_enter(q))
 		return NULL;
 
-	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
+	rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
 	if (rq)
 		blk_mq_put_ctx(rq->mq_ctx);
 	return rq;
 }
 EXPORT_SYMBOL(blk_mq_alloc_request);
 
-struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
-					      gfp_t gfp)
-{
-	struct request *rq;
-
-	if (blk_mq_queue_enter(q))
-		return NULL;
-
-	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
-	if (rq)
-		blk_mq_put_ctx(rq->mq_ctx);
-	return rq;
-}
-EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
-
 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 				  struct blk_mq_ctx *ctx, struct request *rq)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b9a74a386dbc..2bd82f399128 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -160,8 +160,8 @@ void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
-struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+		gfp_t gfp, bool reserved);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
-- 
cgit v1.2.3


From cdef54dd85ad66e77262ea57796a3e81683dd5d6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 28 May 2014 18:11:06 +0200
Subject: blk-mq: remove alloc_hctx and free_hctx methods

There is no need for drivers to control hardware context allocation
now that we do the context to node mapping in common code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c             | 26 +++++---------------------
 drivers/block/null_blk.c   | 28 +---------------------------
 drivers/block/virtio_blk.c |  2 --
 include/linux/blk-mq.h     | 10 ----------
 4 files changed, 6 insertions(+), 60 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5cc4b871cb11..f27fe44230c2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1335,21 +1335,6 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
 }
 EXPORT_SYMBOL(blk_mq_map_queue);
 
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
-						   unsigned int hctx_index,
-						   int node)
-{
-	return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node);
-}
-EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
-
-void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
-				 unsigned int hctx_index)
-{
-	kfree(hctx);
-}
-EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
-
 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
 		struct blk_mq_tags *tags, unsigned int hctx_idx)
 {
@@ -1590,7 +1575,7 @@ static void blk_mq_free_hw_queues(struct request_queue *q,
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		free_cpumask_var(hctx->cpumask);
-		set->ops->free_hctx(hctx, i);
+		kfree(hctx);
 	}
 }
 
@@ -1811,7 +1796,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	for (i = 0; i < set->nr_hw_queues; i++) {
 		int node = blk_mq_hw_queue_to_node(map, i);
 
-		hctxs[i] = set->ops->alloc_hctx(set, i, node);
+		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
+					GFP_KERNEL, node);
 		if (!hctxs[i])
 			goto err_hctxs;
 
@@ -1898,7 +1884,7 @@ err_hctxs:
 		if (!hctxs[i])
 			break;
 		free_cpumask_var(hctxs[i]->cpumask);
-		set->ops->free_hctx(hctxs[i], i);
+		kfree(hctxs[i]);
 	}
 err_map:
 	kfree(hctxs);
@@ -1983,9 +1969,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
 		return -EINVAL;
 
-	if (!set->nr_hw_queues ||
-	    !set->ops->queue_rq || !set->ops->map_queue ||
-	    !set->ops->alloc_hctx || !set->ops->free_hctx)
+	if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
 		return -EINVAL;
 
 
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 4d33c8c25fbf..b40af63a5476 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -321,18 +321,6 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_tag_set *set,
-					     unsigned int hctx_index,
-					     int node)
-{
-	return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node);
-}
-
-static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
-{
-	kfree(hctx);
-}
-
 static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
 {
 	BUG_ON(!nullb);
@@ -360,17 +348,6 @@ static struct blk_mq_ops null_mq_ops = {
 	.map_queue      = blk_mq_map_queue,
 	.init_hctx	= null_init_hctx,
 	.complete	= null_softirq_done_fn,
-	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
-	.free_hctx	= blk_mq_free_single_hw_queue,
-};
-
-static struct blk_mq_ops null_mq_ops_pernode = {
-	.queue_rq       = null_queue_rq,
-	.map_queue      = blk_mq_map_queue,
-	.init_hctx	= null_init_hctx,
-	.complete	= null_softirq_done_fn,
-	.alloc_hctx	= null_alloc_hctx,
-	.free_hctx	= null_free_hctx,
 };
 
 static void null_del_dev(struct nullb *nullb)
@@ -496,10 +473,7 @@ static int null_add_dev(void)
 		goto out_free_nullb;
 
 	if (queue_mode == NULL_Q_MQ) {
-		if (use_per_node_hctx)
-			nullb->tag_set.ops = &null_mq_ops_pernode;
-		else
-			nullb->tag_set.ops = &null_mq_ops;
+		nullb->tag_set.ops = &null_mq_ops;
 		nullb->tag_set.nr_hw_queues = submit_queues;
 		nullb->tag_set.queue_depth = hw_queue_depth;
 		nullb->tag_set.numa_node = home_node;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 7a51f065edcd..16c21c0cb14d 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -497,8 +497,6 @@ static int virtblk_init_request(void *data, struct request *rq,
 static struct blk_mq_ops virtio_mq_ops = {
 	.queue_rq	= virtio_queue_rq,
 	.map_queue	= blk_mq_map_queue,
-	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
-	.free_hctx	= blk_mq_free_single_hw_queue,
 	.complete	= virtblk_request_done,
 	.init_request	= virtblk_init_request,
 };
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2bd82f399128..91dfb75ce39f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -79,9 +79,6 @@ struct blk_mq_tag_set {
 
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
-typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_tag_set *,
-		unsigned int, int);
-typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_request_fn)(void *, struct request *, unsigned int,
@@ -107,12 +104,6 @@ struct blk_mq_ops {
 
 	softirq_done_fn		*complete;
 
-	/*
-	 * Override for hctx allocations (should probably go)
-	 */
-	alloc_hctx_fn		*alloc_hctx;
-	free_hctx_fn		*free_hctx;
-
 	/*
 	 * Called when the block layer side of a hardware queue has been
 	 * set up, allowing the driver to allocate/init matching structures.
@@ -166,7 +157,6 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
-void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 
 void blk_mq_end_io(struct request *rq, int error);
 void __blk_mq_end_io(struct request *rq, int error);
-- 
cgit v1.2.3


From 05f1dd5315217398fc8d122bdee80f96a9f21274 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 29 May 2014 09:53:32 -0600
Subject: block: add queue flag for disabling SG merging

If devices are not SG starved, we waste a lot of time potentially
collapsing SG segments. Enough that 1.5% of the CPU time goes
to this, at only 400K IOPS. Add a queue flag, QUEUE_FLAG_NO_SG_MERGE,
which just returns the number of vectors in a bio instead of looping
over all segments and checking for collapsible ones.

Add a BLK_MQ_F_SG_MERGE flag so that drivers can opt-in on the sg
merging, if they so desire.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c      | 28 +++++++++++++++++++++-------
 block/blk-mq.c         |  3 +++
 include/linux/blk-mq.h |  1 +
 include/linux/blkdev.h |  1 +
 4 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6c583f9c5b65..b3bf0df0f4c2 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -13,7 +13,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 					     struct bio *bio)
 {
 	struct bio_vec bv, bvprv = { NULL };
-	int cluster, high, highprv = 1;
+	int cluster, high, highprv = 1, no_sg_merge;
 	unsigned int seg_size, nr_phys_segs;
 	struct bio *fbio, *bbio;
 	struct bvec_iter iter;
@@ -35,12 +35,21 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	cluster = blk_queue_cluster(q);
 	seg_size = 0;
 	nr_phys_segs = 0;
+	no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
+	high = 0;
 	for_each_bio(bio) {
 		bio_for_each_segment(bv, bio, iter) {
+			/*
+			 * If SG merging is disabled, each bio vector is
+			 * a segment
+			 */
+			if (no_sg_merge)
+				goto new_segment;
+
 			/*
 			 * the trick here is making sure that a high page is
-			 * never considered part of another segment, since that
-			 * might change with the bounce page.
+			 * never considered part of another segment, since
+			 * that might change with the bounce page.
 			 */
 			high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
 			if (!high && !highprv && cluster) {
@@ -84,11 +93,16 @@ void blk_recalc_rq_segments(struct request *rq)
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-	struct bio *nxt = bio->bi_next;
+	if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags))
+		bio->bi_phys_segments = bio->bi_vcnt;
+	else {
+		struct bio *nxt = bio->bi_next;
+
+		bio->bi_next = NULL;
+		bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+		bio->bi_next = nxt;
+	}
 
-	bio->bi_next = NULL;
-	bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
-	bio->bi_next = nxt;
 	bio->bi_flags |= (1 << BIO_SEG_VALID);
 }
 EXPORT_SYMBOL(blk_recount_segments);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f27fe44230c2..f98d977fd150 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1829,6 +1829,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	q->mq_ops = set->ops;
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
+	if (!(set->flags & BLK_MQ_F_SG_MERGE))
+		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
+
 	q->sg_reserved_size = INT_MAX;
 
 	INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 91dfb75ce39f..95de239444d2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -129,6 +129,7 @@ enum {
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_SHOULD_SORT	= 1 << 1,
 	BLK_MQ_F_TAG_SHARED	= 1 << 2,
+	BLK_MQ_F_SG_MERGE	= 1 << 3,
 
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 098304576d51..695b9fd41efe 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -510,6 +510,7 @@ struct request_queue {
 #define QUEUE_FLAG_SAME_FORCE  18	/* force complete on same CPU */
 #define QUEUE_FLAG_DEAD        19	/* queue tear-down finished */
 #define QUEUE_FLAG_INIT_DONE   20	/* queue is initialized */
+#define QUEUE_FLAG_NO_SG_MERGE 21	/* don't attempt to merge SG segments*/
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
-- 
cgit v1.2.3


From 2230237500821aedfcf2bba2a79d9cbca389233c Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Fri, 30 May 2014 08:06:42 -0600
Subject: blk-mq: blk_mq_tag_to_rq should handle flush request

flush request is special, which borrows the tag from the parent
request. Hence blk_mq_tag_to_rq needs special handling to return
the flush request from the tag.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-flush.c      |  4 +++-
 block/blk-mq.c         | 12 +++++++++---
 include/linux/blk-mq.h |  2 +-
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index ef608b35d9be..ff87c664b7df 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -223,8 +223,10 @@ static void flush_end_io(struct request *flush_rq, int error)
 	struct request *rq, *n;
 	unsigned long flags = 0;
 
-	if (q->mq_ops)
+	if (q->mq_ops) {
 		spin_lock_irqsave(&q->mq_flush_lock, flags);
+		q->flush_rq->cmd_flags = 0;
+	}
 
 	running = &q->flush_queue[q->flush_running_idx];
 	BUG_ON(q->flush_pending_idx == q->flush_running_idx);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6160128085fc..21f952ab3581 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -541,9 +541,15 @@ void blk_mq_kick_requeue_list(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
+struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
 {
-	return tags->rqs[tag];
+	struct request_queue *q = hctx->queue;
+
+	if ((q->flush_rq->cmd_flags & REQ_FLUSH_SEQ) &&
+	    q->flush_rq->tag == tag)
+		return q->flush_rq;
+
+	return hctx->tags->rqs[tag];
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
@@ -572,7 +578,7 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
 		if (tag >= hctx->tags->nr_tags)
 			break;
 
-		rq = blk_mq_tag_to_rq(hctx->tags, tag++);
+		rq = blk_mq_tag_to_rq(hctx, tag++);
 		if (rq->q != hctx->queue)
 			continue;
 		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 95de239444d2..ad3adb73cc70 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -154,7 +154,7 @@ void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag);
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
-- 
cgit v1.2.3


From 67aec14ce87fe25bdfff7dbf468556333df11c4e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 30 May 2014 08:25:36 -0600
Subject: blk-mq: make the sysfs mq/ layout reflect current mappings

Currently blk-mq registers all the hardware queues in sysfs,
regardless of whether it uses them (e.g. they have CPU mappings)
or not. The unused hardware queues lack the cpux/ directories,
and the other sysfs entries (like active, pending, etc) are all
zeroes.

Change this so that sysfs correctly reflects the current mappings
of the hardware queues.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c   | 102 ++++++++++++++++++++++++++++++++++++++++---------
 block/blk-mq.c         |   4 ++
 block/blk-mq.h         |   6 +++
 include/linux/blk-mq.h |   1 +
 4 files changed, 94 insertions(+), 19 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 99a60a829e69..e5f575ff0bf9 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -327,6 +327,42 @@ static struct kobj_type blk_mq_hw_ktype = {
 	.release	= blk_mq_sysfs_release,
 };
 
+void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_ctx *ctx;
+	int i;
+
+	if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+		return;
+
+	hctx_for_each_ctx(hctx, ctx, i)
+		kobject_del(&ctx->kobj);
+
+	kobject_del(&hctx->kobj);
+}
+
+int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_ctx *ctx;
+	int i, ret;
+
+	if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+		return 0;
+
+	ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
+	if (ret)
+		return ret;
+
+	hctx_for_each_ctx(hctx, ctx, i) {
+		ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
 void blk_mq_unregister_disk(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
@@ -335,11 +371,11 @@ void blk_mq_unregister_disk(struct gendisk *disk)
 	int i, j;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		hctx_for_each_ctx(hctx, ctx, j) {
-			kobject_del(&ctx->kobj);
+		blk_mq_unregister_hctx(hctx);
+
+		hctx_for_each_ctx(hctx, ctx, j)
 			kobject_put(&ctx->kobj);
-		}
-		kobject_del(&hctx->kobj);
+
 		kobject_put(&hctx->kobj);
 	}
 
@@ -350,15 +386,30 @@ void blk_mq_unregister_disk(struct gendisk *disk)
 	kobject_put(&disk_to_dev(disk)->kobj);
 }
 
+static void blk_mq_sysfs_init(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+	int i, j;
+
+	kobject_init(&q->mq_kobj, &blk_mq_ktype);
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+
+		hctx_for_each_ctx(hctx, ctx, j)
+			kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
+	}
+}
+
 int blk_mq_register_disk(struct gendisk *disk)
 {
 	struct device *dev = disk_to_dev(disk);
 	struct request_queue *q = disk->queue;
 	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
-	int ret, i, j;
+	int ret, i;
 
-	kobject_init(&q->mq_kobj, &blk_mq_ktype);
+	blk_mq_sysfs_init(q);
 
 	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
 	if (ret < 0)
@@ -367,20 +418,10 @@ int blk_mq_register_disk(struct gendisk *disk)
 	kobject_uevent(&q->mq_kobj, KOBJ_ADD);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
-		ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i);
+		hctx->flags |= BLK_MQ_F_SYSFS_UP;
+		ret = blk_mq_register_hctx(hctx);
 		if (ret)
 			break;
-
-		if (!hctx->nr_ctx)
-			continue;
-
-		hctx_for_each_ctx(hctx, ctx, j) {
-			kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
-			ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
-			if (ret)
-				break;
-		}
 	}
 
 	if (ret) {
@@ -390,3 +431,26 @@ int blk_mq_register_disk(struct gendisk *disk)
 
 	return 0;
 }
+
+void blk_mq_sysfs_unregister(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_unregister_hctx(hctx);
+}
+
+int blk_mq_sysfs_register(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i, ret = 0;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		ret = blk_mq_register_hctx(hctx);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 21f952ab3581..71f564e8812e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1924,6 +1924,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 {
 	blk_mq_freeze_queue(q);
 
+	blk_mq_sysfs_unregister(q);
+
 	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
 
 	/*
@@ -1934,6 +1936,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 
 	blk_mq_map_swqueue(q);
 
+	blk_mq_sysfs_register(q);
+
 	blk_mq_unfreeze_queue(q);
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ff5e6bf0f691..de7b3bbd5bd6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -54,6 +54,12 @@ extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
 extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
 extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
 
+/*
+ * sysfs helpers
+ */
+extern int blk_mq_sysfs_register(struct request_queue *q);
+extern void blk_mq_sysfs_unregister(struct request_queue *q);
+
 /*
  * Basic implementation of sparser bitmap, allowing the user to spread
  * the bits over more cachelines.
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ad3adb73cc70..c15128833100 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -130,6 +130,7 @@ enum {
 	BLK_MQ_F_SHOULD_SORT	= 1 << 1,
 	BLK_MQ_F_TAG_SHARED	= 1 << 2,
 	BLK_MQ_F_SG_MERGE	= 1 << 3,
+	BLK_MQ_F_SYSFS_UP	= 1 << 4,
 
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
-- 
cgit v1.2.3


From 0e62f51f8753b048f391ee2d7f2af1f7297b0be5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 4 Jun 2014 10:23:49 -0600
Subject: blk-mq: let blk_mq_tag_to_rq() take blk_mq_tags as the main parameter

We currently pass in the hardware queue, and get the tags from there.
But from scsi-mq, with a shared tag space, it's a lot more convenient
to pass in the blk_mq_tags instead as the hardware queue isn't always
directly available. So instead of having to re-map to a given
hardware queue from rq->mq_ctx, just pass in the tags structure.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c                    | 19 ++++++++++++-------
 drivers/block/mtip32xx/mtip32xx.c |  4 +++-
 include/linux/blk-mq.h            |  2 +-
 3 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e8e8cf00815..4e4cd6208052 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -529,15 +529,20 @@ void blk_mq_kick_requeue_list(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
-struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static inline bool is_flush_request(struct request *rq, unsigned int tag)
 {
-	struct request_queue *q = hctx->queue;
+	return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
+			rq->q->flush_rq->tag == tag);
+}
 
-	if ((q->flush_rq->cmd_flags & REQ_FLUSH_SEQ) &&
-	    q->flush_rq->tag == tag)
-		return q->flush_rq;
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
+{
+	struct request *rq = tags->rqs[tag];
+
+	if (!is_flush_request(rq, tag))
+		return rq;
 
-	return hctx->tags->rqs[tag];
+	return rq->q->flush_rq;
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
@@ -566,7 +571,7 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
 		if (tag >= hctx->tags->nr_tags)
 			break;
 
-		rq = blk_mq_tag_to_rq(hctx, tag++);
+		rq = blk_mq_tag_to_rq(hctx->tags, tag++);
 		if (rq->q != hctx->queue)
 			continue;
 		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index abc858b3528b..74abd49fabdc 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -193,7 +193,9 @@ static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd)
 static struct request *mtip_rq_from_tag(struct driver_data *dd,
 					unsigned int tag)
 {
-	return blk_mq_tag_to_rq(dd->queue->queue_hw_ctx[0], tag);
+	struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0];
+
+	return blk_mq_tag_to_rq(hctx->tags, tag);
 }
 
 static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c15128833100..0feedebfde48 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -155,7 +155,7 @@ void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);
-struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag);
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
-- 
cgit v1.2.3