summaryrefslogtreecommitdiffstats
path: root/block/blk-cgroup.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-05-01 10:39:57 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-05-01 10:39:57 -0700
commit694752922b12bd318aa80191bd9d8c3dcfb39055 (patch)
tree5afe83fd99100bea546dd5a1c1f778c58f41e5c0 /block/blk-cgroup.c
parenta351e9b9fc24e982ec2f0e76379a49826036da12 (diff)
parent9438b3e080beccf6022138ea62192d55cc7dc4ed (diff)
downloadtalos-obmc-linux-694752922b12bd318aa80191bd9d8c3dcfb39055.tar.gz
talos-obmc-linux-694752922b12bd318aa80191bd9d8c3dcfb39055.zip
Merge branch 'for-4.12/block' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe: - Add BFQ IO scheduler under the new blk-mq scheduling framework. BFQ was initially a fork of CFQ, but subsequently changed to implement fairness based on B-WF2Q+, a modified variant of WF2Q. BFQ is meant to be used on desktop type single drives, providing good fairness. From Paolo. - Add Kyber IO scheduler. This is a full multiqueue aware scheduler, using a scalable token based algorithm that throttles IO based on live completion IO stats, similary to blk-wbt. From Omar. - A series from Jan, moving users to separately allocated backing devices. This continues the work of separating backing device life times, solving various problems with hot removal. - A series of updates for lightnvm, mostly from Javier. Includes a 'pblk' target that exposes an open channel SSD as a physical block device. - A series of fixes and improvements for nbd from Josef. - A series from Omar, removing queue sharing between devices on mostly legacy drivers. This helps us clean up other bits, if we know that a queue only has a single device backing. This has been overdue for more than a decade. - Fixes for the blk-stats, and improvements to unify the stats and user windows. This both improves blk-wbt, and enables other users to register a need to receive IO stats for a device. From Omar. - blk-throttle improvements from Shaohua. This provides a scalable framework for implementing scalable priotization - particularly for blk-mq, but applicable to any type of block device. The interface is marked experimental for now. - Bucketized IO stats for IO polling from Stephen Bates. This improves efficiency of polled workloads in the presence of mixed block size IO. - A few fixes for opal, from Scott. - A few pulls for NVMe, including a lot of fixes for NVMe-over-fabrics. From a variety of folks, mostly Sagi and James Smart. - A series from Bart, improving our exposed info and capabilities from the blk-mq debugfs support. - A series from Christoph, cleaning up how handle WRITE_ZEROES. - A series from Christoph, cleaning up the block layer handling of how we track errors in a request. On top of being a nice cleanup, it also shrinks the size of struct request a bit. - Removal of mg_disk and hd (sorry Linus) by Christoph. The former was never used by platforms, and the latter has outlived it's usefulness. - Various little bug fixes and cleanups from a wide variety of folks. * 'for-4.12/block' of git://git.kernel.dk/linux-block: (329 commits) block: hide badblocks attribute by default blk-mq: unify hctx delay_work and run_work block: add kblock_mod_delayed_work_on() blk-mq: unify hctx delayed_run_work and run_work nbd: fix use after free on module unload MAINTAINERS: bfq: Add Paolo as maintainer for the BFQ I/O scheduler blk-mq-sched: alloate reserved tags out of normal pool mtip32xx: use runtime tag to initialize command header scsi: Implement blk_mq_ops.show_rq() blk-mq: Add blk_mq_ops.show_rq() blk-mq: Show operation, cmd_flags and rq_flags names blk-mq: Make blk_flags_show() callers append a newline character blk-mq: Move the "state" debugfs attribute one level down blk-mq: Unregister debugfs attributes earlier blk-mq: Only unregister hctxs for which registration succeeded blk-mq-debugfs: Rename functions for registering and unregistering the mq directory blk-mq: Let blk_mq_debugfs_register() look up the queue name blk-mq: Register <dev>/queue/mq after having registered <dev>/queue ide-pm: always pass 0 error to ide_complete_rq in ide_do_devset ide-pm: always pass 0 error to __blk_end_request_all ..
Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r--block/blk-cgroup.c123
1 files changed, 98 insertions, 25 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bbe7ee00bd3d..7c2947128f58 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
+/* Performs queue bypass and policy enabled checks then looks up blkg. */
+static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
+ const struct blkcg_policy *pol,
+ struct request_queue *q)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ lockdep_assert_held(q->queue_lock);
+
+ if (!blkcg_policy_enabled(q, pol))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * This could be the first entry point of blkcg implementation and
+ * we shouldn't allow anything to go through for a bypassing queue.
+ */
+ if (unlikely(blk_queue_bypass(q)))
+ return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+
+ return __blkg_lookup(blkcg, q, true /* update_hint */);
+}
+
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
* @blkcg: target block cgroup
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
__acquires(rcu) __acquires(disk->queue->queue_lock)
{
struct gendisk *disk;
+ struct request_queue *q;
struct blkcg_gq *blkg;
struct module *owner;
unsigned int major, minor;
@@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (!disk)
return -ENODEV;
if (part) {
- owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
- return -ENODEV;
+ ret = -ENODEV;
+ goto fail;
}
- rcu_read_lock();
- spin_lock_irq(disk->queue->queue_lock);
+ q = disk->queue;
- if (blkcg_policy_enabled(disk->queue, pol))
- blkg = blkg_lookup_create(blkcg, disk->queue);
- else
- blkg = ERR_PTR(-EOPNOTSUPP);
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_lookup_check(blkcg, pol, q);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+
+ if (blkg)
+ goto success;
+
+ /*
+ * Create blkgs walking down from blkcg_root to @blkcg, so that all
+ * non-root blkgs have access to their parents.
+ */
+ while (true) {
+ struct blkcg *pos = blkcg;
+ struct blkcg *parent;
+ struct blkcg_gq *new_blkg;
+
+ parent = blkcg_parent(blkcg);
+ while (parent && !__blkg_lookup(parent, q, false)) {
+ pos = parent;
+ parent = blkcg_parent(parent);
+ }
+
+ /* Drop locks to do new blkg allocation with GFP_KERNEL. */
+ spin_unlock_irq(q->queue_lock);
rcu_read_unlock();
- spin_unlock_irq(disk->queue->queue_lock);
- owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
- /*
- * If queue was bypassing, we should retry. Do so after a
- * short msleep(). It isn't strictly necessary but queue
- * can be bypassing for some time and it's always nice to
- * avoid busy looping.
- */
- if (ret == -EBUSY) {
- msleep(10);
- ret = restart_syscall();
+
+ new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
+ if (unlikely(!new_blkg)) {
+ ret = -ENOMEM;
+ goto fail;
}
- return ret;
- }
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+
+ blkg = blkg_lookup_check(pos, pol, q);
+ if (IS_ERR(blkg)) {
+ ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+
+ if (blkg) {
+ blkg_free(new_blkg);
+ } else {
+ blkg = blkg_create(pos, q, new_blkg);
+ if (unlikely(IS_ERR(blkg))) {
+ ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+ }
+
+ if (pos == blkcg)
+ goto success;
+ }
+success:
ctx->disk = disk;
ctx->blkg = blkg;
ctx->body = body;
return 0;
+
+fail_unlock:
+ spin_unlock_irq(q->queue_lock);
+ rcu_read_unlock();
+fail:
+ owner = disk->fops->owner;
+ put_disk(disk);
+ module_put(owner);
+ /*
+ * If queue was bypassing, we should retry. Do so after a
+ * short msleep(). It isn't strictly necessary but queue
+ * can be bypassing for some time and it's always nice to
+ * avoid busy looping.
+ */
+ if (ret == -EBUSY) {
+ msleep(10);
+ ret = restart_syscall();
+ }
+ return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
OpenPOWER on IntegriCloud