From 5a5bafdc396b1da7570f84fb96a0f8a288970c5e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Mar 2012 13:14:56 -0800 Subject: elevator: clear auxiliary data earlier during elevator switch Elevator switch tries hard to keep as much as context until new elevator is ready so that it can revert to the original state if initializing the new elevator fails for some reason. Unfortunately, with more auxiliary contexts to manage, this makes elevator init and exit paths too complex and fragile. This patch makes elevator_switch() unregister the current elevator and flush icq's before start initializing the new one. As we still keep the old elevator itself, the only difference is that we lose icq's on rare occassions of switching failure, which isn't critical at all. Note that this makes explicit elevator parameter to elevator_init_queue() and __elv_register_queue() unnecessary as they always can use the current elevator. This patch enables block cgroup cleanups. -v2: blk_add_trace_msg() prints elevator name from @new_e instead of @e->type as the local variable no longer exists. This caused build failure on CONFIG_BLK_DEV_IO_TRACE. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/elevator.c | 90 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 45 deletions(-) (limited to 'block/elevator.c') diff --git a/block/elevator.c b/block/elevator.c index f016855a46b0..f8c08e1bff2b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -121,11 +121,10 @@ static struct elevator_type *elevator_get(const char *name) return e; } -static int elevator_init_queue(struct request_queue *q, - struct elevator_queue *eq) +static int elevator_init_queue(struct request_queue *q) { - eq->elevator_data = eq->type->ops.elevator_init_fn(q); - if (eq->elevator_data) + q->elevator->elevator_data = q->elevator->type->ops.elevator_init_fn(q); + if (q->elevator->elevator_data) return 0; return -ENOMEM; } @@ -188,7 +187,6 @@ static void elevator_release(struct kobject *kobj) int elevator_init(struct request_queue *q, char *name) { struct elevator_type *e = NULL; - struct elevator_queue *eq; int err; if (unlikely(q->elevator)) @@ -222,17 +220,16 @@ int elevator_init(struct request_queue *q, char *name) } } - eq = elevator_alloc(q, e); - if (!eq) + q->elevator = elevator_alloc(q, e); + if (!q->elevator) return -ENOMEM; - err = elevator_init_queue(q, eq); + err = elevator_init_queue(q); if (err) { - kobject_put(&eq->kobj); + kobject_put(&q->elevator->kobj); return err; } - q->elevator = eq; return 0; } EXPORT_SYMBOL(elevator_init); @@ -801,8 +798,9 @@ static struct kobj_type elv_ktype = { .release = elevator_release, }; -int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) +int elv_register_queue(struct request_queue *q) { + struct elevator_queue *e = q->elevator; int error; error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); @@ -820,11 +818,6 @@ int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) } return error; } - -int elv_register_queue(struct request_queue *q) -{ - return __elv_register_queue(q, q->elevator); -} EXPORT_SYMBOL(elv_register_queue); void elv_unregister_queue(struct request_queue *q) @@ -907,51 +900,58 @@ EXPORT_SYMBOL_GPL(elv_unregister); */ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - struct elevator_queue *old_elevator, *e; + struct elevator_queue *old = q->elevator; + bool registered = old->registered; int err; - /* allocate new elevator */ - e = elevator_alloc(q, new_e); - if (!e) - return -ENOMEM; - - err = elevator_init_queue(q, e); - if (err) { - kobject_put(&e->kobj); - return err; - } - - /* turn on BYPASS and drain all requests w/ elevator private data */ + /* + * Turn on BYPASS and drain all requests w/ elevator private data. + * Block layer doesn't call into a quiesced elevator - all requests + * are directly put on the dispatch list without elevator data + * using INSERT_BACK. All requests have SOFTBARRIER set and no + * merge happens either. + */ elv_quiesce_start(q); - /* unregister old queue, register new one and kill old elevator */ - if (q->elevator->registered) { + /* unregister and clear all auxiliary data of the old elevator */ + if (registered) elv_unregister_queue(q); - err = __elv_register_queue(q, e); - if (err) - goto fail_register; - } - /* done, clear io_cq's, switch elevators and turn off BYPASS */ spin_lock_irq(q->queue_lock); ioc_clear_queue(q); - old_elevator = q->elevator; - q->elevator = e; spin_unlock_irq(q->queue_lock); - elevator_exit(old_elevator); + /* allocate, init and register new elevator */ + err = -ENOMEM; + q->elevator = elevator_alloc(q, new_e); + if (!q->elevator) + goto fail_init; + + err = elevator_init_queue(q); + if (err) { + kobject_put(&q->elevator->kobj); + goto fail_init; + } + + if (registered) { + err = elv_register_queue(q); + if (err) + goto fail_register; + } + + /* done, kill the old one and finish */ + elevator_exit(old); elv_quiesce_end(q); - blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name); + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); return 0; fail_register: - /* - * switch failed, exit the new io scheduler and reattach the old - * one again (along with re-adding the sysfs dir) - */ - elevator_exit(e); + elevator_exit(q->elevator); +fail_init: + /* switch failed, restore and re-register old elevator */ + q->elevator = old; elv_register_queue(q); elv_quiesce_end(q); -- cgit v1.2.1 From b2fab5acd28ead6f0dd6c3996ba23f0ef1772f15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Mar 2012 13:14:57 -0800 Subject: elevator: make elevator_init_fn() return 0/-errno elevator_ops->elevator_init_fn() has a weird return value. It returns a void * which the caller should assign to q->elevator->elevator_data and %NULL return denotes init failure. Update such that it returns integer 0/-errno and sets elevator_data directly as necessary. This makes the interface more conventional and eases further cleanup. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/elevator.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'block/elevator.c') diff --git a/block/elevator.c b/block/elevator.c index f8c08e1bff2b..f81c061dad15 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -121,14 +121,6 @@ static struct elevator_type *elevator_get(const char *name) return e; } -static int elevator_init_queue(struct request_queue *q) -{ - q->elevator->elevator_data = q->elevator->type->ops.elevator_init_fn(q); - if (q->elevator->elevator_data) - return 0; - return -ENOMEM; -} - static char chosen_elevator[ELV_NAME_MAX]; static int __init elevator_setup(char *str) @@ -224,7 +216,7 @@ int elevator_init(struct request_queue *q, char *name) if (!q->elevator) return -ENOMEM; - err = elevator_init_queue(q); + err = e->ops.elevator_init_fn(q); if (err) { kobject_put(&q->elevator->kobj); return err; @@ -927,7 +919,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) if (!q->elevator) goto fail_init; - err = elevator_init_queue(q); + err = new_e->ops.elevator_init_fn(q); if (err) { kobject_put(&q->elevator->kobj); goto fail_init; -- cgit v1.2.1 From d732580b4eb31553c63744a47d590f770cafb8f0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Mar 2012 13:14:58 -0800 Subject: block: implement blk_queue_bypass_start/end() Rename and extend elv_queisce_start/end() to blk_queue_bypass_start/end() which are exported and supports nesting via @q->bypass_depth. Also add blk_queue_bypass() to test bypass state. This will be further extended and used for blkio_group management. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/elevator.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) (limited to 'block/elevator.c') diff --git a/block/elevator.c b/block/elevator.c index f81c061dad15..0bdea0ed03a3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -553,25 +553,6 @@ void elv_drain_elevator(struct request_queue *q) } } -void elv_quiesce_start(struct request_queue *q) -{ - if (!q->elevator) - return; - - spin_lock_irq(q->queue_lock); - queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); - - blk_drain_queue(q, false); -} - -void elv_quiesce_end(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); -} - void __elv_add_request(struct request_queue *q, struct request *rq, int where) { trace_block_rq_insert(q, rq); @@ -903,7 +884,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) * using INSERT_BACK. All requests have SOFTBARRIER set and no * merge happens either. */ - elv_quiesce_start(q); + blk_queue_bypass_start(q); /* unregister and clear all auxiliary data of the old elevator */ if (registered) @@ -933,7 +914,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) /* done, kill the old one and finish */ elevator_exit(old); - elv_quiesce_end(q); + blk_queue_bypass_end(q); blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); @@ -945,7 +926,7 @@ fail_init: /* switch failed, restore and re-register old elevator */ q->elevator = old; elv_register_queue(q); - elv_quiesce_end(q); + blk_queue_bypass_end(q); return err; } -- cgit v1.2.1 From 72e06c255181537d0b3e1f657a9ed81655d745b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Mar 2012 13:15:00 -0800 Subject: blkcg: shoot down blkio_groups on elevator switch Elevator switch may involve changes to blkcg policies. Implement shoot down of blkio_groups. Combined with the previous bypass updates, the end goal is updating blkcg core such that it can ensure that blkcg's being affected become quiescent and don't have any per-blkg data hanging around before commencing any policy updates. Until queues are made aware of the policies that applies to them, as an interim step, all per-policy blkg data will be shot down. * blk-throtl doesn't need this change as it can't be disabled for a live queue; however, update it anyway as the scheduled blkg unification requires this behavior change. This means that blk-throtl configuration will be unnecessarily lost over elevator switch. This oddity will be removed after blkcg learns to associate individual policies with request_queues. * blk-throtl dosen't shoot down root_tg. This is to ease transition. Unified blkg will always have persistent root group and not shooting down root_tg for now eases transition to that point by avoiding having to update td->root_tg and is safe as blk-throtl can never be disabled -v2: Vivek pointed out that group list is not guaranteed to be empty on return from clear function if it raced cgroup removal and lost. Fix it by waiting a bit and retrying. This kludge will soon be removed once locking is updated such that blkg is never in limbo state between blkcg and request_queue locks. blk-throtl no longer shoots down root_tg to avoid breaking td->root_tg. Also, Nest queue_lock inside blkio_list_lock not the other way around to avoid introduce possible deadlock via blkcg lock. -v3: blkcg_clear_queue() repositioned and renamed to blkg_destroy_all() to increase consistency with later changes. cfq_clear_queue() updated to check q->elevator before dereferencing it to avoid NULL dereference on not fully initialized queues (used by later change). Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/elevator.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block/elevator.c') diff --git a/block/elevator.c b/block/elevator.c index 0bdea0ed03a3..8c7561fd2c79 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -38,6 +38,7 @@ #include #include "blk.h" +#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -894,6 +895,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) ioc_clear_queue(q); spin_unlock_irq(q->queue_lock); + blkg_destroy_all(q); + /* allocate, init and register new elevator */ err = -ENOMEM; q->elevator = elevator_alloc(q, new_e); -- cgit v1.2.1 From 03aa264ac15637b6f98374270bcdf31400965505 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Mar 2012 13:15:19 -0800 Subject: blkcg: let blkcg core manage per-queue blkg list and counter With the previous patch to move blkg list heads and counters to request_queue and blkg, logic to manage them in both policies are almost identical and can be moved to blkcg core. This patch moves blkg link logic into blkg_lookup_create(), implements common blkg unlink code in blkg_destroy(), and updates blkg_destory_all() so that it's policy specific and can skip root group. The updated blkg_destroy_all() is now used to both clear queue for bypassing and elv switching, and release all blkgs on q exit. This patch introduces a race window where policy [de]registration may race against queue blkg clearing. This can only be a problem on cfq unload and shouldn't be a real problem in practice (and we have many other places where this race already exists). Future patches will remove these unlikely races. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/elevator.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block/elevator.c') diff --git a/block/elevator.c b/block/elevator.c index 8c7561fd2c79..d4d39dab841a 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -876,7 +876,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old = q->elevator; bool registered = old->registered; - int err; + int i, err; /* * Turn on BYPASS and drain all requests w/ elevator private data. @@ -895,7 +895,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) ioc_clear_queue(q); spin_unlock_irq(q->queue_lock); - blkg_destroy_all(q); + for (i = 0; i < BLKIO_NR_POLICIES; i++) + blkg_destroy_all(q, i, false); /* allocate, init and register new elevator */ err = -ENOMEM; -- cgit v1.2.1 From e8989fae38d9831c72b20375a206a919ca468c52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Mar 2012 13:15:20 -0800 Subject: blkcg: unify blkg's for blkcg policies Currently, blkg is per cgroup-queue-policy combination. This is unnatural and leads to various convolutions in partially used duplicate fields in blkg, config / stat access, and general management of blkgs. This patch make blkg's per cgroup-queue and let them serve all policies. blkgs are now created and destroyed by blkcg core proper. This will allow further consolidation of common management logic into blkcg core and API with better defined semantics and layering. As a transitional step to untangle blkg management, elvswitch and policy [de]registration, all blkgs except the root blkg are being shot down during elvswitch and bypass. This patch adds blkg_root_update() to update root blkg in place on policy change. This is hacky and racy but should be good enough as interim step until we get locking simplified and switch over to proper in-place update for all blkgs. -v2: Root blkgs need to be updated on elvswitch too and blkg_alloc() comment wasn't updated according to the function change. Fixed. Both pointed out by Vivek. -v3: v2 updated blkg_destroy_all() to invoke update_root_blkg_pd() for all policies. This freed root pd during elvswitch before the last queue finished exiting and led to oops. Directly invoke update_root_blkg_pd() only on BLKIO_POLICY_PROP from cfq_exit_queue(). This also is closer to what will be done with proper in-place blkg update. Reported by Vivek. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/elevator.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block/elevator.c') diff --git a/block/elevator.c b/block/elevator.c index d4d39dab841a..451654fadab0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -876,7 +876,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old = q->elevator; bool registered = old->registered; - int i, err; + int err; /* * Turn on BYPASS and drain all requests w/ elevator private data. @@ -895,8 +895,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) ioc_clear_queue(q); spin_unlock_irq(q->queue_lock); - for (i = 0; i < BLKIO_NR_POLICIES; i++) - blkg_destroy_all(q, i, false); + blkg_destroy_all(q, false); /* allocate, init and register new elevator */ err = -ENOMEM; -- cgit v1.2.1 From 852c788f8365062c8a383c5a93f7f7289977cb50 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Mar 2012 13:15:27 -0800 Subject: block: implement bio_associate_current() IO scheduling and cgroup are tied to the issuing task via io_context and cgroup of %current. Unfortunately, there are cases where IOs need to be routed via a different task which makes scheduling and cgroup limit enforcement applied completely incorrectly. For example, all bios delayed by blk-throttle end up being issued by a delayed work item and get assigned the io_context of the worker task which happens to serve the work item and dumped to the default block cgroup. This is double confusing as bios which aren't delayed end up in the correct cgroup and makes using blk-throttle and cfq propio together impossible. Any code which punts IO issuing to another task is affected which is getting more and more common (e.g. btrfs). As both io_context and cgroup are firmly tied to task including userland visible APIs to manipulate them, it makes a lot of sense to match up tasks to bios. This patch implements bio_associate_current() which associates the specified bio with %current. The bio will record the associated ioc and blkcg at that point and block layer will use the recorded ones regardless of which task actually ends up issuing the bio. bio release puts the associated ioc and blkcg. It grabs and remembers ioc and blkcg instead of the task itself because task may already be dead by the time the bio is issued making ioc and blkcg inaccessible and those are all block layer cares about. elevator_set_req_fn() is updated such that the bio elvdata is being allocated for is available to the elevator. This doesn't update block cgroup policies yet. Further patches will implement the support. -v2: #ifdef CONFIG_BLK_CGROUP added around bio->bi_ioc dereference in rq_ioc() to fix build breakage. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Kent Overstreet Signed-off-by: Jens Axboe --- block/elevator.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block/elevator.c') diff --git a/block/elevator.c b/block/elevator.c index 451654fadab0..be3ab6df0fea 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -663,12 +663,13 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) return NULL; } -int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +int elv_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) { struct elevator_queue *e = q->elevator; if (e->type->ops.elevator_set_req_fn) - return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask); + return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); return 0; } -- cgit v1.2.1 From a2b1693bac45ea3fe3ba612fd22c45f17449f610 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Apr 2012 13:11:33 -0700 Subject: blkcg: implement per-queue policy activation All blkcg policies were assumed to be enabled on all request_queues. Due to various implementation obstacles, during the recent blkcg core updates, this was temporarily implemented as shooting down all !root blkgs on elevator switch and policy [de]registration combined with half-broken in-place root blkg updates. In addition to being buggy and racy, this meant losing all blkcg configurations across those events. Now that blkcg is cleaned up enough, this patch replaces the temporary implementation with proper per-queue policy activation. Each blkcg policy should call the new blkcg_[de]activate_policy() to enable and disable the policy on a specific queue. blkcg_activate_policy() allocates and installs policy data for the policy for all existing blkgs. blkcg_deactivate_policy() does the reverse. If a policy is not enabled for a given queue, blkg printing / config functions skip the respective blkg for the queue. blkcg_activate_policy() also takes care of root blkg creation, and cfq_init_queue() and blk_throtl_init() are updated accordingly. This replaces blkcg_bypass_{start|end}() and update_root_blkg_pd() unnecessary. Dropped. v2: cfq_init_queue() was returning uninitialized @ret on root_group alloc failure if !CONFIG_CFQ_GROUP_IOSCHED. Fixed. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/elevator.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block/elevator.c') diff --git a/block/elevator.c b/block/elevator.c index be3ab6df0fea..6a55d418896f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -896,8 +896,6 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) ioc_clear_queue(q); spin_unlock_irq(q->queue_lock); - blkg_destroy_all(q, false); - /* allocate, init and register new elevator */ err = -ENOMEM; q->elevator = elevator_alloc(q, new_e); -- cgit v1.2.1