diff options
Diffstat (limited to 'block/cfq-iosched.c')
-rw-r--r-- | block/cfq-iosched.c | 259 |
1 files changed, 183 insertions, 76 deletions
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ab7a9e6a9b1c..f3799432676d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -185,7 +185,7 @@ struct cfq_group { int nr_cfqq; /* - * Per group busy queus average. Useful for workload slice calc. We + * Per group busy queues average. Useful for workload slice calc. We * create the array for each prio class but at run time it is used * only for RT and BE class and slot for IDLE class remains unused. * This is primarily done to avoid confusion and a gcc warning. @@ -300,7 +300,9 @@ struct cfq_data { /* List of cfq groups being managed on this device*/ struct hlist_head cfqg_list; - struct rcu_head rcu; + + /* Number of groups which are on blkcg->blkg_list */ + unsigned int nr_blkcg_linked_grps; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); @@ -367,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy); #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - blkg_path(&(cfqq)->cfqg->blkg), ##args); + blkg_path(&(cfqq)->cfqg->blkg), ##args) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ - blkg_path(&(cfqg)->blkg), ##args); \ + blkg_path(&(cfqg)->blkg), ##args) \ #else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) #endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) @@ -665,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq2 == NULL) return rq1; - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) - return rq1; - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) - return rq2; - if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) - return rq1; - else if ((rq2->cmd_flags & REQ_META) && - !(rq1->cmd_flags & REQ_META)) - return rq2; + if (rq_is_sync(rq1) != rq_is_sync(rq2)) + return rq_is_sync(rq1) ? rq1 : rq2; + + if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) + return rq1->cmd_flags & REQ_META ? rq1 : rq2; s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); @@ -990,9 +988,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" - " sect=%u", used_sl, cfqq->slice_dispatch, charge, - iops_mode(cfqd), cfqq->nr_sectors); + cfq_log_cfqq(cfqq->cfqd, cfqq, + "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", + used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, unaccounted_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); @@ -1014,28 +1013,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, cfqg->needs_update = true; } -static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, - struct blkio_cgroup *blkcg, int create) +static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, + struct cfq_group *cfqg, struct blkio_cgroup *blkcg) { - struct cfq_group *cfqg = NULL; - void *key = cfqd; - int i, j; - struct cfq_rb_root *st; struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; unsigned int major, minor; - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + /* + * Add group onto cgroup list. It might happen that bdi->dev is + * not initialized yet. Initialize this new group without major + * and minor info and this info will be filled in once a new thread + * comes for IO. + */ + if (bdi->dev) { sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfqg->blkg.dev = MKDEV(major, minor); - goto done; - } - if (cfqg || !create) - goto done; + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, MKDEV(major, minor)); + } else + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, 0); + + cfqd->nr_blkcg_linked_grps++; + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); +} + +/* + * Should be called from sleepable context. No request queue lock as per + * cpu stats are allocated dynamically and alloc_percpu needs to be called + * from sleepable context. + */ +static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg = NULL; + int i, j, ret; + struct cfq_rb_root *st; cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); if (!cfqg) - goto done; + return NULL; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; @@ -1049,43 +1067,94 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, */ cfqg->ref = 1; + ret = blkio_alloc_blkg_stats(&cfqg->blkg); + if (ret) { + kfree(cfqg); + return NULL; + } + + return cfqg; +} + +static struct cfq_group * +cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) +{ + struct cfq_group *cfqg = NULL; + void *key = cfqd; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; + /* - * Add group onto cgroup list. It might happen that bdi->dev is - * not initialized yet. Initialize this new group without major - * and minor info and this info will be filled in once a new thread - * comes for IO. See code above. + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case */ - if (bdi->dev) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, - MKDEV(major, minor)); - } else - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, - 0); - - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); + if (blkcg == &blkio_root_cgroup) + cfqg = &cfqd->root_group; + else + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); - /* Add group on cfqd list */ - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfqg->blkg.dev = MKDEV(major, minor); + } -done: return cfqg; } /* - * Search for the cfq group current task belongs to. If create = 1, then also - * create the cfq group if it does not exist. request_queue lock must be held. + * Search for the cfq group current task belongs to. request_queue lock must + * be held. */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) { struct blkio_cgroup *blkcg; - struct cfq_group *cfqg = NULL; + struct cfq_group *cfqg = NULL, *__cfqg = NULL; + struct request_queue *q = cfqd->queue; + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + cfqg = cfq_find_cfqg(cfqd, blkcg); + if (cfqg) { + rcu_read_unlock(); + return cfqg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in-turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc. + * + * Not taking any queue reference here and assuming that queue is + * around by the time we return. CFQ queue allocation code does + * the same. It might be racy though. + */ + + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); + + cfqg = cfq_alloc_cfqg(cfqd); + + spin_lock_irq(q->queue_lock); rcu_read_lock(); blkcg = task_blkio_cgroup(current); - cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create); - if (!cfqg && create) + + /* + * If some other thread already allocated the group while we were + * not holding queue lock, free up the group + */ + __cfqg = cfq_find_cfqg(cfqd, blkcg); + + if (__cfqg) { + kfree(cfqg); + rcu_read_unlock(); + return __cfqg; + } + + if (!cfqg) cfqg = &cfqd->root_group; + + cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); rcu_read_unlock(); return cfqg; } @@ -1118,6 +1187,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) return; for_each_cfqg_st(cfqg, i, j, st) BUG_ON(!RB_EMPTY_ROOT(&st->rb)); + free_percpu(cfqg->blkg.stats_cpu); kfree(cfqg); } @@ -1176,7 +1246,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) } #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) { return &cfqd->root_group; } @@ -1210,7 +1280,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rb_root *service_tree; int left; int new_cfqq = 1; - int group_changed = 0; service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq)); @@ -1281,7 +1350,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; - if ((add_front || !new_cfqq) && !group_changed) + if (add_front || !new_cfqq) return; cfq_group_notify_queue_add(cfqd, cfqq->cfqg); } @@ -1955,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) */ if (sample_valid(cic->ttime_samples) && (cfqq->slice_end - jiffies < cic->ttime_mean)) { - cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", - cic->ttime_mean); + cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", + cic->ttime_mean); return; } @@ -2029,7 +2098,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); + return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); } /* @@ -2704,8 +2773,11 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, smp_wmb(); cic->key = cfqd_dead_key(cfqd); - if (ioc->ioc_data == cic) + if (rcu_dereference(ioc->ioc_data) == cic) { + spin_lock(&ioc->lock); rcu_assign_pointer(ioc->ioc_data, NULL); + spin_unlock(&ioc->lock); + } if (cic->cfqq[BLK_RW_ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); @@ -2911,7 +2983,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_group *cfqg; retry: - cfqg = cfq_get_cfqg(cfqd, 1); + cfqg = cfq_get_cfqg(cfqd); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -3718,9 +3790,6 @@ new_queue: return 0; queue_fail: - if (cic) - put_io_context(cic->ioc); - cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(q->queue_lock, flags); cfq_log(cfqd, "set_request fail"); @@ -3815,15 +3884,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) cfq_put_queue(cfqd->async_idle_cfqq); } -static void cfq_cfqd_free(struct rcu_head *head) -{ - kfree(container_of(head, struct cfq_data, rcu)); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; + bool wait = false; cfq_shutdown_timer_wq(cfqd); @@ -3842,7 +3907,13 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_put_async_queues(cfqd); cfq_release_cfq_groups(cfqd); - cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); + + /* + * If there are groups which we could not unlink from blkcg list, + * wait for a rcu period for them to be freed. + */ + if (cfqd->nr_blkcg_linked_grps) + wait = true; spin_unlock_irq(q->queue_lock); @@ -3852,8 +3923,25 @@ static void cfq_exit_queue(struct elevator_queue *e) ida_remove(&cic_index_ida, cfqd->cic_index); spin_unlock(&cic_index_lock); - /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ - call_rcu(&cfqd->rcu, cfq_cfqd_free); + /* + * Wait for cfqg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other unlinked groups out + * there. This can happen if cgroup deletion path claimed the + * responsibility of cleaning up a group before queue cleanup code + * get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* Free up per cpu stats for root group */ + free_percpu(cfqd->root_group.blkg.stats_cpu); +#endif + kfree(cfqd); } static int cfq_alloc_cic_index(void) @@ -3886,8 +3974,12 @@ static void *cfq_init_queue(struct request_queue *q) return NULL; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); - if (!cfqd) + if (!cfqd) { + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, i); + spin_unlock(&cic_index_lock); return NULL; + } /* * Don't need take queue_lock in the routine, since we are @@ -3909,14 +4001,29 @@ static void *cfq_init_queue(struct request_queue *q) #ifdef CONFIG_CFQ_GROUP_IOSCHED /* - * Take a reference to root group which we never drop. This is just - * to make sure that cfq_put_cfqg() does not try to kfree root group + * Set root group reference to 2. One reference will be dropped when + * all groups on cfqd->cfqg_list are being deleted during queue exit. + * Other reference will remain there as we don't want to delete this + * group as it is statically allocated and gets destroyed when + * throtl_data goes away. */ - cfqg->ref = 1; + cfqg->ref = 2; + + if (blkio_alloc_blkg_stats(&cfqg->blkg)) { + kfree(cfqg); + kfree(cfqd); + return NULL; + } + rcu_read_lock(); + cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 0); rcu_read_unlock(); + cfqd->nr_blkcg_linked_grps++; + + /* Add group on cfqd->cfqg_list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); #endif /* * Not strictly needed (since RB_ROOT just clears the node and we |