diff options
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r-- | kernel/sched_fair.c | 220 |
1 files changed, 148 insertions, 72 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 37087a7fac22..5bedf6e3ebf3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,6 +21,7 @@ */ #include <linux/latencytop.h> +#include <linux/sched.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -35,12 +36,26 @@ * run vmstat and monitor the context-switches (cs) field) */ unsigned int sysctl_sched_latency = 5000000ULL; +unsigned int normalized_sysctl_sched_latency = 5000000ULL; + +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling + = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 1000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity @@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; * have immediate wakeup/sleep latencies. */ unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ #ifdef CONFIG_SCHED_DEBUG -int sched_nr_latency_handler(struct ctl_table *table, int write, +int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); +#define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) + WRT_SYSCTL(sched_min_granularity); + WRT_SYSCTL(sched_latency); + WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_shares_ratelimit); +#undef WRT_SYSCTL + return 0; } #endif @@ -1345,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* + * Try and locate an idle CPU in the sched_domain. + */ +static int +select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int i; + + /* + * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE + * test in select_task_rq_fair) and the prev_cpu is idle then that's + * always a better target than the current cpu. + */ + if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + return prev_cpu; + + /* + * Otherwise, iterate the domain and find an elegible idle cpu. + */ + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; + } + } + + return target; +} + +/* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and * SD_BALANCE_EXEC. @@ -1372,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag new_cpu = prev_cpu; } - rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If power savings logic is enabled for a domain, see if we @@ -1398,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag want_sd = 0; } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && - cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + /* + * While iterating the domains looking for a spanning + * WAKE_AFFINE domain, adjust the affine target to any idle cpu + * in cache sharing domains along the way. + */ + if (want_affine) { + int target = -1; + + /* + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. + */ + if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) + target = cpu; - affine_sd = tmp; - want_affine = 0; + /* + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. + */ + if (tmp->flags & SD_PREFER_SIBLING) + target = select_idle_sibling(p, tmp, target); + + if (target >= 0) { + if (tmp->flags & SD_WAKE_AFFINE) { + affine_sd = tmp; + want_affine = 0; + } + cpu = target; + } } if (!want_sd && !want_affine) @@ -1429,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag update_shares(tmp); } - if (affine_sd && wake_affine(affine_sd, p, sync)) { - new_cpu = cpu; - goto out; - } + if (affine_sd && wake_affine(affine_sd, p, sync)) + return cpu; while (sd) { int load_idx = sd->forkexec_idx; @@ -1473,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* while loop will break here if sd == NULL */ } -out: - rcu_read_unlock(); return new_cpu; } #endif /* CONFIG_SMP */ @@ -1596,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; - update_curr(cfs_rq); - - if (unlikely(rt_prio(p->prio))) { - resched_task(curr); - return; - } + if (unlikely(rt_prio(p->prio))) + goto preempt; if (unlikely(p->sched_class != &fair_sched_class)) return; @@ -1627,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) { - resched_task(curr); - return; - } + if (unlikely(curr->policy == SCHED_IDLE)) + goto preempt; - if ((sched_feat(WAKEUP_SYNC) && sync) || - (sched_feat(WAKEUP_OVERLAP) && - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { - resched_task(curr); - return; - } + if (sched_feat(WAKEUP_SYNC) && sync) + goto preempt; - if (sched_feat(WAKEUP_RUNNING)) { - if (pse->avg_running < se->avg_running) { - set_next_buddy(pse); - resched_task(curr); - return; - } - } + if (sched_feat(WAKEUP_OVERLAP) && + se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost) + goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; + update_curr(cfs_rq); find_matching_se(&se, &pse); - BUG_ON(!pse); + if (wakeup_preempt_entity(se, pse) == 1) + goto preempt; - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - /* - * Only set the backward buddy when the current task is still - * on the rq. This can happen when a wakeup gets interleaved - * with schedule on the ->pre_schedule() or idle_balance() - * point, either of which can * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, - * for obvious reasons its a bad idea to schedule back to it. - */ - if (unlikely(!se->on_rq || curr == rq->idle)) - return; - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); - } + return; + +preempt: + resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1679,7 +1744,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; - if (unlikely(!cfs_rq->nr_running)) + if (!cfs_rq->nr_running) return NULL; do { @@ -1850,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } + +static void rq_online_fair(struct rq *rq) +{ + update_sysctl(); +} + +static void rq_offline_fair(struct rq *rq) +{ + update_sysctl(); +} + #endif /* CONFIG_SMP */ /* @@ -1867,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } /* - * Share the fairness runtime between parent and child, thus the - * total amount of pressure for CPU stays equal - new tasks - * get a chance to run but frequent forkers are not allowed to - * monopolize the CPU. Note: the parent runqueue is locked, - * the child is not running yet. + * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled */ -static void task_new_fair(struct rq *rq, struct task_struct *p) +static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct cfs_rq *cfs_rq = task_cfs_rq(current); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); + struct rq *rq = this_rq(); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); - sched_info_queued(p); + if (unlikely(task_cpu(p) != this_cpu)) + __set_task_cpu(p, this_cpu); update_curr(cfs_rq); + if (curr) se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); - /* 'curr' will be NULL if the child belongs to a different group */ - if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && entity_before(curr, se)) { + if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. @@ -1897,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } - enqueue_task_fair(rq, p, 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -1959,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p) } #endif -unsigned int get_rr_interval_fair(struct task_struct *task) +unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { struct sched_entity *se = &task->se; - unsigned long flags; - struct rq *rq; unsigned int rr_interval = 0; /* * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise * idle runqueue: */ - rq = task_rq_lock(task, &flags); if (rq->cfs.load.weight) rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); return rr_interval; } @@ -1997,11 +2071,13 @@ static const struct sched_class fair_sched_class = { .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, + .rq_online = rq_online_fair, + .rq_offline = rq_offline_fair, #endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, - .task_new = task_new_fair, + .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, |