Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/clock.c             |    6
-rw-r--r--  kernel/sched/core.c              |  920
-rw-r--r--  kernel/sched/cpufreq.c           |   18
-rw-r--r--  kernel/sched/cpufreq_schedutil.c |   25
-rw-r--r--  kernel/sched/cpupri.c            |   25
-rw-r--r--  kernel/sched/cpupri.h            |    4
-rw-r--r--  kernel/sched/cputime.c           |  307
-rw-r--r--  kernel/sched/deadline.c          |  158
-rw-r--r--  kernel/sched/debug.c             |   11
-rw-r--r--  kernel/sched/fair.c              | 2042
-rw-r--r--  kernel/sched/features.h          |    1
-rw-r--r--  kernel/sched/idle.c              |   67
-rw-r--r--  kernel/sched/isolation.c         |   18
-rw-r--r--  kernel/sched/loadavg.c           |   33
-rw-r--r--  kernel/sched/membarrier.c        |  240
-rw-r--r--  kernel/sched/pelt.c              |   20
-rw-r--r--  kernel/sched/psi.c               |   70
-rw-r--r--  kernel/sched/rt.c                |  185
-rw-r--r--  kernel/sched/sched.h             |  159
-rw-r--r--  kernel/sched/stats.h             |    7
-rw-r--r--  kernel/sched/stop_task.c         |   33
-rw-r--r--  kernel/sched/topology.c          |  112
-rw-r--r--  kernel/sched/wait.c              |   37
-rw-r--r--  kernel/sched/wait_bit.c          |    1
24 files changed, 2973 insertions, 1526 deletions
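
Note on the new cgroup uclamp interface added below (cpu.uclamp.min / cpu.uclamp.max in kernel/sched/core.c): values are written as percentages with two decimal places and converted to the kernel's capacity scale by capacity_from_percent(). The following stand-alone user-space sketch mirrors that arithmetic so the rounding behaviour can be checked in isolation; it assumes SCHED_CAPACITY_SHIFT == 10 and the same fixed-point percent scale, and is an illustration only, not kernel code.

/*
 * Sketch of the percent -> capacity conversion used by the patch's
 * capacity_from_percent(), assuming SCHED_CAPACITY_SHIFT == 10.
 * "percent" is fixed point with two fractional digits, e.g. 7525 == 75.25%.
 */
#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT	10
#define UCLAMP_PERCENT_SHIFT	2
#define UCLAMP_PERCENT_SCALE	(100 * 100)	/* 100 * 10^UCLAMP_PERCENT_SHIFT */

/* Round-to-closest division, as DIV_ROUND_CLOSEST_ULL() does in the kernel. */
static uint64_t div_round_closest(uint64_t x, uint64_t d)
{
	return (x + d / 2) / d;
}

static uint64_t capacity_from_percent(uint64_t percent)
{
	return div_round_closest(percent << SCHED_CAPACITY_SHIFT,
				 UCLAMP_PERCENT_SCALE);
}

int main(void)
{
	uint64_t samples[] = { 0, 2500, 5000, 7525, 10000 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%llu.%02llu%% -> util %llu\n",
		       (unsigned long long)(samples[i] / 100),
		       (unsigned long long)(samples[i] % 100),
		       (unsigned long long)capacity_from_percent(samples[i]));
	return 0;
}

As expected, 100.00% maps to 1024 (SCHED_CAPACITY_SCALE) and 50.00% to 512; cpu_uclamp_print() in the patch keeps the exact requested percentage in tg->uclamp_pct[] because this conversion is not losslessly reversible.
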
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 1152259a4ca0..12bca64dff73 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return sched_clock(); preempt_disable_notrace(); @@ -393,7 +393,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -460,7 +460,7 @@ void __init sched_clock_init(void) u64 sched_clock_cpu(int cpu) { - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return 0; return sched_clock(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..1a9983da4408 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include <asm/tlb.h> #include "../workqueue_internal.h" +#include "../../fs/io-wq.h" #include "../smpboot.h" #include "pelt.h" @@ -255,7 +256,7 @@ static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -314,7 +315,7 @@ void hrtick_start(struct rq *rq, u64 delay) */ delay = max_t(u64, delay, 10000LL); hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } #endif /* CONFIG_SMP */ @@ -328,7 +329,7 @@ static void hrtick_rq_init(struct rq *rq) rq->hrtick_csd.info = rq; #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } #else /* CONFIG_SCHED_HRTICK */ @@ -551,27 +552,32 @@ void resched_cpu(int cpu) */ int get_nohz_timer_target(void) { - int i, cpu = smp_processor_id(); + int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) - return cpu; + if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (!idle_cpu(cpu)) + return cpu; + default_cpu = cpu; + } rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { + for_each_cpu_and(i, sched_domain_span(sd), + housekeeping_cpumask(HK_FLAG_TIMER)) { if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { + if (!idle_cpu(i)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) - cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + if (default_cpu == -1) + default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + cpu = default_cpu; unlock: rcu_read_unlock(); return cpu; @@ -773,6 +779,18 @@ static void set_load_weight(struct task_struct *p, bool update_load) } #ifdef CONFIG_UCLAMP_TASK +/* + * Serializes updates of utilization clamp values + * + * The (slow-path) user-space triggers utilization clamp value updates which + * can require updates on (fast-path) scheduler's data structures used to + * support enqueue/dequeue operations. + * While the per-CPU rq lock protects fast-path update operations, user-space + * requests are serialized using a mutex to reduce the risk of conflicting + * updates or API abuses. 
+ */ +static DEFINE_MUTEX(uclamp_mutex); + /* Max allowed minimum utilization */ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -798,7 +816,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); } -static inline unsigned int uclamp_none(int clamp_id) +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -814,7 +832,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se, } static inline unsigned int -uclamp_idle_value(struct rq *rq, unsigned int clamp_id, +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* @@ -830,7 +848,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id, return uclamp_none(UCLAMP_MIN); } -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* Reset max-clamp retention only on idle exit */ @@ -841,8 +859,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, } static inline -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, - unsigned int clamp_value) +unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, + unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; int bucket_id = UCLAMP_BUCKETS - 1; @@ -861,16 +879,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static inline struct uclamp_se +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) +{ + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; +#ifdef CONFIG_UCLAMP_TASK_GROUP + struct uclamp_se uc_max; + + /* + * Tasks in autogroups or root task group will be + * restricted by system defaults. 
+ */ + if (task_group_is_autogroup(task_group(p))) + return uc_req; + if (task_group(p) == &root_task_group) + return uc_req; + + uc_max = task_group(p)->uclamp[clamp_id]; + if (uc_req.value > uc_max.value || !uc_req.user_defined) + return uc_max; +#endif + + return uc_req; +} + /* * The effective clamp bucket index of a task depends on, by increasing * priority: * - the task specific clamp value, when explicitly requested from userspace + * - the task group effective clamp value, for tasks not either in the root + * group or in an autogroup * - the system default clamp value, defined by the sysadmin */ static inline struct uclamp_se -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) { - struct uclamp_se uc_req = p->uclamp_req[clamp_id]; + struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id]; /* System default restrictions always apply */ @@ -880,17 +924,17 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) return uc_req; } -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; /* Task currently refcounted: use back-annotated (effective) value */ if (p->uclamp[clamp_id].active) - return p->uclamp[clamp_id].value; + return (unsigned long)p->uclamp[clamp_id].value; uc_eff = uclamp_eff_get(p, clamp_id); - return uc_eff.value; + return (unsigned long)uc_eff.value; } /* @@ -904,7 +948,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) * for each bucket when all its RUNNABLE tasks require the same clamp. */ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -942,7 +986,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, * enforce the expected state and warn. */ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -981,7 +1025,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -996,7 +1040,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1005,15 +1049,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +{ + struct rq_flags rf; + struct rq *rq; + + /* + * Lock the task and the rq where the task is (or was) queued. + * + * We might lock the (previous) rq of a !RUNNABLE task, but that's the + * price to pay to safely serialize util_{min,max} updates with + * enqueues, dequeues and migration operations. + * This is the same locking schema used by __set_cpus_allowed_ptr(). 
+ */ + rq = task_rq_lock(p, &rf); + + /* + * Setting the clamp bucket is serialized by task_rq_lock(). + * If the task is not yet RUNNABLE and its task_struct is not + * affecting a valid clamp bucket, the next time it's enqueued, + * it will already see the updated clamp bucket value. + */ + if (p->uclamp[clamp_id].active) { + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + } + + task_rq_unlock(rq, p, &rf); +} + +#ifdef CONFIG_UCLAMP_TASK_GROUP +static inline void +uclamp_update_active_tasks(struct cgroup_subsys_state *css, + unsigned int clamps) +{ + enum uclamp_id clamp_id; + struct css_task_iter it; + struct task_struct *p; + + css_task_iter_start(css, 0, &it); + while ((p = css_task_iter_next(&it))) { + for_each_clamp_id(clamp_id) { + if ((0x1 << clamp_id) & clamps) + uclamp_update_active(p, clamp_id); + } + } + css_task_iter_end(&it); +} + +static void cpu_util_update_eff(struct cgroup_subsys_state *css); +static void uclamp_update_root_tg(void) +{ + struct task_group *tg = &root_task_group; + + uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], + sysctl_sched_uclamp_util_min, false); + uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], + sysctl_sched_uclamp_util_max, false); + + rcu_read_lock(); + cpu_util_update_eff(&root_task_group.css); + rcu_read_unlock(); +} +#else +static void uclamp_update_root_tg(void) { } +#endif + int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + bool update_root_tg = false; int old_min, old_max; - static DEFINE_MUTEX(mutex); int result; - mutex_lock(&mutex); + mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; @@ -1032,23 +1143,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (old_min != sysctl_sched_uclamp_util_min) { uclamp_se_set(&uclamp_default[UCLAMP_MIN], sysctl_sched_uclamp_util_min, false); + update_root_tg = true; } if (old_max != sysctl_sched_uclamp_util_max) { uclamp_se_set(&uclamp_default[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); + update_root_tg = true; } + if (update_root_tg) + uclamp_update_root_tg(); + /* - * Updating all the RUNNABLE task is expensive, keep it simple and do - * just a lazy update at each next enqueue time. + * We update all RUNNABLE tasks only when task groups are in use. + * Otherwise, keep it simple and do just a lazy update at each next + * task enqueue time. 
*/ + goto done; undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; done: - mutex_unlock(&mutex); + mutex_unlock(&uclamp_mutex); return result; } @@ -1075,7 +1193,7 @@ static int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { - unsigned int clamp_id; + enum uclamp_id clamp_id; /* * On scheduling class change, reset to default clamps for tasks @@ -1112,7 +1230,7 @@ static void __setscheduler_uclamp(struct task_struct *p, static void uclamp_fork(struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1134,11 +1252,14 @@ static void uclamp_fork(struct task_struct *p) static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; - unsigned int clamp_id; + enum uclamp_id clamp_id; int cpu; + mutex_init(&uclamp_mutex); + for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + memset(&cpu_rq(cpu)->uclamp, 0, + sizeof(struct uclamp_rq)*UCLAMP_CNT); cpu_rq(cpu)->uclamp_flags = 0; } @@ -1149,8 +1270,13 @@ static void __init init_uclamp(void) /* System defaults allow max clamp values for both indexes */ uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); - for_each_clamp_id(clamp_id) + for_each_clamp_id(clamp_id) { uclamp_default[clamp_id] = uc_max; +#ifdef CONFIG_UCLAMP_TASK_GROUP + root_task_group.uclamp_req[clamp_id] = uc_max; + root_task_group.uclamp[clamp_id] = uc_max; +#endif + } } #else /* CONFIG_UCLAMP_TASK */ @@ -1321,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static inline bool is_per_cpu_kthread(struct task_struct *p) -{ - if (!(p->flags & PF_KTHREAD)) - return false; - - if (p->nr_cpus_allowed != 1) - return false; - - return true; -} - /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). @@ -1494,7 +1609,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); } /* @@ -1537,7 +1652,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } @@ -1558,7 +1674,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ @@ -2986,7 +3101,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + spin_release(&rq->lock.dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = next; @@ -3135,7 +3250,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) /* Task is done with its stack. 
*/ put_task_stack(prev); - put_task_struct(prev); + put_task_struct_rcu_user(prev); } tick_nohz_task_switch(); @@ -3214,12 +3329,8 @@ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) { - struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, prev, next); - mm = next->mm; - oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -3228,22 +3339,37 @@ context_switch(struct rq *rq, struct task_struct *prev, arch_start_context_switch(prev); /* - * If mm is non-NULL, we pass through switch_mm(). If mm is - * NULL, we will pass through mmdrop() in finish_task_switch(). - * Both of these contain the full memory barrier required by - * membarrier after storing to rq->curr, before returning to - * user-space. + * kernel -> kernel lazy + transfer active + * user -> kernel lazy + mmgrab() active + * + * kernel -> user switch + mmdrop() active + * user -> user switch */ - if (!mm) { - next->active_mm = oldmm; - mmgrab(oldmm); - enter_lazy_tlb(oldmm, next); - } else - switch_mm_irqs_off(oldmm, mm, next); + if (!next->mm) { // to kernel + enter_lazy_tlb(prev->active_mm, next); - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; + next->active_mm = prev->active_mm; + if (prev->mm) // from user + mmgrab(prev->active_mm); + else + prev->active_mm = NULL; + } else { // to user + membarrier_switch_mm(rq, prev->active_mm, next->mm); + /* + * sys_membarrier() requires an smp_mb() between setting + * rq->curr / membarrier_switch_mm() and returning to userspace. + * + * The below provides this either through switch_mm(), or in + * case 'prev->active_mm == next->mm' through + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ + rq->prev_mm = prev->active_mm; + prev->active_mm = NULL; + } } rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); @@ -3486,8 +3612,36 @@ void scheduler_tick(void) struct tick_work { int cpu; + atomic_t state; struct delayed_work work; }; +/* Values for ->state, see diagram below. */ +#define TICK_SCHED_REMOTE_OFFLINE 0 +#define TICK_SCHED_REMOTE_OFFLINING 1 +#define TICK_SCHED_REMOTE_RUNNING 2 + +/* + * State diagram for ->state: + * + * + * TICK_SCHED_REMOTE_OFFLINE + * | ^ + * | | + * | | sched_tick_remote() + * | | + * | | + * +--TICK_SCHED_REMOTE_OFFLINING + * | ^ + * | | + * sched_tick_start() | | sched_tick_stop() + * | | + * V | + * TICK_SCHED_REMOTE_RUNNING + * + * + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() + * and sched_tick_start() are happy to leave the state in RUNNING. + */ static struct tick_work __percpu *tick_work_cpu; @@ -3500,6 +3654,7 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr; struct rq_flags rf; u64 delta; + int os; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3508,38 +3663,47 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. 
*/ - if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + if (!tick_nohz_tick_stopped_cpu(cpu)) goto out_requeue; rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr)) + if (cpu_is_offline(cpu)) goto out_unlock; + curr = rq->curr; update_rq_clock(rq); - delta = rq_clock_task(rq) - curr->se.exec_start; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } curr->sched_class->task_tick(rq, curr, 0); + calc_load_nohz_remote(rq); out_unlock: rq_unlock_irq(rq, &rf); - out_requeue: + /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough - * to keep scheduler internal stats reasonably up to date. + * to keep scheduler internal stats reasonably up to date. But + * first update state to reflect hotplug activity if required. */ - queue_delayed_work(system_unbound_wq, dwork, HZ); + os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); + if (os == TICK_SCHED_REMOTE_RUNNING) + queue_delayed_work(system_unbound_wq, dwork, HZ); } static void sched_tick_start(int cpu) { + int os; struct tick_work *twork; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) @@ -3548,15 +3712,20 @@ static void sched_tick_start(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - twork->cpu = cpu; - INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); + if (os == TICK_SCHED_REMOTE_OFFLINE) { + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); + } } #ifdef CONFIG_HOTPLUG_CPU static void sched_tick_stop(int cpu) { struct tick_work *twork; + int os; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) return; @@ -3564,7 +3733,10 @@ static void sched_tick_stop(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - cancel_delayed_work_sync(&twork->work); + /* There cannot be competing actions, but don't rely on stop-machine. */ + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); + WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); + /* Don't cancel, as this would mess up the state machine. 
*/ } #endif /* CONFIG_HOTPLUG_CPU */ @@ -3572,7 +3744,6 @@ int __init sched_tick_offload_init(void) { tick_work_cpu = alloc_percpu(struct tick_work); BUG_ON(!tick_work_cpu); - return 0; } @@ -3581,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } #endif -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) /* * If the value passed in is equal to the current preempt count @@ -3700,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev) /* * Various schedule()-time debugging checks and statistics: */ -static inline void schedule_debug(struct task_struct *prev) +static inline void schedule_debug(struct task_struct *prev, bool preempt) { #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + if (!preempt && prev->state && prev->non_block_count) { + printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", + prev->comm, prev->pid, prev->non_block_count); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); + } +#endif + if (unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); preempt_count_set(PREEMPT_DISABLED); @@ -3737,25 +3917,41 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) prev->sched_class == &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev, rf); + p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) - goto again; + goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ - if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev, rf); + if (!p) { + put_prev_task(rq, prev); + p = pick_next_task_idle(rq); + } return p; } -again: +restart: +#ifdef CONFIG_SMP + /* + * We must do the balancing pass before put_next_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); + for_each_class(class) { - p = class->pick_next_task(rq, prev, rf); - if (p) { - if (unlikely(p == RETRY_TASK)) - goto again; + p = class->pick_next_task(rq); + if (p) return p; - } } /* The idle class should always have a runnable task: */ @@ -3782,7 +3978,7 @@ again: * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets * called on the nearest possible occasion: * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * - If the kernel is preemptible (CONFIG_PREEMPTION=y): * * - in syscall or exception context, at the next outmost * preempt_enable(). 
(this might be as soon as the wake_up()'s @@ -3791,7 +3987,7 @@ again: * - in IRQ context, return from interrupt-handler to * preemptible context * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) * then at the next: * * - cond_resched() call @@ -3813,7 +4009,7 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev); + schedule_debug(prev, preempt); if (sched_feat(HRTICK)) hrtick_clear(rq); @@ -3857,7 +4053,11 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; - rq->curr = next; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). + */ + RCU_INIT_POINTER(rq->curr, next); /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -3904,7 +4104,7 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + if (!tsk->state) return; /* @@ -3914,12 +4114,18 @@ static inline void sched_submit_work(struct task_struct *tsk) * we disable preemption to avoid it calling schedule() again * in the possible wakeup of a kworker. */ - if (tsk->flags & PF_WQ_WORKER) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - wq_worker_sleeping(tsk); + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else + io_wq_worker_sleeping(tsk); preempt_enable_no_resched(); } + if (tsk_is_pi_blocked(tsk)) + return; + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -3930,8 +4136,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & PF_WQ_WORKER) - wq_worker_running(tsk); + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); + else + io_wq_worker_running(tsk); + } } asmlinkage __visible void __sched schedule(void) @@ -4033,11 +4243,10 @@ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. + * This is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. */ asmlinkage __visible void __sched notrace preempt_schedule(void) { @@ -4105,10 +4314,10 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* - * this is the entry point to schedule() from kernel preemption + * This is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. 
@@ -4273,7 +4482,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (queued) enqueue_task(rq, p, queue_flag); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4294,7 +4503,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio, delta; + int old_prio; struct rq_flags rf; struct rq *rq; @@ -4328,19 +4537,18 @@ void set_user_nice(struct task_struct *p, long nice) set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); - delta = p->prio - old_prio; - if (queued) { + if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_curr(rq); - } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); + + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + p->sched_class->prio_changed(rq, p, old_prio); + out_unlock: task_rq_unlock(rq, p, &rf); } @@ -4657,6 +4865,9 @@ recheck: return retval; } + if (pi) + cpuset_read_lock(); + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4671,8 +4882,8 @@ recheck: * Changing the policy of the stop threads its a very bad idea: */ if (p == rq->stop) { - task_rq_unlock(rq, p, &rf); - return -EINVAL; + retval = -EINVAL; + goto unlock; } /* @@ -4690,8 +4901,8 @@ recheck: goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &rf); - return 0; + retval = 0; + goto unlock; } change: @@ -4704,8 +4915,8 @@ change: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } #endif #ifdef CONFIG_SMP @@ -4720,8 +4931,8 @@ change: */ if (!cpumask_subset(span, p->cpus_ptr) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } } #endif @@ -4731,6 +4942,8 @@ change: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); goto recheck; } @@ -4740,8 +4953,8 @@ change: * is available. 
*/ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &rf); - return -EBUSY; + retval = -EBUSY; + goto unlock; } p->sched_reset_on_fork = reset_on_fork; @@ -4783,7 +4996,7 @@ change: enqueue_task(rq, p, queue_flags); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); @@ -4791,14 +5004,22 @@ change: preempt_disable(); task_rq_unlock(rq, p, &rf); - if (pi) + if (pi) { + cpuset_read_unlock(); rt_mutex_adjust_pi(p); + } /* Run balance callbacks after we've adjusted the PI chain: */ balance_callback(rq); preempt_enable(); return 0; + +unlock: + task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); + return retval; } static int _sched_setscheduler(struct task_struct *p, int policy, @@ -4882,10 +5103,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); + if (likely(p)) + get_task_struct(p); rcu_read_unlock(); + if (likely(p)) { + retval = sched_setscheduler(p, policy, &lparam); + put_task_struct(p); + } + return retval; } @@ -4897,9 +5123,6 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a u32 size; int ret; - if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - /* Zero the full structure, so that a short copy will be nice: */ memset(attr, 0, sizeof(*attr)); @@ -4907,45 +5130,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return ret; - /* Bail out on silly large: */ - if (size > PAGE_SIZE) - goto err_size; - /* ABI compatibility quirk: */ if (!size) size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -5102,37 +5299,40 @@ out_unlock: return retval; } -static int sched_read_attr(struct sched_attr __user *uattr, - struct sched_attr *attr, - unsigned int usize) +/* + * Copy the kernel size attribute structure (which might be larger + * than what user-space knows about) to user-space. + * + * Note that all cases are valid: user-space buffer can be larger or + * smaller than the kernel-space buffer. The usual case is that both + * have the same size. + */ +static int +sched_attr_copy_to_user(struct sched_attr __user *uattr, + struct sched_attr *kattr, + unsigned int usize) { - int ret; + unsigned int ksize = sizeof(*kattr); if (!access_ok(uattr, usize)) return -EFAULT; /* - * If we're handed a smaller struct than we know of, - * ensure all the unknown bits are 0 - i.e. 
old - * user-space does not get uncomplete information. + * sched_getattr() ABI forwards and backwards compatibility: + * + * If usize == ksize then we just copy everything to user-space and all is good. + * + * If usize < ksize then we only copy as much as user-space has space for, + * this keeps ABI compatibility as well. We skip the rest. + * + * If usize > ksize then user-space is using a newer version of the ABI, + * which part the kernel doesn't know about. Just ignore it - tooling can + * detect the kernel's knowledge of attributes from the attr->size value + * which is set to ksize in this case. */ - if (usize < sizeof(*attr)) { - unsigned char *addr; - unsigned char *end; - - addr = (void *)attr + usize; - end = (void *)attr + sizeof(*attr); + kattr->size = min(usize, ksize); - for (; addr < end; addr++) { - if (*addr) - return -EFBIG; - } - - attr->size = usize; - } - - ret = copy_to_user(uattr, attr, attr->size); - if (ret) + if (copy_to_user(uattr, kattr, kattr->size)) return -EFAULT; return 0; @@ -5142,20 +5342,18 @@ static int sched_read_attr(struct sched_attr __user *uattr, * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. - * @size: sizeof(attr) for fwd/bwd comp. + * @usize: sizeof(attr) for fwd/bwd comp. * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size, unsigned int, flags) + unsigned int, usize, unsigned int, flags) { - struct sched_attr attr = { - .size = sizeof(struct sched_attr), - }; + struct sched_attr kattr = { }; struct task_struct *p; int retval; - if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0 || flags) + if (!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); @@ -5168,25 +5366,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, if (retval) goto out_unlock; - attr.sched_policy = p->policy; + kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) - attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; if (task_has_dl_policy(p)) - __getparam_dl(p, &attr); + __getparam_dl(p, &kattr); else if (task_has_rt_policy(p)) - attr.sched_priority = p->rt_priority; + kattr.sched_priority = p->rt_priority; else - attr.sched_nice = task_nice(p); + kattr.sched_nice = task_nice(p); #ifdef CONFIG_UCLAMP_TASK - attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif rcu_read_unlock(); - retval = sched_read_attr(uattr, &attr, size); - return retval; + return sched_attr_copy_to_user(uattr, &kattr, usize); out_unlock: rcu_read_unlock(); @@ -5416,7 +5613,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION int __sched _cond_resched(void) { if (should_resched(0)) { @@ -5433,7 +5630,7 @@ EXPORT_SYMBOL(_cond_resched); * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). 
*/ @@ -5830,10 +6027,11 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + __sched_fork(0, idle); + raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_lock(&rq->lock); - __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; @@ -5863,7 +6061,8 @@ void init_idle(struct task_struct *idle, int cpu) __set_task_cpu(idle, cpu); rcu_read_unlock(); - rq->curr = rq->idle = idle; + rq->idle = idle; + rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP idle->on_cpu = 1; @@ -5972,7 +6171,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -6012,21 +6211,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +static struct task_struct *__pick_migrate_task(struct rq *rq) { -} + const struct sched_class *class; + struct task_struct *next; -static const struct sched_class fake_sched_class = { - .put_prev_task = put_prev_task_fake, -}; + for_each_class(class) { + next = class->pick_next_task(rq); + if (next) { + next->sched_class->put_prev_task(rq, next); + return next; + } + } -static struct task_struct fake_task = { - /* - * Avoid pull_{rt,dl}_task() - */ - .prio = MAX_PRIO + 1, - .sched_class = &fake_sched_class, -}; + /* The idle class should always have a runnable task */ + BUG(); +} /* * Migrate all tasks from the rq, sleeping tasks will be migrated by @@ -6069,12 +6269,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) if (rq->nr_running == 1) break; - /* - * pick_next_task() assumes pinned rq->lock: - */ - next = pick_next_task(rq, &fake_task, rf); - BUG_ON(!next); - put_prev_task(rq, next); + next = __pick_migrate_task(rq); /* * Rules for changing task_struct::cpus_mask are holding @@ -6228,8 +6423,6 @@ int sched_cpu_activate(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); - update_max_interval(); - return 0; } @@ -6371,19 +6564,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { - unsigned long alloc_size = 0, ptr; + unsigned long ptr = 0; int i; wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif #ifdef CONFIG_RT_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif - if (alloc_size) { - ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); + if (ptr) { + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.se = (struct sched_entity **)ptr; @@ -6570,7 +6763,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) rcu_sleep_check(); if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || + !is_idle_task(current) && !current->non_block_count) || system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || oops_in_progress) return; @@ -6586,8 +6779,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset) "BUG: sleeping function called from invalid context at %s:%d\n", file, line); printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), + "in_atomic(): %d, 
irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->non_block_count, current->pid, current->comm); if (task_stack_end_corrupted(current)) @@ -6702,7 +6895,7 @@ struct task_struct *curr_task(int cpu) #ifdef CONFIG_IA64 /** - * set_curr_task - set the current task for a given CPU. + * ia64_set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @p: the task pointer to set. * @@ -6727,6 +6920,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); +static inline void alloc_uclamp_sched_group(struct task_group *tg, + struct task_group *parent) +{ +#ifdef CONFIG_UCLAMP_TASK_GROUP + enum uclamp_id clamp_id; + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&tg->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + tg->uclamp[clamp_id] = parent->uclamp[clamp_id]; + } +#endif +} + static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -6750,6 +6957,8 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + alloc_uclamp_sched_group(tg, parent); + return tg; err: @@ -6852,8 +7061,15 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); - if (running) - set_curr_task(rq, tsk); + if (running) { + set_next_task(rq, tsk); + /* + * After changing group, the running task may have joined a + * throttled one but it's still the running task. Trigger a + * resched to make sure that task can still run. + */ + resched_curr(rq); + } task_rq_unlock(rq, tsk, &rf); } @@ -6889,6 +7105,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) if (parent) sched_online_group(tg, parent); + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* Propagate the effective uclamp value for the new group */ + cpu_util_update_eff(css); +#endif + return 0; } @@ -6936,10 +7158,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#else - /* We don't support RT-tasks being in separate groups */ - if (task->sched_class != &fair_sched_class) - return -EINVAL; #endif /* * Serialize against wake_up_new_task() such that if its @@ -6970,6 +7188,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) sched_move_task(task); } +#ifdef CONFIG_UCLAMP_TASK_GROUP +static void cpu_util_update_eff(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys_state *top_css = css; + struct uclamp_se *uc_parent = NULL; + struct uclamp_se *uc_se = NULL; + unsigned int eff[UCLAMP_CNT]; + enum uclamp_id clamp_id; + unsigned int clamps; + + css_for_each_descendant_pre(css, top_css) { + uc_parent = css_tg(css)->parent + ? 
css_tg(css)->parent->uclamp : NULL; + + for_each_clamp_id(clamp_id) { + /* Assume effective clamps matches requested clamps */ + eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value; + /* Cap effective clamps with parent's effective clamps */ + if (uc_parent && + eff[clamp_id] > uc_parent[clamp_id].value) { + eff[clamp_id] = uc_parent[clamp_id].value; + } + } + /* Ensure protection is always capped by limit */ + eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]); + + /* Propagate most restrictive effective clamps */ + clamps = 0x0; + uc_se = css_tg(css)->uclamp; + for_each_clamp_id(clamp_id) { + if (eff[clamp_id] == uc_se[clamp_id].value) + continue; + uc_se[clamp_id].value = eff[clamp_id]; + uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]); + clamps |= (0x1 << clamp_id); + } + if (!clamps) { + css = css_rightmost_descendant(css); + continue; + } + + /* Immediately update descendants RUNNABLE tasks */ + uclamp_update_active_tasks(css, clamps); + } +} + +/* + * Integer 10^N with a given N exponent by casting to integer the literal "1eN" + * C expression. Since there is no way to convert a macro argument (N) into a + * character constant, use two levels of macros. + */ +#define _POW10(exp) ((unsigned int)1e##exp) +#define POW10(exp) _POW10(exp) + +struct uclamp_request { +#define UCLAMP_PERCENT_SHIFT 2 +#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT)) + s64 percent; + u64 util; + int ret; +}; + +static inline struct uclamp_request +capacity_from_percent(char *buf) +{ + struct uclamp_request req = { + .percent = UCLAMP_PERCENT_SCALE, + .util = SCHED_CAPACITY_SCALE, + .ret = 0, + }; + + buf = strim(buf); + if (strcmp(buf, "max")) { + req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT, + &req.percent); + if (req.ret) + return req; + if ((u64)req.percent > UCLAMP_PERCENT_SCALE) { + req.ret = -ERANGE; + return req; + } + + req.util = req.percent << SCHED_CAPACITY_SHIFT; + req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE); + } + + return req; +} + +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, + enum uclamp_id clamp_id) +{ + struct uclamp_request req; + struct task_group *tg; + + req = capacity_from_percent(buf); + if (req.ret) + return req.ret; + + mutex_lock(&uclamp_mutex); + rcu_read_lock(); + + tg = css_tg(of_css(of)); + if (tg->uclamp_req[clamp_id].value != req.util) + uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false); + + /* + * Because of not recoverable conversion rounding we keep track of the + * exact requested value + */ + tg->uclamp_pct[clamp_id] = req.percent; + + /* Update effective clamps to track the most restrictive value */ + cpu_util_update_eff(of_css(of)); + + rcu_read_unlock(); + mutex_unlock(&uclamp_mutex); + + return nbytes; +} + +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN); +} + +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX); +} + +static inline void cpu_uclamp_print(struct seq_file *sf, + enum uclamp_id clamp_id) +{ + struct task_group *tg; + u64 util_clamp; + u64 percent; + u32 rem; + + rcu_read_lock(); + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + rcu_read_unlock(); + + if (util_clamp == SCHED_CAPACITY_SCALE) { + seq_puts(sf, "max\n"); + return; + } + + percent = 
tg->uclamp_pct[clamp_id]; + percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem); + seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem); +} + +static int cpu_uclamp_min_show(struct seq_file *sf, void *v) +{ + cpu_uclamp_print(sf, UCLAMP_MIN); + return 0; +} + +static int cpu_uclamp_max_show(struct seq_file *sf, void *v) +{ + cpu_uclamp_print(sf, UCLAMP_MAX); + return 0; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -7315,6 +7705,20 @@ static struct cftype cpu_legacy_files[] = { .write_u64 = cpu_rt_period_write_uint, }, #endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + { + .name = "uclamp.min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_min_show, + .write = cpu_uclamp_min_write, + }, + { + .name = "uclamp.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_max_show, + .write = cpu_uclamp_max_write, + }, +#endif { } /* Terminate */ }; @@ -7482,6 +7886,20 @@ static struct cftype cpu_files[] = { .write = cpu_max_write, }, #endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + { + .name = "uclamp.min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_min_show, + .write = cpu_uclamp_min_write, + }, + { + .name = "uclamp.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_max_show, + .write = cpu_uclamp_max_write, + }, +#endif { } /* terminate */ }; diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index b5dcd1d83c7f..7c2fe50fd76d 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,6 +5,8 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ +#include <linux/cpufreq.h> + #include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); @@ -57,3 +59,19 @@ void cpufreq_remove_update_util_hook(int cpu) rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL); } EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook); + +/** + * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated. + * @policy: cpufreq policy to check. + * + * Return 'true' if: + * - the local and remote CPUs share @policy, + * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going + * offline (in which case it is not expected to run cpufreq updates any more). + */ +bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy) +{ + return cpumask_test_cpu(smp_processor_id(), policy->cpus) || + (policy->dvfs_possible_from_any_cpu && + rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data))); +} diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 867b4bb6d4be..7fbaee24c824 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -82,12 +82,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) * by the hardware, as calculating the frequency is pointless if * we cannot in fact act on it. * - * For the slow switching platforms, the kthread is always scheduled on - * the right set of CPUs and any CPU can find the next frequency and - * schedule the kthread. + * This is needed on the slow switching platforms too to prevent CPUs + * going offline from leaving stale IRQ work items behind. 
*/ - if (sg_policy->policy->fast_switch_enabled && - !cpufreq_this_cpu_can_update(sg_policy->policy)) + if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; if (unlikely(sg_policy->limits_changed)) { @@ -117,6 +115,7 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { struct cpufreq_policy *policy = sg_policy->policy; + int cpu; if (!sugov_update_next_freq(sg_policy, time, next_freq)) return; @@ -126,7 +125,11 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, return; policy->cur = next_freq; - trace_cpu_frequency(next_freq, smp_processor_id()); + + if (trace_cpu_frequency_enabled()) { + for_each_cpu(cpu, policy->cpus) + trace_cpu_frequency(next_freq, cpu); + } } static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, @@ -235,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, */ util = util_cfs + cpu_util_rt(rq); if (type == FREQUENCY_UTIL) - util = uclamp_util_with(rq, util, p); + util = uclamp_rq_util_with(rq, util, p); dl_util = cpu_util_dl(rq); @@ -263,9 +266,9 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, * irq metric. Because IRQ/steal time is hidden from the task clock we * need to scale the task numbers: * - * 1 - irq - * U' = irq + ------- * U - * max + * max - irq + * U' = irq + --------- * U + * max */ util = scale_irq_capacity(util, irq, max); util += irq; @@ -910,7 +913,7 @@ static int __init sugov_register(void) { return cpufreq_register_governor(&schedutil_gov); } -fs_initcall(sugov_register); +core_initcall(sugov_register); #ifdef CONFIG_ENERGY_MODEL extern bool sched_energy_update; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b7abca987d94..1a2719e1350a 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -46,6 +46,8 @@ static int convert_prio(int prio) * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * @fitness_fn: A pointer to a function to do custom checks whether the CPU + * fits a specific criteria so that we only return those CPUs. * * Note: This function returns the recommended CPUs as calculated during the * current invocation. By the time the call returns, the CPUs may have in @@ -57,7 +59,8 @@ static int convert_prio(int prio) * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask) + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)) { int idx = 0; int task_pri = convert_prio(p->prio); @@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, continue; if (lowest_mask) { + int cpu; + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); /* @@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * condition, simply act as though we never hit this * priority level and continue on. 
*/ - if (cpumask_any(lowest_mask) >= nr_cpu_ids) + if (cpumask_empty(lowest_mask)) + continue; + + if (!fitness_fn) + return 1; + + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + } + + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) continue; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 7dc20a3232e7..32dd520db11f 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -18,7 +18,9 @@ struct cpupri { }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2305ce89a26c..cff3e656566d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int ticks) + int ticks) { u64 other, cputime = TICK_NSEC * ticks; @@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); - } else if (p == rq->idle) { + } else if (p == this_rq()->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, cputime); @@ -392,40 +392,36 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, static void irqtime_account_idle_ticks(int ticks) { - struct rq *rq = this_rq(); - - irqtime_account_process_tick(current, 0, rq, ticks); + irqtime_account_process_tick(current, 0, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int nr_ticks) { } + int nr_ticks) { } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + # ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_common_task_switch(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { if (is_idle_task(prev)) vtime_account_idle(prev); else - vtime_account_system(prev); + vtime_account_kernel(prev); vtime_flush(prev); arch_vtime_task_switch(prev); } # endif -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). Archs that + * vtime_account_kernel() and vtime_account_idle(). Archs that * have other meaning of the idle time (s390 only includes the * time spent by the CPU when it's in low power mode) must override * vtime_account(). 
@@ -436,7 +432,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) if (!in_interrupt() && is_idle_task(tsk)) vtime_account_idle(tsk); else - vtime_account_system(tsk); + vtime_account_kernel(tsk); } EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ @@ -475,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) void account_process_tick(struct task_struct *p, int user_tick) { u64 cputime, steal; - struct rq *rq = this_rq(); - if (vtime_accounting_cpu_enabled()) + if (vtime_accounting_enabled_this_cpu()) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq, 1); + irqtime_account_process_tick(p, user_tick, 1); return; } @@ -495,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, cputime); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); @@ -711,8 +706,8 @@ static u64 get_vtime_delta(struct vtime *vtime) return delta - other; } -static void __vtime_account_system(struct task_struct *tsk, - struct vtime *vtime) +static void vtime_account_system(struct task_struct *tsk, + struct vtime *vtime) { vtime->stime += get_vtime_delta(vtime); if (vtime->stime >= TICK_NSEC) { @@ -731,7 +726,17 @@ static void vtime_account_guest(struct task_struct *tsk, } } -void vtime_account_system(struct task_struct *tsk) +static void __vtime_account_kernel(struct task_struct *tsk, + struct vtime *vtime) +{ + /* We might have scheduled out from guest path */ + if (vtime->state == VTIME_GUEST) + vtime_account_guest(tsk, vtime); + else + vtime_account_system(tsk, vtime); +} + +void vtime_account_kernel(struct task_struct *tsk) { struct vtime *vtime = &tsk->vtime; @@ -739,11 +744,7 @@ void vtime_account_system(struct task_struct *tsk) return; write_seqcount_begin(&vtime->seqcount); - /* We might have scheduled out from guest path */ - if (current->flags & PF_VCPU) - vtime_account_guest(tsk, vtime); - else - __vtime_account_system(tsk, vtime); + __vtime_account_kernel(tsk, vtime); write_seqcount_end(&vtime->seqcount); } @@ -752,7 +753,7 @@ void vtime_user_enter(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk, vtime); + vtime_account_system(tsk, vtime); vtime->state = VTIME_USER; write_seqcount_end(&vtime->seqcount); } @@ -782,8 +783,9 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. 
*/
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
- current->flags |= PF_VCPU;
+ vtime_account_system(tsk, vtime);
+ tsk->flags |= PF_VCPU;
+ vtime->state = VTIME_GUEST;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -794,7 +796,8 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
- current->flags &= ~PF_VCPU;
+ tsk->flags &= ~PF_VCPU;
+ vtime->state = VTIME_SYS;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -804,19 +807,30 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(get_vtime_delta(&tsk->vtime));
}
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
{
struct vtime *vtime = &prev->vtime;
write_seqcount_begin(&vtime->seqcount);
+ if (vtime->state == VTIME_IDLE)
+ vtime_account_idle(prev);
+ else
+ __vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
+ vtime->cpu = -1;
write_seqcount_end(&vtime->seqcount);
vtime = &current->vtime;
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ if (is_idle_task(current))
+ vtime->state = VTIME_IDLE;
+ else if (current->flags & PF_VCPU)
+ vtime->state = VTIME_GUEST;
+ else
+ vtime->state = VTIME_SYS;
vtime->starttime = sched_clock();
+ vtime->cpu = smp_processor_id();
write_seqcount_end(&vtime->seqcount);
}
@@ -827,8 +841,9 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ vtime->state = VTIME_IDLE;
vtime->starttime = sched_clock();
+ vtime->cpu = cpu;
write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
@@ -846,7 +861,7 @@ u64 task_gtime(struct task_struct *t)
seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ if (vtime->state == VTIME_GUEST)
gtime += vtime->gtime + vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -877,20 +892,230 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
*utime = t->utime;
*stime = t->stime;
- /* Task is sleeping, nothing to add */
- if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+ /* Task is sleeping or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
- * Task runs either in user or kernel space, add pending nohz time to
- * the right place.
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
*/
- if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
- *utime += vtime->utime + delta;
- else if (vtime->state == VTIME_SYS)
+ if (vtime->state == VTIME_SYS)
*stime += vtime->stime + delta;
+ else
+ *utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
}
+
+static int vtime_state_check(struct vtime *vtime, int cpu)
+{
+ /*
+ * We raced against a context switch, fetch the
+ * kcpustat task again.
+ */
+ if (vtime->cpu != cpu && vtime->cpu != -1)
+ return -EAGAIN;
+
+ /*
+ * Two possible things here:
+ * 1) We are seeing the scheduling out task (prev) or any past one.
+ * 2) We are seeing the scheduling in task (next) but it hasn't
+ * passed though vtime_task_switch() yet so the pending
+ * cputime of the prev task may not be flushed yet.
+ *
+ * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
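All of the readers above (task_gtime(), task_cputime() and the new kcpustat helpers further down) share the same lockless read pattern against the vtime seqcount. A stripped-down sketch of that read side, illustrative only; the real readers also fold in vtime_delta() according to vtime->state:

/* Snapshot a vtime-protected field; retry if a writer raced with us. */
static u64 vtime_read_u64(struct vtime *vtime, const u64 *field)
{
	unsigned int seq;
	u64 val;

	do {
		/* Writers such as vtime_task_switch_generic() bump the seqcount. */
		seq = read_seqcount_begin(&vtime->seqcount);
		val = READ_ONCE(*field);
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return val;
}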
+ */ + if (vtime->state == VTIME_INACTIVE) + return -EAGAIN; + + return 0; +} + +static u64 kcpustat_user_vtime(struct vtime *vtime) +{ + if (vtime->state == VTIME_USER) + return vtime->utime + vtime_delta(vtime); + else if (vtime->state == VTIME_GUEST) + return vtime->gtime + vtime_delta(vtime); + return 0; +} + +static int kcpustat_field_vtime(u64 *cpustat, + struct task_struct *tsk, + enum cpu_usage_stat usage, + int cpu, u64 *val) +{ + struct vtime *vtime = &tsk->vtime; + unsigned int seq; + int err; + + do { + seq = read_seqcount_begin(&vtime->seqcount); + + err = vtime_state_check(vtime, cpu); + if (err < 0) + return err; + + *val = cpustat[usage]; + + /* + * Nice VS unnice cputime accounting may be inaccurate if + * the nice value has changed since the last vtime update. + * But proper fix would involve interrupting target on nice + * updates which is a no go on nohz_full (although the scheduler + * may still interrupt the target if rescheduling is needed...) + */ + switch (usage) { + case CPUTIME_SYSTEM: + if (vtime->state == VTIME_SYS) + *val += vtime->stime + vtime_delta(vtime); + break; + case CPUTIME_USER: + if (task_nice(tsk) <= 0) + *val += kcpustat_user_vtime(vtime); + break; + case CPUTIME_NICE: + if (task_nice(tsk) > 0) + *val += kcpustat_user_vtime(vtime); + break; + case CPUTIME_GUEST: + if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0) + *val += vtime->gtime + vtime_delta(vtime); + break; + case CPUTIME_GUEST_NICE: + if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0) + *val += vtime->gtime + vtime_delta(vtime); + break; + default: + break; + } + } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return 0; +} + +u64 kcpustat_field(struct kernel_cpustat *kcpustat, + enum cpu_usage_stat usage, int cpu) +{ + u64 *cpustat = kcpustat->cpustat; + struct rq *rq; + u64 val; + int err; + + if (!vtime_accounting_enabled_cpu(cpu)) + return cpustat[usage]; + + rq = cpu_rq(cpu); + + for (;;) { + struct task_struct *curr; + + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (WARN_ON_ONCE(!curr)) { + rcu_read_unlock(); + return cpustat[usage]; + } + + err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val); + rcu_read_unlock(); + + if (!err) + return val; + + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(kcpustat_field); + +static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, + const struct kernel_cpustat *src, + struct task_struct *tsk, int cpu) +{ + struct vtime *vtime = &tsk->vtime; + unsigned int seq; + int err; + + do { + u64 *cpustat; + u64 delta; + + seq = read_seqcount_begin(&vtime->seqcount); + + err = vtime_state_check(vtime, cpu); + if (err < 0) + return err; + + *dst = *src; + cpustat = dst->cpustat; + + /* Task is sleeping, dead or idle, nothing to add */ + if (vtime->state < VTIME_SYS) + continue; + + delta = vtime_delta(vtime); + + /* + * Task runs either in user (including guest) or kernel space, + * add pending nohz time to the right place. 
+ */ + if (vtime->state == VTIME_SYS) { + cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; + } else if (vtime->state == VTIME_USER) { + if (task_nice(tsk) > 0) + cpustat[CPUTIME_NICE] += vtime->utime + delta; + else + cpustat[CPUTIME_USER] += vtime->utime + delta; + } else { + WARN_ON_ONCE(vtime->state != VTIME_GUEST); + if (task_nice(tsk) > 0) { + cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; + cpustat[CPUTIME_NICE] += vtime->gtime + delta; + } else { + cpustat[CPUTIME_GUEST] += vtime->gtime + delta; + cpustat[CPUTIME_USER] += vtime->gtime + delta; + } + } + } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return err; +} + +void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) +{ + const struct kernel_cpustat *src = &kcpustat_cpu(cpu); + struct rq *rq; + int err; + + if (!vtime_accounting_enabled_cpu(cpu)) { + *dst = *src; + return; + } + + rq = cpu_rq(cpu); + + for (;;) { + struct task_struct *curr; + + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (WARN_ON_ONCE(!curr)) { + rcu_read_unlock(); + *dst = *src; + return; + } + + err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu); + rcu_read_unlock(); + + if (!err) + return; + + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch); + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 46122edd8552..43323f875cb9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -287,7 +287,7 @@ static void task_non_contending(struct task_struct *p) dl_se->dl_non_contending = 1; get_task_struct(p); - hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } static void task_contending(struct sched_dl_entity *dl_se, int flags) @@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; + struct dl_bw *dl_b; later_rq = find_lock_later_rq(p, rq); if (!later_rq) { @@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p double_lock_balance(rq, later_rq); } + if (p->dl.dl_non_contending || p->dl.dl_throttled) { + /* + * Inactive timer is armed (or callback is running, but + * waiting for us to release rq locks). In any case, when it + * will fire (or continue), it will see running_bw of this + * task migrated to later_rq (and correctly handle it). + */ + sub_running_bw(&p->dl, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); + + add_rq_bw(&p->dl, &later_rq->dl); + add_running_bw(&p->dl, &later_rq->dl); + } else { + sub_rq_bw(&p->dl, &rq->dl); + add_rq_bw(&p->dl, &later_rq->dl); + } + + /* + * And we finally need to fixup root_domain(s) bandwidth accounting, + * since p is still hanging out in the old (now moved to default) root + * domain. 
+ */ + dl_b = &rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + raw_spin_unlock(&dl_b->lock); + + dl_b = &later_rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span)); + raw_spin_unlock(&dl_b->lock); + set_task_cpu(p, later_rq->cpu); double_unlock_balance(later_rq, rq); @@ -923,7 +956,7 @@ static int start_dl_timer(struct task_struct *p) */ if (!hrtimer_is_queued(timer)) { get_task_struct(p); - hrtimer_start(timer, act, HRTIMER_MODE_ABS); + hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } return 1; @@ -1053,7 +1086,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); timer->function = dl_task_timer; } @@ -1292,7 +1325,7 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); timer->function = inactive_task_timer; } @@ -1658,6 +1691,22 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1694,12 +1743,23 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) } #endif -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { p->se.exec_start = rq_clock_task(rq); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); + + if (!first) + return; + + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); + + if (rq->curr->sched_class != &dl_sched_class) + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + deadline_queue_push_tasks(rq); } static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, @@ -1713,61 +1773,19 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -static struct task_struct * -pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static struct task_struct *pick_next_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; + struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; - struct dl_rq *dl_rq; - - dl_rq = &rq->dl; - if (need_pull_dl_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_dl_task() can drop (and re-acquire) rq->lock; this - * means a stop task can slip in, in which case we need to - * re-start task selection. 
- */ - if (rq->stop && task_on_rq_queued(rq->stop)) - return RETRY_TASK; - } - - /* - * When prev is DL, we may throttle it in put_prev_task(). - * So, we update time before we check for dl_nr_running. - */ - if (prev->sched_class == &dl_sched_class) - update_curr_dl(rq); - - if (unlikely(!dl_rq->dl_nr_running)) + if (!sched_dl_runnable(rq)) return NULL; - put_prev_task(rq, prev); - dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); - p = dl_task_of(dl_se); - - set_next_task(rq, p); - - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, p); - - deadline_queue_push_tasks(rq); - - if (rq->curr->sched_class != &dl_sched_class) - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); - + set_next_task_dl(rq, p, true); return p; } @@ -1811,11 +1829,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void set_curr_task_dl(struct rq *rq) -{ - set_next_task(rq, rq->curr); -} - #ifdef CONFIG_SMP /* Only try algorithms three times */ @@ -2275,6 +2288,36 @@ void __init init_sched_dl_class(void) GFP_KERNEL, cpu_to_node(i)); } +void dl_add_task_root_domain(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + struct dl_bw *dl_b; + + rq = task_rq_lock(p, &rf); + if (!dl_task(p)) + goto unlock; + + dl_b = &rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + + __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + + raw_spin_unlock(&dl_b->lock); + +unlock: + task_rq_unlock(rq, p, &rf); +} + +void dl_clear_root_domain(struct root_domain *rd) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + rd->dl_bw.total_bw = 0; + raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); +} + #endif /* CONFIG_SMP */ static void switched_from_dl(struct rq *rq, struct task_struct *p) @@ -2395,8 +2438,10 @@ const struct sched_class dl_sched_class = { .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP + .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, @@ -2405,7 +2450,6 @@ const struct sched_class dl_sched_class = { .task_woken = task_woken_dl, #endif - .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f7e4579e746c..879d3ccf3806 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void) int cpu; sched_debug_header(NULL); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + /* + * Need to reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI or stop_machine. + */ + touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); print_cpu(NULL, cpu); - + } } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc9cfeaac8bd..3c8a379c357e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu) } /* - * The margin used when comparing utilization with CPU capacity: - * util * margin < capacity * 1024 + * The margin used when comparing utilization with CPU capacity. 
* * (default: ~20%) */ -static unsigned int capacity_margin = 1280; +#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) + #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight } } - /* hint to use a 32x32->64 mul */ - fact = (u64)(u32)fact * lw->inv_weight; + fact = mul_u32_u32(fact, lw->inv_weight); while (fact >> 32) { fact >>= 1; @@ -749,7 +748,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static void attach_entity_cfs_rq(struct sched_entity *se); /* @@ -803,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p) * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the @@ -1188,47 +1186,6 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) -{ - int mm_users = 0; - struct mm_struct *mm = p->mm; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) { - mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - mm->numa_scan_seq = 0; - } - } - p->node_stamp = 0; - p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - RCU_INIT_POINTER(p->numa_group, NULL); - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - /* New address space, reset the preferred nid */ - if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = NUMA_NO_NODE; - return; - } - - /* - * New thread, keep existing numa_preferred_nid which should be copied - * already by arch_dup_task_struct but stagger when scans start. - */ - if (mm) { - unsigned int delay; - - delay = min_t(unsigned int, task_scan_max(current), - current->numa_scan_period * mm_users * NSEC_PER_MSEC); - delay += 2 * TICK_NSEC; - p->node_stamp = delay; - } -} - static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); @@ -1516,7 +1473,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long cpu_runnable_load(struct rq *rq); +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); + +static unsigned long cpu_runnable_load(struct rq *rq) +{ + return cfs_rq_runnable_load_avg(&rq->cfs); +} /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -1644,7 +1606,7 @@ static void task_numa_compare(struct task_numa_env *env, return; rcu_read_lock(); - cur = task_rcu_dereference(&dst_rq->curr); + cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; @@ -2523,7 +2485,7 @@ static void reset_ptenuma_scan(struct task_struct *p) * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). 
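The fits_capacity() macro above folds the old ~20% capacity margin into a single integer comparison. A quick standalone check of where the cut-off lands, assuming the usual 1024 capacity scale:

#include <stdio.h>

/* Same 20%-margin comparison as the fits_capacity() macro above. */
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

int main(void)
{
	/* 1024 * 1024 / 1280 = 819.2, so ~80% of capacity is the break-even */
	printf("%d\n", fits_capacity(819UL, 1024UL));	/* 1: still fits */
	printf("%d\n", fits_capacity(820UL, 1024UL));	/* 0: does not fit */
	return 0;
}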
*/ -void task_numa_work(struct callback_head *work) +static void task_numa_work(struct callback_head *work) { unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; @@ -2536,7 +2498,7 @@ void task_numa_work(struct callback_head *work) SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); - work->next = work; /* protect against double add */ + work->next = work; /* * Who cares about NUMA placement when they're dying. * @@ -2665,6 +2627,50 @@ out: } } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + /* Protect against double add, see task_tick_numa and task_numa_work */ + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + init_task_work(&p->numa_work, task_numa_work); + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = NUMA_NO_NODE; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + /* * Drive the periodic memory faults.. 
*/ @@ -2693,10 +2699,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; - if (!time_before(jiffies, curr->mm->numa_next_scan)) { - init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + if (!time_before(jiffies, curr->mm->numa_next_scan)) task_work_add(curr, work, true); - } } } @@ -3110,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3362,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { + delta_sum = runnable_load_sum - + se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } + + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3504,9 +3509,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed) - cfs_rq_util_change(cfs_rq, 0); - return decayed; } @@ -3514,12 +3516,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach - * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. */ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; @@ -3555,7 +3556,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq, flags); + cfs_rq_util_change(cfs_rq, 0); trace_pelt_cfs_tp(cfs_rq); } @@ -3613,11 +3614,15 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * * IOW we're enqueueing a task on a new CPU. 
*/ - attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, 0); - } else if (decayed && (flags & UPDATE_TG)) - update_tg_load_avg(cfs_rq, 0); + } else if (decayed) { + cfs_rq_util_change(cfs_rq, 0); + + if (flags & UPDATE_TG) + update_tg_load_avg(cfs_rq, 0); + } } #ifndef CONFIG_64BIT @@ -3689,8 +3694,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq, struct rq_flags *rf); - static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -3708,6 +3711,20 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return task_util_est(p); +} +#endif + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -3766,10 +3783,21 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) return; /* + * Reset EWMA on utilization increases, the moving average is used only + * to smooth utilization decreases. + */ + ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + if (sched_feat(UTIL_EST_FASTUP)) { + if (ue.ewma < ue.enqueued) { + ue.ewma = ue.enqueued; + goto done; + } + } + + /* * Skip update of task's estimated utilization when its EWMA is * already ~1% close to its last activation value. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); last_ewma_diff = ue.enqueued - ue.ewma; if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) return; @@ -3802,12 +3830,13 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; +done: WRITE_ONCE(p->se.avg.util_est, ue); } static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return capacity * 1024 > task_util_est(p) * capacity_margin; + return fits_capacity(uclamp_task_util(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -3842,7 +3871,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} @@ -4355,23 +4384,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. 
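The util_est update above takes utilization increases immediately (UTIL_EST_FASTUP) and only smooths decreases through the EWMA. A rough standalone model of that behaviour, assuming UTIL_EST_WEIGHT_SHIFT == 2 as in the kernel; the real code also handles the UTIL_AVG_UNCHANGED flag and the ~1% skip threshold:

#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT	2

/* Illustrative model of the ewma update in util_est_dequeue() above. */
static unsigned int ewma_update(unsigned int ewma, unsigned int enqueued)
{
	int last_ewma_diff;

	if (ewma < enqueued)		/* fast-up: follow increases at once */
		return enqueued;

	last_ewma_diff = enqueued - ewma;
	ewma <<= UTIL_EST_WEIGHT_SHIFT;
	ewma += last_ewma_diff;
	ewma >>= UTIL_EST_WEIGHT_SHIFT;

	return ewma;
}

int main(void)
{
	printf("%u\n", ewma_update(200, 600));	/* 600: jumps straight up */
	printf("%u\n", ewma_update(600, 200));	/* 500: decays by 1/4 of the gap */
	return 0;
}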
* * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4379,22 +4401,12 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } -/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; - - return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; -} - /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4411,65 +4423,23 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. 
- */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; + if (cfs_rq->throttled) + return; /* * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled @@ -4522,7 +4492,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; @@ -4554,7 +4523,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; + long task_delta, idle_task_delta, dequeue = 1; bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4565,6 +4534,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -4574,6 +4544,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; if (qcfs_rq->load.weight) dequeue = 0; @@ -4613,7 +4584,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; - long task_delta; + long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4633,6 +4604,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) return; task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) enqueue = 0; @@ -4641,6 +4613,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; if (cfs_rq_throttled(cfs_rq)) break; @@ -4656,8 +4629,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -4673,13 +4645,15 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, if (!cfs_rq_throttled(cfs_rq)) goto next; + /* By the above check, this should never be true */ + SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + runtime = -cfs_rq->runtime_remaining + 1; if (runtime > remaining) runtime = remaining; remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -4704,7 +4678,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the 
timer with no bandwidth constraint */ @@ -4732,8 +4706,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. This can potentially race with an unthrottled group @@ -4746,8 +4718,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u cfs_b->distribute_running = 1; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; @@ -4829,8 +4800,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -4863,7 +4833,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); unsigned long flags; - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); @@ -4881,7 +4850,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -4890,11 +4858,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock_irqsave(&cfs_b->lock, flags); - if (expires == cfs_b->runtime_expires) - lsub_positive(&cfs_b->runtime, runtime); + lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -4990,20 +4957,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); - new = (old * 147) / 128; /* ~115% */ - new = min(new, max_cfs_quota_period); - - cfs_b->period = ns_to_ktime(new); - - /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ - cfs_b->quota *= new; - cfs_b->quota = div64_u64(cfs_b->quota, old); - - pr_warn_ratelimited( - "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", - smp_processor_id(), - div_u64(new, NSEC_PER_USEC), - div_u64(cfs_b->quota, NSEC_PER_USEC)); + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. 
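The rescaling applied in the next lines doubles period and quota together, so their ratio is preserved exactly; the previous ~115% scaling could perturb the quota/period ratio through integer truncation, which is the precision loss the comment refers to. A standalone illustration with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long period = 100000, quota = 33333;	/* ns, made up */

	/* old style: period scaled by 147/128, quota rescaled to match */
	unsigned long long new_period = period * 147 / 128;
	unsigned long long new_quota = quota * new_period / period;

	/* the quota/period ratio drifts slightly (33333/100000 vs 38280/114843) */
	printf("old: %llu/%llu -> %llu/%llu\n", quota, period, new_quota, new_period);

	/* new style: both doubled, ratio exactly preserved */
	printf("new: %llu/%llu -> %llu/%llu\n", quota, period, quota * 2, period * 2);
	return 0;
}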
+ */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } /* reset count so we don't come right back in here */ count = 0; @@ -5042,17 +5017,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - u64 overrun; - lockdep_assert_held(&cfs_b->lock); if (cfs_b->period_active) return; cfs_b->period_active = 1; - overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); - cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } @@ -5130,11 +5101,6 @@ static inline bool cfs_bandwidth_used(void) return false; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - return rq_clock_task(rq_of(cfs_rq)); -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -5230,7 +5196,7 @@ static inline unsigned long cpu_util(int cpu); static inline bool cpu_overutilized(int cpu) { - return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); + return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); } static inline void update_overutilized_status(struct rq *rq) @@ -5244,6 +5210,20 @@ static inline void update_overutilized_status(struct rq *rq) static inline void update_overutilized_status(struct rq *rq) { } #endif +/* Runqueue only has SCHED_IDLE tasks enqueued */ +static int sched_idle_rq(struct rq *rq) +{ + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + +#ifdef CONFIG_SMP +static int sched_idle_cpu(int cpu) +{ + return sched_idle_rq(cpu_rq(cpu)); +} +#endif + /* * The enqueue_task method is called before nr_running is * increased. 
Here we update the fair scheduling stats and @@ -5254,6 +5234,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); /* * The code below (indirectly) updates schedutil which looks at @@ -5286,6 +5267,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; flags = ENQUEUE_WAKEUP; } @@ -5293,6 +5275,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; if (cfs_rq_throttled(cfs_rq)) break; @@ -5354,6 +5337,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5368,6 +5353,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -5387,6 +5373,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; if (cfs_rq_throttled(cfs_rq)) break; @@ -5398,6 +5385,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5420,26 +5411,45 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -static unsigned long cpu_runnable_load(struct rq *rq) +static unsigned long cpu_load(struct rq *rq) { - return cfs_rq_runnable_load_avg(&rq->cfs); + return cfs_rq_load_avg(&rq->cfs); } -static unsigned long capacity_of(int cpu) +/* + * cpu_load_without - compute CPU load without any contributions from *p + * @cpu: the CPU which load is requested + * @p: the task which load should be discounted + * + * The load of a CPU is defined by the load of tasks currently enqueued on that + * CPU as well as tasks which are currently sleeping after an execution on that + * CPU. + * + * This method returns the load of the specified CPU by discounting the load of + * the specified task, whenever the task is currently contributing to the CPU + * load. 
+ */ +static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) { - return cpu_rq(cpu)->cpu_capacity; -} + struct cfs_rq *cfs_rq; + unsigned int load; -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = cpu_runnable_load(rq); + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_load(rq); - if (nr_running) - return load_avg / nr_running; + cfs_rq = &rq->cfs; + load = READ_ONCE(cfs_rq->avg.load_avg); - return 0; + /* Discount task's util from CPU's util */ + lsub_positive(&load, task_h_load(p)); + + return load; +} + +static unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; } static void record_wakee(struct task_struct *p) @@ -5532,7 +5542,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); + this_eff_load = cpu_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5550,7 +5560,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); + prev_eff_load = cpu_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5588,149 +5598,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static unsigned long cpu_util_without(int cpu, struct task_struct *p); - -static unsigned long capacity_spare_without(int cpu, struct task_struct *p) -{ - return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - * - * Assumes p is allowed on at least one CPU in sd. - */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX; - unsigned long this_runnable_load = ULONG_MAX; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; - unsigned long most_spare = 0, this_spare = 0; - int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; - unsigned long imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - - do { - unsigned long load, avg_load, runnable_load; - unsigned long spare_cap, max_spare_cap; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_span(group), - p->cpus_ptr)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_span(group)); - - /* - * Tally up the load of all CPUs in the group and find - * the group containing the CPU with most spare capacity. 
- */ - avg_load = 0; - runnable_load = 0; - max_spare_cap = 0; - - for_each_cpu(i, sched_group_span(group)) { - load = cpu_runnable_load(cpu_rq(i)); - runnable_load += load; - - avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - - spare_cap = capacity_spare_without(i, p); - - if (spare_cap > max_spare_cap) - max_spare_cap = spare_cap; - } - - /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - - if (local_group) { - this_runnable_load = runnable_load; - this_avg_load = avg_load; - this_spare = max_spare_cap; - } else { - if (min_runnable_load > (runnable_load + imbalance)) { - /* - * The runnable load is significantly smaller - * so we can pick this new CPU: - */ - min_runnable_load = runnable_load; - min_avg_load = avg_load; - idlest = group; - } else if ((runnable_load < (min_runnable_load + imbalance)) && - (100*min_avg_load > imbalance_scale*avg_load)) { - /* - * The runnable loads are close so take the - * blocked load into account through avg_load: - */ - min_avg_load = avg_load; - idlest = group; - } - - if (most_spare < max_spare_cap) { - most_spare = max_spare_cap; - most_spare_sg = group; - } - } - } while (group = group->next, group != sd->groups); - - /* - * The cross-over point between using spare capacity or least load - * is too conservative for high utilization tasks on partially - * utilized systems if we require spare_capacity > task_util(p), - * so we allow for some task stuffing by using - * spare_capacity > task_util(p)/2. - * - * Spare capacity can't be used for fork because the utilization has - * not been set yet, we must first select a rq to compute the initial - * utilization. - */ - if (sd_flag & SD_BALANCE_FORK) - goto skip_spare; - - if (this_spare > task_util(p) / 2 && - imbalance_scale*this_spare > 100*most_spare) - return NULL; - - if (most_spare > task_util(p) / 2) - return most_spare_sg; - -skip_spare: - if (!idlest) - return NULL; - - /* - * When comparing groups across NUMA domains, it's possible for the - * local domain to be very lightly loaded relative to the remote - * domains but "imbalance" skews the comparison making remote CPUs - * look much more favourable. When considering cross-domain, add - * imbalance to the runnable load on the remote node and consider - * staying local. - */ - if ((sd->flags & SD_NUMA) && - min_runnable_load + imbalance >= this_runnable_load) - return NULL; - - if (min_runnable_load > (this_runnable_load + imbalance)) - return NULL; - - if ((this_runnable_load < (min_runnable_load + imbalance)) && - (100*this_avg_load < imbalance_scale*min_avg_load)) - return NULL; - - return idlest; -} + int this_cpu, int sd_flag); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. 
@@ -5751,6 +5621,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (sched_idle_cpu(i)) + return i; + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5774,7 +5647,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = cpu_runnable_load(cpu_rq(i)); + load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; @@ -5794,7 +5667,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return prev_cpu; /* - * We need task's util for capacity_spare_without, sync it up to + * We need task's util for cpu_util_without, sync it up to * prev_cpu's last_update_time. */ if (!(sd_flag & SD_BALANCE_FORK)) @@ -5943,7 +5816,7 @@ static int select_idle_smt(struct task_struct *p, int target) for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; } @@ -5971,12 +5844,13 @@ static inline int select_idle_smt(struct task_struct *p, int target) */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu, nr = INT_MAX; int this = smp_processor_id(); + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6002,12 +5876,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = cpu_clock(this); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return -1; - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) - continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; } @@ -6027,21 +5901,36 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu; - if (available_idle_cpu(target)) + if (available_idle_cpu(target) || sched_idle_cpu(target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && + (available_idle_cpu(prev) || sched_idle_cpu(prev))) + return prev; + + /* + * Allow a per-cpu kthread to stack with the wakee if the + * kworker thread and the tasks previous CPUs are the same. + * The assumption is that the wakee queued work for the + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. 
+ */ + if (is_per_cpu_kthread(current) && + prev == smp_processor_id() && + this_rq()->nr_running <= 1) { return prev; + } /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - available_idle_cpu(recent_used_cpu) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { /* * Replace recent_used_cpu with prev as it is a potential @@ -6277,69 +6166,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) } /* - * compute_energy(): Estimates the energy that would be consumed if @p was + * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of the * CPUs after the task migration, and uses the Energy Model + * landscape of @pd's CPUs after the task migration, and uses the Energy Model * to compute what would be the energy if we decided to actually migrate that * task. */ static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { - unsigned int max_util, util_cfs, cpu_util, cpu_cap; - unsigned long sum_util, energy = 0; - struct task_struct *tsk; + struct cpumask *pd_mask = perf_domain_span(pd); + unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); + unsigned long max_util = 0, sum_util = 0; int cpu; - for (; pd; pd = pd->next) { - struct cpumask *pd_mask = perf_domain_span(pd); + /* + * The capacity state of CPUs of the current rd can be driven by CPUs + * of another rd if they belong to the same pd. So, account for the + * utilization of these CPUs too by masking pd with cpu_online_mask + * instead of the rd span. + * + * If an entire pd is outside of the current rd, it will not appear in + * its pd list and will not be accounted by compute_energy(). + */ + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); + struct task_struct *tsk = cpu == dst_cpu ? p : NULL; /* - * The energy model mandates all the CPUs of a performance - * domain have the same capacity. + * Busy time computation: utilization clamping is not + * required since the ratio (sum_util / cpu_capacity) + * is already enough to scale the EM reported power + * consumption at the (eventually clamped) cpu_capacity. */ - cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); - max_util = sum_util = 0; + sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + ENERGY_UTIL, NULL); /* - * The capacity state of CPUs of the current rd can be driven by - * CPUs of another rd if they belong to the same performance - * domain. So, account for the utilization of these CPUs too - * by masking pd with cpu_online_mask instead of the rd span. - * - * If an entire performance domain is outside of the current rd, - * it will not appear in its pd list and will not be accounted - * by compute_energy(). + * Performance domain frequency: utilization clamping + * must be considered since it affects the selection + * of the performance domain frequency. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. 
*/ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - util_cfs = cpu_util_next(cpu, p, dst_cpu); - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, - ENERGY_UTIL, NULL); - - /* - * Performance domain frequency: utilization clamping - * must be considered since it affects the selection - * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. - */ - tsk = cpu == dst_cpu ? p : NULL; - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); - } - - energy += em_pd_energy(pd->em_pd, max_util, sum_util); + cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } - return energy; + return em_pd_energy(pd->em_pd, max_util, sum_util); } /* @@ -6381,21 +6256,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path. */ - static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { - unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; + unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + unsigned long cpu_cap, util, base_energy = 0; int cpu, best_energy_cpu = prev_cpu; - struct perf_domain *head, *pd; - unsigned long cpu_cap, util; struct sched_domain *sd; + struct perf_domain *pd; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) goto fail; - head = pd; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6412,31 +6285,44 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) goto unlock; for (; pd; pd = pd->next) { - unsigned long cur_energy, spare_cap, max_spare_cap = 0; + unsigned long cur_delta, spare_cap, max_spare_cap = 0; + unsigned long base_energy_pd; int max_spare_cap_cpu = -1; + /* Compute the 'base' energy of the pd, without @p */ + base_energy_pd = compute_energy(p, -1, pd); + base_energy += base_energy_pd; + for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - /* Skip CPUs that will be overutilized. */ util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - if (cpu_cap * 1024 < util * capacity_margin) + spare_cap = cpu_cap - util; + + /* + * Skip CPUs that cannot satisfy the capacity request. + * IOW, placing the task there would make the CPU + * overutilized. Take uclamp into account to see how + * much capacity we can get out of the CPU; this is + * aligned with schedutil_cpu_util(). + */ + util = uclamp_rq_util_with(cpu_rq(cpu), util, p); + if (!fits_capacity(util, cpu_cap)) continue; /* Always use prev_cpu as a candidate. 
*/ if (cpu == prev_cpu) { - prev_energy = compute_energy(p, prev_cpu, head); - best_energy = min(best_energy, prev_energy); - continue; + prev_delta = compute_energy(p, prev_cpu, pd); + prev_delta -= base_energy_pd; + best_delta = min(best_delta, prev_delta); } /* * Find the CPU with the maximum spare capacity in * the performance domain */ - spare_cap = cpu_cap - util; if (spare_cap > max_spare_cap) { max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; @@ -6444,10 +6330,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0) { - cur_energy = compute_energy(p, max_spare_cap_cpu, head); - if (cur_energy < best_energy) { - best_energy = cur_energy; + if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { + cur_delta = compute_energy(p, max_spare_cap_cpu, pd); + cur_delta -= base_energy_pd; + if (cur_delta < best_delta) { + best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; } } @@ -6459,10 +6346,10 @@ unlock: * Pick the best CPU if prev_cpu cannot be used, or if it saves at * least 6% of the energy used by prev_cpu. */ - if (prev_energy == ULONG_MAX) + if (prev_delta == ULONG_MAX) return best_energy_cpu; - if ((prev_energy - best_energy) > (prev_energy >> 4)) + if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) return best_energy_cpu; return prev_cpu; @@ -6616,6 +6503,15 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } + +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (rq->nr_running) + return 1; + + return newidle_balance(rq, rf) != 0; +} #endif /* CONFIG_SMP */ static unsigned long wakeup_gran(struct sched_entity *se) @@ -6783,7 +6679,7 @@ preempt: set_last_buddy(se); } -static struct task_struct * +struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -6792,11 +6688,11 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - if (!cfs_rq->nr_running) + if (!sched_fair_runnable(rq)) goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) + if (!prev || prev->sched_class != &fair_sched_class) goto simple; /* @@ -6873,8 +6769,8 @@ again: goto done; simple: #endif - - put_prev_task(rq, prev); + if (prev) + put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); @@ -6902,11 +6798,13 @@ done: __maybe_unused; return p; idle: - update_misfit_status(NULL, rq); - new_tasks = idle_balance(rq, rf); + if (!rf) + return NULL; + + new_tasks = newidle_balance(rq, rf); /* - * Because idle_balance() releases (and re-acquires) rq->lock, it is + * Because newidle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -6925,6 +6823,11 @@ idle: return NULL; } +static struct task_struct *__pick_next_task_fair(struct rq *rq) +{ + return pick_next_task_fair(rq, NULL, NULL); +} + /* * Account for a descheduled task: */ @@ -7114,11 +7017,49 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +/* + * 'group_type' describes the group of CPUs at the moment of load balancing. 
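/*
 * Illustrative, standalone sketch (not part of the patch): the arithmetic
 * behind the "saves at least ~6%" rule used by find_energy_efficient_cpu()
 * above. Once the per-pd base energies and deltas are known, the best
 * candidate only replaces prev_cpu when it saves more than 1/16 of the
 * total estimated energy. All figures below are invented.
 */
#include <stdio.h>

int main(void)
{
	unsigned long base_energy = 10000;	/* estimated energy without the task */
	unsigned long prev_delta  = 900;	/* extra energy if the task stays on prev_cpu */
	unsigned long best_delta  = 820;	/* extra energy on the best candidate CPU */

	/* Mirrors: (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4) */
	unsigned long threshold = (prev_delta + base_energy) >> 4;

	if (prev_delta - best_delta > threshold)
		printf("pick the candidate CPU (saves %lu > %lu)\n",
		       prev_delta - best_delta, threshold);
	else
		printf("stay on prev_cpu (saves %lu <= %lu)\n",
		       prev_delta - best_delta, threshold);
	return 0;
}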
+ * + * The enum is ordered by pulling priority, with the group with lowest priority + * first so the group_type can simply be compared when selecting the busiest + * group. See update_sd_pick_busiest(). + */ enum group_type { - group_other = 0, + /* The group has spare capacity that can be used to run more tasks. */ + group_has_spare = 0, + /* + * The group is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ + group_fully_busy, + /* + * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity + * and must be migrated to a more powerful CPU. + */ group_misfit_task, + /* + * SD_ASYM_PACKING only: One local CPU with higher capacity is available, + * and the task should be migrated to it instead of running on the + * current CPU. + */ + group_asym_packing, + /* + * The tasks' affinity constraints previously prevented the scheduler + * from balancing the load across the system. + */ group_imbalanced, - group_overloaded, + /* + * The CPU is overloaded and can't provide expected CPU cycles to all + * tasks. + */ + group_overloaded +}; + +enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, + migrate_misfit }; #define LBF_ALL_PINNED 0x01 @@ -7151,7 +7092,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; - enum group_type src_grp_type; + enum migration_type migration_type; struct list_head tasks; }; @@ -7374,7 +7315,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) static const unsigned int sched_nr_migrate_break = 32; /* - * detach_tasks() -- tries to detach up to imbalance runnable load from + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. @@ -7382,8 +7323,8 @@ static const unsigned int sched_nr_migrate_break = 32; static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; + unsigned long util, load; struct task_struct *p; - unsigned long load; int detached = 0; lockdep_assert_held(&env->src_rq->lock); @@ -7416,21 +7357,55 @@ static int detach_tasks(struct lb_env *env) if (!can_migrate_task(p, env)) goto next; - load = task_h_load(p); + switch (env->migration_type) { + case migrate_load: + load = task_h_load(p); - if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) - goto next; + if (sched_feat(LB_MIN) && + load < 16 && !env->sd->nr_balance_failed) + goto next; - if ((load / 2) > env->imbalance) - goto next; + /* + * Make sure that we don't migrate too much load. + * Nevertheless, let relax the constraint if + * scheduler fails to find a good waiting task to + * migrate. 
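/*
 * Illustrative, standalone sketch (not kernel code): how env->imbalance is
 * consumed depending on migration_type, loosely modelled on the
 * detach_tasks() switch in this hunk. The nr_balance_failed relaxation is
 * omitted and the task figures are invented.
 */
#include <stdio.h>

enum migration_type { migrate_load, migrate_util, migrate_task, migrate_misfit };

struct fake_task { unsigned long load; unsigned long util; };

static int try_detach(enum migration_type type, unsigned long *imbalance,
		      const struct fake_task *p)
{
	switch (type) {
	case migrate_load:
		/* Don't massively overshoot the remaining load budget. */
		if (p->load / 2 > *imbalance)
			return 0;
		*imbalance -= p->load;
		return 1;
	case migrate_util:
		if (p->util > *imbalance)
			return 0;
		*imbalance -= p->util;
		return 1;
	case migrate_task:
		/* Here the imbalance is simply a number of tasks. */
		(*imbalance)--;
		return 1;
	case migrate_misfit:
		/* One misfit task is enough; clear the budget. */
		*imbalance = 0;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct fake_task t = { .load = 600, .util = 300 };
	unsigned long imb = 1024;

	if (try_detach(migrate_load, &imb, &t))
		printf("detached one task, remaining load imbalance: %lu\n", imb);
	return 0;
}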
+ */ + if (load/2 > env->imbalance && + env->sd->nr_balance_failed <= env->sd->cache_nice_tries) + goto next; + + env->imbalance -= load; + break; + + case migrate_util: + util = task_util_est(p); + + if (util > env->imbalance) + goto next; + + env->imbalance -= util; + break; + + case migrate_task: + env->imbalance--; + break; + + case migrate_misfit: + /* This is not a misfit task */ + if (task_fits_capacity(p, capacity_of(env->src_cpu))) + goto next; + + env->imbalance = 0; + break; + } detach_task(p, env); list_add(&p->se.group_node, &env->tasks); detached++; - env->imbalance -= load; -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize @@ -7442,7 +7417,7 @@ static int detach_tasks(struct lb_env *env) /* * We only want to steal up to the prescribed amount of - * runnable load. + * load/util/tasks. */ if (env->imbalance <= 0) break; @@ -7552,6 +7527,28 @@ static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif +static bool __update_blocked_others(struct rq *rq, bool *done) +{ + const struct sched_class *curr_class; + u64 now = rq_clock_pelt(rq); + bool decayed; + + /* + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, + * DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_irq_load_avg(rq, 0); + + if (others_have_blocked(rq)) + *done = false; + + return decayed; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7571,16 +7568,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) return true; } -static void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; - const struct sched_class *curr_class; - struct rq_flags rf; - bool done = true; - - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); + bool decayed = false; + int cpu = cpu_of(rq); /* * Iterates the task_group tree in a bottom up fashion, see @@ -7589,9 +7581,13 @@ static void update_blocked_averages(int cpu) for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; - if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq, 0); + if (cfs_rq == &rq->cfs) + decayed = true; + } + /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) @@ -7606,19 +7602,10 @@ static void update_blocked_averages(int cpu) /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) - done = false; + *done = false; } - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - /* Don't need periodic decay once load/util_avg are null */ - if (others_have_blocked(rq)) - done = false; - - update_blocked_load_status(rq, !done); - rq_unlock_irqrestore(rq, &rf); + return decayed; } /* @@ -7668,23 +7655,16 @@ static unsigned long task_h_load(struct task_struct *p) cfs_rq_load_avg(cfs_rq) 
+ 1); } #else -static inline void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - const struct sched_class *curr_class; - struct rq_flags rf; + bool decayed; - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + if (cfs_rq_has_blocked(cfs_rq)) + *done = false; - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); - rq_unlock_irqrestore(rq, &rf); + return decayed; } static unsigned long task_h_load(struct task_struct *p) @@ -7693,6 +7673,24 @@ static unsigned long task_h_load(struct task_struct *p) } #endif +static void update_blocked_averages(int cpu) +{ + bool decayed = false, done = true; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + + decayed |= __update_blocked_others(rq, &done); + decayed |= __update_blocked_fair(rq, &done); + + update_blocked_load_status(rq, !done); + if (decayed) + cpufreq_update_util(rq, 0); + rq_unlock_irqrestore(rq, &rf); +} + /********** Helpers for find_busiest_group ************************/ /* @@ -7701,14 +7699,14 @@ static unsigned long task_h_load(struct task_struct *p) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ - unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned int sum_nr_running; /* Nr of tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_no_capacity; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -7723,10 +7721,10 @@ struct sg_lb_stats { struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ - unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* tasks should go to sibling first */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ struct sg_lb_stats local_stat; /* Statistics of the local group */ @@ -7737,19 +7735,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /* * Skimp on the clearing to avoid duplicate work. We can avoid clearing * local_stat because update_sg_lb_stats() does a full clear/assignment. - * We must however clear busiest_stat::avg_load because - * update_sd_pick_busiest() reads this before assignment. 
+ * We must however set busiest_stat::group_type and + * busiest_stat::idle_cpus to the worst busiest group because + * update_sd_pick_busiest() reads these before assignment. */ *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, - .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { - .avg_load = 0UL, - .sum_nr_running = 0, - .group_type = group_other, + .idle_cpus = UINT_MAX, + .group_type = group_has_spare, }, }; } @@ -7820,29 +7817,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_span(sdg)) { - struct sched_group_capacity *sgc; - struct rq *rq = cpu_rq(cpu); - - /* - * build_sched_domains() -> init_sched_groups_capacity() - * gets here before we've attached the domains to the - * runqueues. - * - * Use capacity_of(), which is set irrespective of domains - * in update_cpu_capacity(). - * - * This avoids capacity from being 0 and - * causing divide-by-zero issues on boot. - */ - if (unlikely(!rq->sd)) { - capacity += capacity_of(cpu); - } else { - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; - } + unsigned long cpu_cap = capacity_of(cpu); - min_capacity = min(capacity, min_capacity); - max_capacity = max(capacity, max_capacity); + capacity += cpu_cap; + min_capacity = min(cpu_cap, min_capacity); + max_capacity = max(cpu_cap, max_capacity); } } else { /* @@ -7937,13 +7916,13 @@ static inline int sg_imbalanced(struct sched_group *group) * any benefit for the load balance. */ static inline bool -group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) +group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -7958,13 +7937,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) * false. 
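/*
 * Worked example (standalone, invented numbers): the imbalance_pct checks
 * used by group_has_capacity() and group_is_overloaded(). 117 is only a
 * typical imbalance_pct; the real value depends on the sched_domain level.
 */
#include <stdio.h>

int main(void)
{
	unsigned int imbalance_pct = 117;
	unsigned long group_capacity = 2048;	/* two CPUs worth of capacity */
	unsigned long group_util = 1900;
	unsigned int sum_nr_running = 3, group_weight = 2;

	int has_capacity = sum_nr_running < group_weight ||
		group_capacity * 100 > group_util * imbalance_pct;

	int overloaded = sum_nr_running > group_weight &&
		group_capacity * 100 < group_util * imbalance_pct;

	/* 2048 * 100 = 204800 vs 1900 * 117 = 222300: no spare headroom left */
	printf("has_capacity=%d overloaded=%d\n", has_capacity, overloaded);
	return 0;
}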
*/ static inline bool -group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -7977,8 +7956,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->min_capacity * capacity_margin < - ref->sgc->min_capacity * 1024; + return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); } /* @@ -7988,24 +7966,30 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) static inline bool group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->max_capacity * capacity_margin < - ref->sgc->max_capacity * 1024; + return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity); } static inline enum -group_type group_classify(struct sched_group *group, +group_type group_classify(unsigned int imbalance_pct, + struct sched_group *group, struct sg_lb_stats *sgs) { - if (sgs->group_no_capacity) + if (group_is_overloaded(imbalance_pct, sgs)) return group_overloaded; if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_asym_packing) + return group_asym_packing; + if (sgs->group_misfit_task_load) return group_misfit_task; - return group_other; + if (!group_has_capacity(imbalance_pct, sgs)) + return group_fully_busy; + + return group_has_spare; } static bool update_nohz_stats(struct rq *rq, bool force) @@ -8042,21 +8026,25 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sg_lb_stats *sgs, int *sg_status) { - int i, nr_running; + int i, nr_running, local_group; memset(sgs, 0, sizeof(*sgs)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - sgs->group_load += cpu_runnable_load(rq); + sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); - sgs->sum_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + if (nr_running > 1) *sg_status |= SG_OVERLOAD; @@ -8070,9 +8058,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* * No need to call idle_cpu() if nr_running is not 0 */ - if (!nr_running && idle_cpu(i)) + if (!nr_running && idle_cpu(i)) { sgs->idle_cpus++; + /* Idle cpu can't have misfit task */ + continue; + } + + if (local_group) + continue; + /* Check for a misfit task on the cpu */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; @@ -8080,17 +8075,24 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Check if dst CPU is idle and preferred to this group */ + if (env->sd->flags & SD_ASYM_PACKING && + env->idle != CPU_NOT_IDLE && + sgs->sum_h_nr_running && + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + sgs->group_asym_packing = 1; + } - if 
(sgs->sum_nr_running) - sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; + sgs->group_capacity = group->sgc->capacity; sgs->group_weight = group->group_weight; - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; } /** @@ -8113,6 +8115,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* Make sure that there is at least one task to pull */ + if (!sgs->sum_h_nr_running) + return false; + /* * Don't try to pull misfit tasks we can't help. * We can use max_capacity here as reduction in capacity on some @@ -8121,7 +8127,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, */ if (sgs->group_type == group_misfit_task && (!group_smaller_max_cpu_capacity(sg, sds->local) || - !group_has_capacity(env, &sds->local_stat))) + sds->local_stat.group_type != group_has_spare)) return false; if (sgs->group_type > busiest->group_type) @@ -8130,62 +8136,92 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; - if (sgs->avg_load <= busiest->avg_load) - return false; - - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) - goto asym_packing; - /* - * Candidate sg has no more than one task per CPU and - * has higher per-CPU capacity. Migrating tasks to less - * capable CPUs may harm throughput. Maximize throughput, - * power/energy consequences are not considered. + * The candidate and the current busiest group are the same type of + * group. Let check which one is the busiest according to the type. */ - if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_min_cpu_capacity(sds->local, sg)) - return false; - /* - * If we have more than one misfit sg go with the biggest misfit. - */ - if (sgs->group_type == group_misfit_task && - sgs->group_misfit_task_load < busiest->group_misfit_task_load) + switch (sgs->group_type) { + case group_overloaded: + /* Select the overloaded group with highest avg_load. */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_imbalanced: + /* + * Select the 1st imbalanced group as we don't have any way to + * choose one more than another. + */ return false; -asym_packing: - /* This is the busiest node in its class. */ - if (!(env->sd->flags & SD_ASYM_PACKING)) - return true; + case group_asym_packing: + /* Prefer to move from lowest priority CPU's work */ + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) + return false; + break; - /* No ASYM_PACKING if target CPU is already busy */ - if (env->idle == CPU_NOT_IDLE) - return true; - /* - * ASYM_PACKING needs to move all the work to the highest - * prority CPUs in the group, therefore mark all groups - * of lower priority than ourself as busy. - */ - if (sgs->sum_nr_running && - sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { - if (!sds->busiest) - return true; + case group_misfit_task: + /* + * If we have more than one misfit sg go with the biggest + * misfit. 
+ */ + if (sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + break; - /* Prefer to move from lowest priority CPU's work */ - if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, - sg->asym_prefer_cpu)) - return true; + case group_fully_busy: + /* + * Select the fully busy group with highest avg_load. In + * theory, there is no need to pull task from such kind of + * group because tasks have all compute capacity that they need + * but we can still improve the overall throughput by reducing + * contention when accessing shared HW resources. + * + * XXX for now avg_load is not computed and always 0 so we + * select the 1st one. + */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_has_spare: + /* + * Select not overloaded group with lowest number of idle cpus + * and highest number of running tasks. We could also compare + * the spare capacity which is more stable but it can end up + * that the group has less spare capacity but finally more idle + * CPUs which means less opportunity to pull tasks. + */ + if (sgs->idle_cpus > busiest->idle_cpus) + return false; + else if ((sgs->idle_cpus == busiest->idle_cpus) && + (sgs->sum_nr_running <= busiest->sum_nr_running)) + return false; + + break; } - return false; + /* + * Candidate sg has no more than one task per CPU and has higher + * per-CPU capacity. Migrating tasks to less capable CPUs may harm + * throughput. Maximize throughput, power/energy consequences are not + * considered. + */ + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type <= group_fully_busy) && + (group_smaller_min_cpu_capacity(sds->local, sg))) + return false; + + return true; } #ifdef CONFIG_NUMA_BALANCING static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->nr_numa_running) + if (sgs->sum_h_nr_running > sgs->nr_numa_running) return regular; - if (sgs->sum_nr_running > sgs->nr_preferred_running) + if (sgs->sum_h_nr_running > sgs->nr_preferred_running) return remote; return all; } @@ -8210,18 +8246,314 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ + +struct sg_lb_stats; + +/* + * task_running_on_cpu - return 1 if @p is running on @cpu. + */ + +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return 0; + + if (task_on_rq_queued(p)) + return 1; + + return 0; +} + +/** + * idle_cpu_without - would a given CPU be idle without p ? + * @cpu: the processor on which idleness is tested. + * @p: task which should be ignored. + * + * Return: 1 if the CPU would be idle. 0 otherwise. + */ +static int idle_cpu_without(int cpu, struct task_struct *p) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle && rq->curr != p) + return 0; + + /* + * rq->nr_running can't be used but an updated version without the + * impact of p on cpu must be used instead. The updated nr_running + * be computed and tested before calling idle_cpu_without(). + */ + +#ifdef CONFIG_SMP + if (!llist_empty(&rq->wake_list)) + return 0; +#endif + + return 1; +} + +/* + * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. + * @sd: The sched_domain level to look for idlest group. + * @group: sched_group whose statistics are to be updated. + * @sgs: variable to hold the statistics for this group. + * @p: The task for which we look for the idlest group/CPU. 
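/*
 * Standalone sketch (invented statistics): the group_has_spare tie-break in
 * update_sd_pick_busiest() prefers the candidate with fewer idle CPUs and,
 * on a tie, the one with more running tasks.
 */
#include <stdio.h>

struct stats { unsigned int idle_cpus; unsigned int sum_nr_running; };

static int pick_new_busiest(const struct stats *sgs, const struct stats *busiest)
{
	if (sgs->idle_cpus > busiest->idle_cpus)
		return 0;
	if (sgs->idle_cpus == busiest->idle_cpus &&
	    sgs->sum_nr_running <= busiest->sum_nr_running)
		return 0;
	return 1;
}

int main(void)
{
	struct stats busiest   = { .idle_cpus = 2, .sum_nr_running = 4 };
	struct stats candidate = { .idle_cpus = 1, .sum_nr_running = 3 };

	/* Fewer idle CPUs wins even though the candidate runs fewer tasks. */
	printf("replace busiest: %d\n", pick_new_busiest(&candidate, &busiest));
	return 0;
}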
+ */ +static inline void update_sg_wakeup_stats(struct sched_domain *sd, + struct sched_group *group, + struct sg_lb_stats *sgs, + struct task_struct *p) +{ + int i, nr_running; + + memset(sgs, 0, sizeof(*sgs)); + + for_each_cpu(i, sched_group_span(group)) { + struct rq *rq = cpu_rq(i); + unsigned int local; + + sgs->group_load += cpu_load_without(rq, p); + sgs->group_util += cpu_util_without(i, p); + local = task_running_on_cpu(i, p); + sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + + nr_running = rq->nr_running - local; + sgs->sum_nr_running += nr_running; + + /* + * No need to call idle_cpu_without() if nr_running is not 0 + */ + if (!nr_running && idle_cpu_without(i, p)) + sgs->idle_cpus++; + + } + + /* Check if task fits in the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + !task_fits_capacity(p, group->sgc->max_capacity)) { + sgs->group_misfit_task_load = 1; + } + + sgs->group_capacity = group->sgc->capacity; + + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); + + /* + * Computing avg_load makes sense only when group is fully busy or + * overloaded + */ + if (sgs->group_type < group_fully_busy) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; +} + +static bool update_pick_idlest(struct sched_group *idlest, + struct sg_lb_stats *idlest_sgs, + struct sched_group *group, + struct sg_lb_stats *sgs) +{ + if (sgs->group_type < idlest_sgs->group_type) + return true; + + if (sgs->group_type > idlest_sgs->group_type) + return false; + + /* + * The candidate and the current idlest group are the same type of + * group. Let check which one is the idlest according to the type. + */ + + switch (sgs->group_type) { + case group_overloaded: + case group_fully_busy: + /* Select the group with lowest avg_load. */ + if (idlest_sgs->avg_load <= sgs->avg_load) + return false; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those types are not used in the slow wakeup path */ + return false; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (idlest->sgc->max_capacity >= group->sgc->max_capacity) + return false; + break; + + case group_has_spare: + /* Select group with most idle CPUs */ + if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + return false; + break; + } + + return true; +} + +/* + * find_idlest_group() finds and returns the least busy CPU group within the + * domain. + * + * Assumes p is allowed on at least one CPU in sd. 
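/*
 * Minimal sketch (invented values): the "without @p" accounting used by
 * update_sg_wakeup_stats(). The waking task's own contribution is removed
 * from the CPU it is still counted on before the group is classified.
 */
#include <stdio.h>

int main(void)
{
	unsigned int rq_nr_running = 1;	/* p is the only task queued there */
	unsigned int rq_h_nr_running = 1;
	unsigned int p_runs_here = 1;	/* what task_running_on_cpu() reports */

	unsigned int nr_running = rq_nr_running - p_runs_here;
	unsigned int h_nr_running = rq_h_nr_running - p_runs_here;

	printf("without p: nr_running=%u h_nr_running=%u -> %s\n",
	       nr_running, h_nr_running,
	       nr_running ? "busy" : "candidate for idle_cpu_without()");
	return 0;
}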
+ */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int sd_flag) +{ + struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; + struct sg_lb_stats local_sgs, tmp_sgs; + struct sg_lb_stats *sgs; + unsigned long imbalance; + struct sg_lb_stats idlest_sgs = { + .avg_load = UINT_MAX, + .group_type = group_overloaded, + }; + + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + int local_group; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_span(group), + p->cpus_ptr)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_span(group)); + + if (local_group) { + sgs = &local_sgs; + local = group; + } else { + sgs = &tmp_sgs; + } + + update_sg_wakeup_stats(sd, group, sgs, p); + + if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) { + idlest = group; + idlest_sgs = *sgs; + } + + } while (group = group->next, group != sd->groups); + + + /* There is no idlest group to push tasks to */ + if (!idlest) + return NULL; + + /* The local group has been skipped because of CPU affinity */ + if (!local) + return idlest; + + /* + * If the local group is idler than the selected idlest group + * don't try and push the task. + */ + if (local_sgs.group_type < idlest_sgs.group_type) + return NULL; + + /* + * If the local group is busier than the selected idlest group + * try and push the task. + */ + if (local_sgs.group_type > idlest_sgs.group_type) + return idlest; + + switch (local_sgs.group_type) { + case group_overloaded: + case group_fully_busy: + /* + * When comparing groups across NUMA domains, it's possible for + * the local domain to be very lightly loaded relative to the + * remote domains but "imbalance" skews the comparison making + * remote CPUs look much more favourable. When considering + * cross-domain, add imbalance to the load on the remote node + * and consider staying local. + */ + + if ((sd->flags & SD_NUMA) && + ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load)) + return NULL; + + /* + * If the local group is less loaded than the selected + * idlest group don't try and push any tasks. + */ + if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance)) + return NULL; + + if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load) + return NULL; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those type are not used in the slow wakeup path */ + return NULL; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (local->sgc->max_capacity >= idlest->sgc->max_capacity) + return NULL; + break; + + case group_has_spare: + if (sd->flags & SD_NUMA) { +#ifdef CONFIG_NUMA_BALANCING + int idlest_cpu; + /* + * If there is spare capacity at NUMA, try to select + * the preferred node + */ + if (cpu_to_node(this_cpu) == p->numa_preferred_nid) + return NULL; + + idlest_cpu = cpumask_first(sched_group_span(idlest)); + if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) + return idlest; +#endif + /* + * Otherwise, keep the task on this node to stay close + * its wakeup source and improve locality. If there is + * a real need of migration, periodic load balance will + * take care of it. + */ + if (local_sgs.idle_cpus) + return NULL; + } + + /* + * Select group with highest number of idle CPUs. 
We could also + * compare the utilization which is more stable but it can end + * up that the group has less spare capacity but finally more + * idle CPUs which means more opportunity to run task. + */ + if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus) + return NULL; + break; + } + + return idlest; +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. * @sds: variable to hold the statistics for this sched_domain. */ + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; int sg_status = 0; #ifdef CONFIG_NO_HZ_COMMON @@ -8248,22 +8580,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (local_group) goto next_group; - /* - * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity so that we'll try - * and move all the excess tasks away. We lower the capacity - * of a group only if the local group has the capacity to fit - * these excess tasks. The extra check prevents the case where - * you always pull from the heaviest group when it is already - * under-utilized (possible with a large weight task outweighs - * the tasks on the system). - */ - if (prefer_sibling && sds->local && - group_has_capacity(env, local) && - (sgs->sum_nr_running > local->sum_nr_running + 1)) { - sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); - } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -8272,13 +8588,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ - sds->total_running += sgs->sum_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); + /* Tag domain that child domain prefers tasks go to siblings first */ + sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + #ifdef CONFIG_NO_HZ_COMMON if ((env->flags & LBF_NOHZ_AGAIN) && cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { @@ -8309,203 +8627,177 @@ next_group: } /** - * check_asym_packing - Check to see if the group is packed into the - * sched domain. - * - * This is primarily intended to used at the sibling level. Some - * cores like POWER7 prefer to use lower numbered SMT threads. In the - * case of POWER7, it can move to lower SMT modes only when higher - * threads are idle. When in lower SMT modes, the threads will - * perform better since they share less core resources. Hence when we - * have idle threads, we want them to be the higher ones. - * - * This packing function is run on idle threads. It checks to see if - * the busiest CPU in this domain (core in the P7 case) has a higher - * CPU number than the packing function is being run on. Here we are - * assuming lower CPU number will be equivalent to lower a SMT thread - * number. - * - * Return: 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in env->imbalance. - * - * @env: The load balancing environment. 
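/*
 * Worked example (standalone): the NUMA slack applied by find_idlest_group().
 * NICE_0_LOAD is assumed to scale down to 1024 and imbalance_pct to be 125,
 * which are common values; the load figures are invented.
 */
#include <stdio.h>

int main(void)
{
	unsigned long nice_0_load = 1024;
	unsigned int imbalance_pct = 125;
	unsigned long imbalance = nice_0_load * (imbalance_pct - 100) / 100;

	unsigned long local_avg_load = 700, idlest_avg_load = 520;

	/* imbalance = 1024 * 25 / 100 = 256 */
	if (idlest_avg_load + imbalance >= local_avg_load)
		printf("keep the task local (%lu + %lu >= %lu)\n",
		       idlest_avg_load, imbalance, local_avg_load);
	else
		printf("push the task to the idlest group\n");
	return 0;
}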
- * @sds: Statistics of the sched_domain which is to be packed - */ -static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) -{ - int busiest_cpu; - - if (!(env->sd->flags & SD_ASYM_PACKING)) - return 0; - - if (env->idle == CPU_NOT_IDLE) - return 0; - - if (!sds->busiest) - return 0; - - busiest_cpu = sds->busiest->asym_prefer_cpu; - if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) - return 0; - - env->imbalance = sds->busiest_stat.group_load; - - return 1; -} - -/** - * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * calculate_imbalance - Calculate the amount of imbalance present within the + * groups of a given sched_domain during load balance. + * @env: load balance environment + * @sds: statistics of the sched_domain whose imbalance is to be calculated. */ -static inline -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) +static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, capa_now = 0, capa_move = 0; - unsigned int imbn = 2; - unsigned long scaled_busy_load_per_task; struct sg_lb_stats *local, *busiest; local = &sds->local_stat; busiest = &sds->busiest_stat; - if (!local->sum_nr_running) - local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); - else if (busiest->load_per_task > local->load_per_task) - imbn = 1; + if (busiest->group_type == group_misfit_task) { + /* Set imbalance to allow misfit tasks to be balanced. */ + env->migration_type = migrate_misfit; + env->imbalance = 1; + return; + } - scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - busiest->group_capacity; + if (busiest->group_type == group_asym_packing) { + /* + * In case of asym capacity, we will try to migrate all load to + * the preferred CPU. + */ + env->migration_type = migrate_task; + env->imbalance = busiest->sum_h_nr_running; + return; + } - if (busiest->avg_load + scaled_busy_load_per_task >= - local->avg_load + (scaled_busy_load_per_task * imbn)) { - env->imbalance = busiest->load_per_task; + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages + * to ensure CPU-load equilibrium, try to move any task to fix + * the imbalance. The next load balance will take care of + * balancing back the system. + */ + env->migration_type = migrate_task; + env->imbalance = 1; return; } /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU capacity used by - * moving them. + * Try to use spare capacity of local group without overloading it or + * emptying busiest. */ + if (local->group_type == group_has_spare) { + if (busiest->group_type > group_fully_busy) { + /* + * If busiest is overloaded, try to fill spare + * capacity. This might end up creating spare capacity + * in busiest or busiest still being overloaded but + * there is no simple way to directly compute the + * amount of load to migrate in order to balance the + * system. 
+ */ + env->migration_type = migrate_util; + env->imbalance = max(local->group_capacity, local->group_util) - + local->group_util; - capa_now += busiest->group_capacity * - min(busiest->load_per_task, busiest->avg_load); - capa_now += local->group_capacity * - min(local->load_per_task, local->avg_load); - capa_now /= SCHED_CAPACITY_SCALE; + /* + * In some cases, the group's utilization is max or even + * higher than capacity because of migrations but the + * local CPU is (newly) idle. There is at least one + * waiting task in this overloaded busiest group. Let's + * try to pull it. + */ + if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + env->migration_type = migrate_task; + env->imbalance = 1; + } - /* Amount of load we'd subtract */ - if (busiest->avg_load > scaled_busy_load_per_task) { - capa_move += busiest->group_capacity * - min(busiest->load_per_task, - busiest->avg_load - scaled_busy_load_per_task); - } + return; + } - /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_CAPACITY_SCALE) { - tmp = (busiest->avg_load * busiest->group_capacity) / - local->group_capacity; - } else { - tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - local->group_capacity; - } - capa_move += local->group_capacity * - min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_CAPACITY_SCALE; + if (busiest->group_weight == 1 || sds->prefer_sibling) { + unsigned int nr_diff = busiest->sum_nr_running; + /* + * When prefer sibling, evenly spread running tasks on + * groups. + */ + env->migration_type = migrate_task; + lsub_positive(&nr_diff, local->sum_nr_running); + env->imbalance = nr_diff >> 1; + } else { - /* Move if we gain throughput */ - if (capa_move > capa_now) - env->imbalance = busiest->load_per_task; -} + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - + busiest->idle_cpus) >> 1); + } -/** - * calculate_imbalance - Calculate the amount of imbalance present within the - * groups of a given sched_domain during load balance. - * @env: load balance environment - * @sds: statistics of the sched_domain whose imbalance is to be calculated. - */ -static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) -{ - unsigned long max_pull, load_above_capacity = ~0UL; - struct sg_lb_stats *local, *busiest; + /* Consider allowing a small imbalance between NUMA groups */ + if (env->sd->flags & SD_NUMA) { + unsigned int imbalance_min; - local = &sds->local_stat; - busiest = &sds->busiest_stat; + /* + * Compute an allowed imbalance based on a simple + * pair of communicating tasks that should remain + * local and ignore them. + * + * NOTE: Generally this would have been based on + * the domain size and this was evaluated. However, + * the benefit is similar across a range of workloads + * and machines but scaling by the domain size adds + * the risk that lower domains have to be rebalanced. + */ + imbalance_min = 2; + if (busiest->sum_nr_running <= imbalance_min) + env->imbalance = 0; + } - if (busiest->group_type == group_imbalanced) { - /* - * In the group_imb case we cannot rely on group-wide averages - * to ensure CPU-load equilibrium, look at wider averages. 
XXX - */ - busiest->load_per_task = - min(busiest->load_per_task, sds->avg_load); + return; } /* - * Avg load of busiest sg can be less and avg load of local sg can - * be greater than avg load across all sgs of sd because avg load - * factors in sg capacity and sgs with smaller group_type are - * skipped when updating the busiest sg: + * Local is fully busy but has to take more load to relieve the + * busiest group */ - if (busiest->group_type != group_misfit_task && - (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load)) { - env->imbalance = 0; - return fix_small_imbalance(env, sds); - } + if (local->group_type < group_overloaded) { + /* + * Local will become overloaded so the avg_load metrics are + * finally needed. + */ - /* - * If there aren't any idle CPUs, avoid creating some. - */ - if (busiest->group_type == group_overloaded && - local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; - if (load_above_capacity > busiest->group_capacity) { - load_above_capacity -= busiest->group_capacity; - load_above_capacity *= scale_load_down(NICE_0_LOAD); - load_above_capacity /= busiest->group_capacity; - } else - load_above_capacity = ~0UL; + local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / + local->group_capacity; + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* - * We're trying to get all the CPUs to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded CPU below the average load. At the same time, - * we also don't want to reduce the group load below the group - * capacity. Thus we look for the minimum possible imbalance. + * Both group are or will become overloaded and we're trying to get all + * the CPUs to the average_load, so we don't want to push ourselves + * above the average load, nor do we wish to reduce the max loaded CPU + * below the average load. At the same time, we also don't want to + * reduce the group load below the group capacity. Thus we look for + * the minimum possible imbalance. */ - max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); - - /* How much load to actually move to equalise the imbalance */ + env->migration_type = migrate_load; env->imbalance = min( - max_pull * busiest->group_capacity, + (busiest->avg_load - sds->avg_load) * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; - - /* Boost imbalance to allow misfit task to be balanced. 
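/*
 * Worked example (standalone, invented figures): the migrate_load imbalance
 * when both groups are, or will become, overloaded. The amount moved is the
 * smaller of what pulls busiest down to the domain average and what pushes
 * local up to it, scaled by group capacity. SCHED_CAPACITY_SCALE is 1024.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long busiest_avg_load = 1400, busiest_capacity = 1024;
	unsigned long local_avg_load = 600, local_capacity = 1024;
	unsigned long sds_avg_load = 1000;	/* domain-wide average load */

	unsigned long imbalance = min_ul(
		(busiest_avg_load - sds_avg_load) * busiest_capacity,
		(sds_avg_load - local_avg_load) * local_capacity) /
		SCHED_CAPACITY_SCALE;

	/* min(400, 400) = 400: never cross the average from either side */
	printf("load to move: %lu\n", imbalance);
	return 0;
}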
*/ - if (busiest->group_type == group_misfit_task) { - env->imbalance = max_t(long, env->imbalance, - busiest->group_misfit_task_load); - } - - /* - * if *imbalance is less than the average load per runnable task - * there is no guarantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (env->imbalance < busiest->load_per_task) - return fix_small_imbalance(env, sds); } /******* find_busiest_group() helpers end here *********************/ +/* + * Decision matrix according to the local and busiest group type: + * + * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded + * has_spare nr_idle balanced N/A N/A balanced balanced + * fully_busy nr_idle nr_idle N/A N/A balanced balanced + * misfit_task force N/A N/A N/A force force + * asym_packing force force N/A N/A force force + * imbalanced force force N/A N/A force force + * overloaded force force N/A N/A force avg_load + * + * N/A : Not Applicable because already filtered while updating + * statistics. + * balanced : The system is balanced for these 2 groups. + * force : Calculate the imbalance as load migration is probably needed. + * avg_load : Only if imbalance is significant enough. + * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite + * different in groups. + */ + /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. @@ -8525,7 +8817,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) init_sd_lb_stats(&sds); /* - * Compute the various statistics relavent for load balancing at + * Compute the various statistics relevant for load balancing at * this level. */ update_sd_lb_stats(env, &sds); @@ -8540,17 +8832,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; - /* ASYM feature bypasses nice load balance check */ - if (check_asym_packing(env, &sds)) - return sds.busiest; - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || busiest->sum_nr_running == 0) + if (!sds.busiest) goto out_balanced; - /* XXX broken for overlapping NUMA groups */ - sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) - / sds.total_capacity; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + + /* ASYM feature bypasses nice load balance check */ + if (busiest->group_type == group_asym_packing) + goto force_balance; /* * If the busiest group is imbalanced the below checks don't @@ -8561,55 +8853,80 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* - * When dst_cpu is idle, prevent SMP nice and/or asymmetric group - * capacities from resulting in underutilization due to avg_load. - */ - if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && - busiest->group_no_capacity) - goto force_balance; - - /* Misfit tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) - goto force_balance; - - /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ - if (local->avg_load >= busiest->avg_load) + if (local->group_type > busiest->group_type) goto out_balanced; /* - * Don't pull any tasks if this group is already above the domain - * average load. + * When groups are overloaded, use the avg_load to ensure fairness + * between tasks. 
*/ - if (local->avg_load >= sds.avg_load) - goto out_balanced; + if (local->group_type == group_overloaded) { + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) + goto out_balanced; + + /* XXX broken for overlapping NUMA groups */ + sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / + sds.total_capacity; - if (env->idle == CPU_IDLE) { /* - * This CPU is idle. If the busiest group is not overloaded - * and there is no imbalance between this and busiest group - * wrt idle CPUs, it is balanced. The imbalance becomes - * significant if the diff is greater than 1 otherwise we - * might end up to just move the imbalance on another group + * Don't pull any tasks if this group is already above the + * domain average load. */ - if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + if (local->avg_load >= sds.avg_load) goto out_balanced; - } else { + /* - * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use - * imbalance_pct to be conservative. + * If the busiest group is more loaded, use imbalance_pct to be + * conservative. */ if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; } + /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_nr_running > local->sum_nr_running + 1) + goto force_balance; + + if (busiest->group_type != group_overloaded) { + if (env->idle == CPU_NOT_IDLE) + /* + * If the busiest group is not overloaded (and as a + * result the local one too) but this CPU is already + * busy, let another idle CPU try to pull task. + */ + goto out_balanced; + + if (busiest->group_weight > 1 && + local->idle_cpus <= (busiest->idle_cpus + 1)) + /* + * If the busiest group is not overloaded + * and there is no imbalance between this and busiest + * group wrt idle CPUs, it is balanced. The imbalance + * becomes significant if the diff is greater than 1 + * otherwise we might end up to just move the imbalance + * on another group. Of course this applies only if + * there is more than 1 CPU per group. + */ + goto out_balanced; + + if (busiest->sum_h_nr_running == 1) + /* + * busiest doesn't have any tasks waiting to run + */ + goto out_balanced; + } + force_balance: /* Looks like there is an imbalance. Compute it */ - env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -8625,11 +8942,13 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_capacity = 1; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, load; + unsigned long capacity, load, util; + unsigned int nr_running; enum fbq_type rt; rq = cpu_rq(i); @@ -8657,20 +8976,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - /* - * For ASYM_CPUCAPACITY domains with misfit tasks we simply - * seek the "biggest" misfit task. 
- */ - if (env->src_grp_type == group_misfit_task) { - if (rq->misfit_task_load > busiest_load) { - busiest_load = rq->misfit_task_load; - busiest = rq; - } - - continue; - } - capacity = capacity_of(i); + nr_running = rq->cfs.h_nr_running; /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -8680,35 +8987,69 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && - rq->nr_running == 1) + nr_running == 1) continue; - load = cpu_runnable_load(rq); + switch (env->migration_type) { + case migrate_load: + /* + * When comparing with load imbalance, use cpu_load() + * which is not scaled with the CPU capacity. + */ + load = cpu_load(rq); - /* - * When comparing with imbalance, use cpu_runnable_load() - * which is not scaled with the CPU capacity. - */ + if (nr_running == 1 && load > env->imbalance && + !check_cpu_capacity(rq, env->sd)) + break; - if (rq->nr_running == 1 && load > env->imbalance && - !check_cpu_capacity(rq, env->sd)) - continue; + /* + * For the load comparisons with the other CPUs, + * consider the cpu_load() scaled with the CPU + * capacity, so that the load can be moved away + * from the CPU that is potentially running at a + * lower capacity. + * + * Thus we're looking for max(load_i / capacity_i), + * crosswise multiplication to rid ourselves of the + * division works out to: + * load_i * capacity_j > load_j * capacity_i; + * where j is our previous maximum. + */ + if (load * busiest_capacity > busiest_load * capacity) { + busiest_load = load; + busiest_capacity = capacity; + busiest = rq; + } + break; + + case migrate_util: + util = cpu_util(cpu_of(rq)); + + if (busiest_util < util) { + busiest_util = util; + busiest = rq; + } + break; + + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; + busiest = rq; + } + break; + + case migrate_misfit: + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we + * simply seek the "biggest" misfit task. + */ + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + break; - /* - * For the load comparisons with the other CPU's, consider - * the cpu_runnable_load() scaled with the CPU capacity, so - * that the load can be moved away from the CPU that is - * potentially running at a lower capacity. - * - * Thus we're looking for max(load_i / capacity_i), crosswise - * multiplication to rid ourselves of the division works out - * to: load_i * capacity_j > load_j * capacity_i; where j is - * our previous maximum. - */ - if (load * busiest_capacity > busiest_load * capacity) { - busiest_load = load; - busiest_capacity = capacity; - busiest = rq; } } @@ -8754,7 +9095,7 @@ voluntary_active_balance(struct lb_env *env) return 1; } - if (env->src_grp_type == group_misfit_task) + if (env->migration_type == migrate_misfit) return 1; return 0; @@ -9047,9 +9388,10 @@ more_balance: out_balanced: /* * We reach balance although we may have faced some affinity - * constraints. Clear the imbalance flag if it was set. + * constraints. Clear the imbalance flag only if other tasks got + * a chance to move and fix the imbalance. 
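/*
 * Standalone demo (invented CPU figures): the cross-multiplied comparison
 * used for migrate_load in find_busiest_queue(), i.e. finding
 * max(load_i / capacity_i) without a division.
 */
#include <stdio.h>

int main(void)
{
	/* Current maximum. */
	unsigned long busiest_load = 800, busiest_capacity = 1024;
	/* Candidate CPU: less absolute load, but half the capacity. */
	unsigned long load = 500, capacity = 512;

	/* load/capacity > busiest_load/busiest_capacity, division-free: */
	if (load * busiest_capacity > busiest_load * capacity)
		printf("candidate is relatively busier (%.2f > %.2f)\n",
		       (double)load / capacity,
		       (double)busiest_load / busiest_capacity);
	return 0;
}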
*/ - if (sd_parent) { + if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if (*group_imbalance) @@ -9070,10 +9412,10 @@ out_one_pinned: ld_moved = 0; /* - * idle_balance() disregards balance intervals, so we could repeatedly - * reach this code, which would lead to balance_interval skyrocketting - * in a short amount of time. Skip the balance_interval increase logic - * to avoid that. + * newidle_balance() disregards balance intervals, so we could + * repeatedly reach this code, which would lead to balance_interval + * skyrocketting in a short amount of time. Skip the balance_interval + * increase logic to avoid that. */ if (env.idle == CPU_NEWLY_IDLE) goto out; @@ -9227,6 +9569,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; + int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -9263,7 +9606,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { @@ -9279,9 +9622,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); } if (need_serialize) spin_unlock(&balancing); @@ -9782,8 +10126,13 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. + * + * Returns: + * < 0 - we released the lock and there are !fair tasks present + * 0 - failed, no new tasks + * > 0 - success, new (fair) tasks present */ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -9791,6 +10140,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -10025,6 +10375,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; + if (rq->cfs.nr_running == 1) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -10115,7 +10468,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } @@ -10175,9 +10528,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. 
*/ -static void set_curr_task_fair(struct rq *rq) +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { - struct sched_entity *se = &rq->curr->se; + struct sched_entity *se = &p->se; + +#ifdef CONFIG_SMP + if (task_on_rq_queued(p)) { + /* + * Move the next running task to the front of the list, so our + * cfs_tasks list becomes MRU one. + */ + list_move(&se->group_node, &rq->cfs_tasks); + } +#endif for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10295,18 +10658,18 @@ err: void online_fair_sched_group(struct task_group *tg) { struct sched_entity *se; + struct rq_flags rf; struct rq *rq; int i; for_each_possible_cpu(i) { rq = cpu_rq(i); se = tg->se[i]; - - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } } @@ -10447,10 +10810,12 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, - .pick_next_task = pick_next_task_fair, + .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP + .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -10461,7 +10826,6 @@ const struct sched_class fair_sched_class = { .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 2410db5e9a35..7481cd96f391 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -89,3 +89,4 @@ SCHED_FEAT(WA_BIAS, true) * UtilEstimation. Use estimated CPU utilization. */ SCHED_FEAT(UTIL_EST, true) +SCHED_FEAT(UTIL_EST_FASTUP, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..b743bf38f08f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * update no idle residency and return. */ if (current_clr_polling_and_test()) { - dev->last_residency = 0; + dev->last_residency_ns = 0; local_irq_enable(); return -EBUSY; } @@ -158,14 +158,16 @@ static void cpuidle_idle_call(void) /* * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only - * activity happens here and in iterrupts (if any). In that case bypass + * activity happens here and in interrupts (if any). In that case bypass * the cpuidle governor and go stratight for the deepest idle state * available. Possibly also suspend the local tick and the entire * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. 
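
Aside: set_next_task_fair() above keeps cfs_tasks in MRU order with list_move(). A self-contained sketch of that move-to-front discipline on a circular doubly linked list, loosely modeled on the kernel's list helpers (illustrative, not <linux/list.h>):

struct node {
        struct node *prev, *next;
};

static void list_init(struct node *head)
{
        head->prev = head->next = head;
}

static void list_del_node(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

static void list_add_front(struct node *n, struct node *head)
{
        n->next = head->next;
        n->prev = head;
        head->next->prev = n;
        head->next = n;
}

/* Move an already-linked @n to the front of @head: the MRU ordering used
 * for rq->cfs_tasks when a task is picked to run. */
static void list_move_front(struct node *n, struct node *head)
{
        list_del_node(n);
        list_add_front(n, head);
}
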
*/ - if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { + u64 max_latency_ns; + if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -176,12 +178,16 @@ static void cpuidle_idle_call(void) } rcu_idle_exit(); + + max_latency_ns = U64_MAX; + } else { + max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); rcu_idle_enter(); - next_state = cpuidle_find_deepest_state(drv, dev); + next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); } else { bool stop_tick = true; @@ -238,16 +244,16 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - check_pgt_cache(); rmb(); + local_irq_disable(); + if (cpu_is_offline(cpu)) { - tick_nohz_idle_stop_tick_protected(); + tick_nohz_idle_stop_tick(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } - local_irq_disable(); arch_cpu_idle_enter(); /* @@ -311,7 +317,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -void play_idle(unsigned long duration_ms) +void play_idle_precise(u64 duration_ns, u64 latency_ns) { struct idle_timer it; @@ -323,28 +329,29 @@ void play_idle(unsigned long duration_ms) WARN_ON_ONCE(current->nr_cpus_allowed != 1); WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); - WARN_ON_ONCE(!duration_ms); + WARN_ON_ONCE(!duration_ns); rcu_sleep_check(); preempt_disable(); current->flags |= PF_IDLE; - cpuidle_use_deepest_state(true); + cpuidle_use_deepest_state(latency_ns); it.done = 0; hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); it.timer.function = idle_inject_timer_fn; - hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + hrtimer_start(&it.timer, ns_to_ktime(duration_ns), + HRTIMER_MODE_REL_PINNED); while (!READ_ONCE(it.done)) do_idle(); - cpuidle_use_deepest_state(false); + cpuidle_use_deepest_state(0); current->flags &= ~PF_IDLE; preempt_fold_need_resched(); preempt_enable(); } -EXPORT_SYMBOL_GPL(play_idle); +EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { @@ -364,6 +371,12 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static int +balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return WARN_ON_ONCE(1); +} #endif /* @@ -374,14 +387,23 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { - put_prev_task(rq, prev); update_idle_core(rq); schedstat_inc(rq->sched_goidle); +} - return rq->idle; +struct task_struct *pick_next_task_idle(struct rq *rq) +{ + struct task_struct *next = rq->idle; + + set_next_task_idle(rq, next, true); + + return next; } /* @@ -397,10 +419,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) raw_spin_lock_irq(&rq->lock); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ -} - /* * scheduler tick hitting a task of our scheduling class. 
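
Aside: play_idle_precise() above takes a nanosecond duration plus a latency limit, and the idle loop uses U64_MAX when there is no exit-latency constraint. A hypothetical millisecond convenience helper, only to illustrate the unit conversion and the "no constraint" value (the in-tree wrapper, if any, may differ):

#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

struct idle_request {
        uint64_t duration_ns;
        uint64_t latency_limit_ns;
};

/* Hypothetical helper: request @duration_ms of injected idle with no
 * exit-latency constraint, i.e. the deepest available state. */
static struct idle_request make_idle_request_ms(uint64_t duration_ms)
{
        struct idle_request req = {
                .duration_ns = duration_ms * NSEC_PER_MSEC,
                .latency_limit_ns = UINT64_MAX,
        };
        return req;
}
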
* @@ -413,10 +431,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } -static void set_curr_task_idle(struct rq *rq) -{ -} - static void switched_to_idle(struct rq *rq, struct task_struct *p) { BUG(); @@ -451,13 +465,14 @@ const struct sched_class idle_sched_class = { .pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, + .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP + .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, .get_rr_interval = get_rr_interval_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index ccb28085b114..008d6ac2342b 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -22,9 +22,17 @@ EXPORT_SYMBOL_GPL(housekeeping_enabled); int housekeeping_any_cpu(enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) + int cpu; + + if (static_branch_unlikely(&housekeeping_overridden)) { + if (housekeeping_flags & flags) { + cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); + if (cpu < nr_cpu_ids) + return cpu; + return cpumask_any_and(housekeeping_mask, cpu_online_mask); + } + } return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); @@ -155,6 +163,12 @@ static int __init housekeeping_isolcpus_setup(char *str) continue; } + if (!strncmp(str, "managed_irq,", 12)) { + str += 12; + flags |= HK_FLAG_MANAGED_IRQ; + continue; + } + pr_warn("isolcpus: Error, unknown flag\n"); return 0; } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 28a516575c18..de22da666ac7 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_nohz_start(void) +static void calc_load_nohz_fold(struct rq *rq) { - struct rq *this_rq = this_rq(); long delta; - /* - * We're going into NO_HZ mode, if there's any pending delta, fold it - * into the pending NO_HZ delta. - */ - delta = calc_load_fold_active(this_rq, 0); + delta = calc_load_fold_active(rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -248,6 +243,24 @@ void calc_load_nohz_start(void) } } +void calc_load_nohz_start(void) +{ + /* + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. + */ + calc_load_nohz_fold(this_rq()); +} + +/* + * Keep track of the load for NOHZ_FULL, must be called between + * calc_load_nohz_{start,stop}(). + */ +void calc_load_nohz_remote(struct rq *rq) +{ + calc_load_nohz_fold(rq); +} + void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -268,7 +281,7 @@ void calc_load_nohz_stop(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_nohz_fold(void) +static long calc_load_nohz_read(void) { int idx = calc_load_read_idx(); long delta = 0; @@ -323,7 +336,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_nohz_fold(void) { return 0; } +static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks) /* * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. 
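
Aside: the new "managed_irq," handling in housekeeping_isolcpus_setup() follows the existing strncmp-and-advance pattern for the isolcpus= flag prefixes. A standalone sketch of that pattern; the flag bits are illustrative and the remainder of the string is treated as the cpulist:

#include <string.h>

#define FLAG_NOHZ        (1u << 0)
#define FLAG_DOMAIN      (1u << 1)
#define FLAG_MANAGED_IRQ (1u << 2)

/* Consume leading "flag," prefixes, OR the matching bits into @flags, and
 * return a pointer to the first token that is not a known flag (the
 * cpulist in the real parameter). */
static const char *parse_isol_flags(const char *str, unsigned int *flags)
{
        for (;;) {
                if (!strncmp(str, "nohz,", 5)) {
                        str += 5;
                        *flags |= FLAG_NOHZ;
                } else if (!strncmp(str, "domain,", 7)) {
                        str += 7;
                        *flags |= FLAG_DOMAIN;
                } else if (!strncmp(str, "managed_irq,", 12)) {
                        str += 12;
                        *flags |= FLAG_MANAGED_IRQ;
                } else {
                        return str;
                }
        }
}
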
*/ - delta = calc_load_nohz_fold(); + delta = calc_load_nohz_read(); if (delta) atomic_long_add(delta, &calc_load_tasks); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index aa8d75804108..168479a7d61b 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -30,10 +30,42 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_rq_state(void *info) +{ + struct mm_struct *mm = (struct mm_struct *) info; + + if (current->mm != mm) + return; + this_cpu_write(runqueues.membarrier_state, + atomic_read(&mm->membarrier_state)); + /* + * Issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to + * guarantee that no memory access following registration is reordered + * before registration. + */ + smp_mb(); +} + +void membarrier_exec_mmap(struct mm_struct *mm) +{ + /* + * Issue a memory barrier before clearing membarrier_state to + * guarantee that no memory access prior to exec is reordered after + * clearing this state. + */ + smp_mb(); + atomic_set(&mm->membarrier_state, 0); + /* + * Keep the runqueue membarrier_state in sync with this mm + * membarrier_state. + */ + this_cpu_write(runqueues.membarrier_state, 0); +} + static int membarrier_global_expedited(void) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; if (num_online_cpus() == 1) @@ -45,17 +77,11 @@ static int membarrier_global_expedited(void) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -70,23 +96,28 @@ static int membarrier_global_expedited(void) if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); - if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & - MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); - } - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); + if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) + continue; + + /* + * Skip the CPU if it runs a kernel thread. The scheduler + * leaves the prior task mm in place as an optimization when + * scheduling a kthread. 
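
Aside: the reworked membarrier_global_expedited() first records, under the RCU read lock, which CPUs need an IPI and then sends them in one batch. A userspace-flavoured sketch of that collect-then-signal shape; the callbacks and the 64-bit mask are simplifications, not kernel types:

#include <stdint.h>

#define MAX_CPUS 64

typedef int  (*needs_ipi_fn)(int cpu);
typedef void (*send_ipi_fn)(uint64_t mask);

/* Scan all candidate CPUs, remember the ones that match, then signal them
 * in a single batched call (the smp_call_function_many() role). */
static void expedite(needs_ipi_fn needs_ipi, send_ipi_fn send_many, int nr_cpus)
{
        uint64_t mask = 0;
        int cpu;

        for (cpu = 0; cpu < nr_cpus && cpu < MAX_CPUS; cpu++) {
                if (needs_ipi(cpu))
                        mask |= 1ULL << cpu;
        }
        send_many(mask);
}
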
+ */ + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p->flags & PF_KTHREAD) + continue; + + __cpumask_set_cpu(cpu, tmpmask); } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -101,22 +132,22 @@ static int membarrier_global_expedited(void) static int membarrier_private_expedited(int flags) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; + struct mm_struct *mm = current->mm; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; } else { - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; } - if (num_online_cpus() == 1) + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) return 0; /* @@ -125,17 +156,11 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -149,22 +174,17 @@ static int membarrier_private_expedited(int flags) */ if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); - if (p && p->mm == current->mm) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); - } - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -177,32 +197,78 @@ static int membarrier_private_expedited(int flags) return 0; } +static int sync_runqueues_membarrier_state(struct mm_struct *mm) +{ + int membarrier_state = atomic_read(&mm->membarrier_state); + cpumask_var_t tmpmask; + int cpu; + + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) { + this_cpu_write(runqueues.membarrier_state, membarrier_state); + + /* + * For single mm user, we can simply issue a memory barrier + * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the + * mm and in the current runqueue to guarantee that no memory + * access following registration is reordered before + * registration. + */ + smp_mb(); + return 0; + } + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + /* + * For mm with multiple users, we need to ensure all future + * scheduler executions will observe @mm's new membarrier + * state. + */ + synchronize_rcu(); + + /* + * For each cpu runqueue, if the task's mm match @mm, ensure that all + * @mm's membarrier state set bits are also set in in the runqueue's + * membarrier state. 
This ensures that a runqueue scheduling + * between threads which are users of @mm has its membarrier state + * updated. + */ + cpus_read_lock(); + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + p = rcu_dereference(rq->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); + cpus_read_unlock(); + + return 0; +} + static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + int ret; if (atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { - /* - * For single mm user, single threaded process, we can - * simply issue a memory barrier after setting - * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that - * no memory access following registration is reordered - * before registration. - */ - smp_mb(); - } else { - /* - * For multi-mm user threads, we need to ensure all - * future scheduler executions will observe the new - * thread flag state for this mm. - */ - synchronize_rcu(); - } + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -213,12 +279,15 @@ static int membarrier_register_private_expedited(int flags) { struct task_struct *p = current; struct mm_struct *mm = p->mm; - int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, + ret; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; } /* @@ -226,20 +295,15 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if (atomic_read(&mm->membarrier_state) & state) + if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state) return 0; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, - &mm->membarrier_state); - if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { - /* - * Ensure all future scheduler executions will observe the - * new thread flag state for this process. - */ - synchronize_rcu(); - } - atomic_or(state, &mm->membarrier_state); + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + atomic_or(set_state, &mm->membarrier_state); + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; + atomic_or(ready_state, &mm->membarrier_state); return 0; } @@ -253,8 +317,10 @@ static int membarrier_register_private_expedited(int flags) * command specified does not exist, not available on the running * kernel, or if the command argument is invalid, this system call * returns -EINVAL. For a given command, with flags argument set to 0, - * this system call is guaranteed to always return the same value until - * reboot. + * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to + * always return the same value until reboot. 
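
Aside: membarrier registration above follows a two-phase pattern: publish the requested state bits, synchronize every observer, and only then set the READY bit that later commands test. A sketch with C11 atomics, with illustrative bit values and a caller-supplied stand-in for sync_runqueues_membarrier_state():

#include <stdatomic.h>

#define STATE_PRIVATE_EXPEDITED        (1u << 0)
#define STATE_PRIVATE_EXPEDITED_READY  (1u << 1)

static int register_expedited(atomic_uint *state, int (*sync_all)(void))
{
        int ret;

        if ((atomic_load(state) & STATE_PRIVATE_EXPEDITED_READY) ==
            STATE_PRIVATE_EXPEDITED_READY)
                return 0;                       /* already registered */

        atomic_fetch_or(state, STATE_PRIVATE_EXPEDITED);
        ret = sync_all();                       /* propagate state to all observers */
        if (ret)
                return ret;
        atomic_fetch_or(state, STATE_PRIVATE_EXPEDITED_READY);
        return 0;
}
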
In addition, it can return + * -ENOMEM if there is not enough memory available to perform the system + * call. * * All memory accesses performed in program order from each targeted thread * is guaranteed to be ordered with respect to sys_membarrier(). If we use diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a96db50d40e0..bd006b79b360 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -129,8 +129,20 @@ accumulate_sum(u64 delta, struct sched_avg *sa, * Step 2 */ delta %= 1024; - contrib = __accumulate_pelt_segments(periods, - 1024 - sa->period_contrib, delta); + if (load) { + /* + * This relies on the: + * + * if (!load) + * runnable = running = 0; + * + * clause from ___update_load_sum(); this results in + * the below usage of @contrib to dissapear entirely, + * so no point in calculating it. + */ + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } } sa->period_contrib = delta; @@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, * this happens during idle_balance() which calls - * update_blocked_averages() + * update_blocked_averages(). + * + * Also see the comment in accumulate_sum(). */ if (!load) runnable = running = 0; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 23fbbcc414d5..028520702717 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -185,7 +185,8 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->avg_next_update = sched_clock() + psi_period; + group->avg_last_update = sched_clock(); + group->avg_next_update = group->avg_last_update + psi_period; INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ @@ -481,7 +482,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value) u32 remaining; remaining = win->size - elapsed; - growth += div_u64(win->prev_growth * remaining, win->size); + growth += div64_u64(win->prev_growth * remaining, win->size); } return growth; @@ -1131,7 +1132,15 @@ static void psi_trigger_destroy(struct kref *ref) * deadlock while waiting for psi_poll_work to acquire trigger_lock */ if (kworker_to_destroy) { + /* + * After the RCU grace period has expired, the worker + * can no longer be found through group->poll_kworker. + * But it might have been already scheduled before + * that - deschedule it cleanly before destroying it. 
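
Aside: window_update() in psi.c carries part of the previous window's growth into the sliding window, proportional to the overlap that remains; the hunk above switches that division to div64_u64(). A plain restatement of the arithmetic, assuming elapsed time is measured from the start of the current window:

#include <stdint.h>

/* Growth carried over from the previous window: prev_growth scaled by the
 * fraction of the old window still covered by the sliding window. */
static uint64_t window_carryover(uint64_t prev_growth, uint64_t elapsed,
                                 uint64_t win_size)
{
        uint64_t remaining;

        if (elapsed >= win_size)
                return 0;

        remaining = win_size - elapsed;
        return prev_growth * remaining / win_size;
}
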
+ */ kthread_cancel_delayed_work_sync(&group->poll_work); + atomic_set(&group->poll_scheduled, 0); + kthread_destroy_worker(kworker_to_destroy); } kfree(t); @@ -1190,7 +1199,10 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - buf_size = min(nbytes, (sizeof(buf) - 1)); + if (!nbytes) + return -EINVAL; + + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; @@ -1242,39 +1254,41 @@ static int psi_fop_release(struct inode *inode, struct file *file) return single_release(inode, file); } -static const struct file_operations psi_io_fops = { - .open = psi_io_open, - .read = seq_read, - .llseek = seq_lseek, - .write = psi_io_write, - .poll = psi_fop_poll, - .release = psi_fop_release, +static const struct proc_ops psi_io_proc_ops = { + .proc_open = psi_io_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_io_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; -static const struct file_operations psi_memory_fops = { - .open = psi_memory_open, - .read = seq_read, - .llseek = seq_lseek, - .write = psi_memory_write, - .poll = psi_fop_poll, - .release = psi_fop_release, +static const struct proc_ops psi_memory_proc_ops = { + .proc_open = psi_memory_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_memory_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; -static const struct file_operations psi_cpu_fops = { - .open = psi_cpu_open, - .read = seq_read, - .llseek = seq_lseek, - .write = psi_cpu_write, - .poll = psi_fop_poll, - .release = psi_fop_release, +static const struct proc_ops psi_cpu_proc_ops = { + .proc_open = psi_cpu_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_cpu_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; static int __init psi_proc_init(void) { - proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_fops); - proc_create("pressure/memory", 0, NULL, &psi_memory_fops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + if (psi_enable) { + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_proc_ops); + proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops); + } return 0; } module_init(psi_proc_init); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a532558a5176..4043abe45459 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -45,8 +45,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); rt_b->rt_period_timer.function = sched_rt_period_timer; } @@ -67,7 +67,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) * to update the period. */ hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); - hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock(&rt_b->rt_runtime_lock); } @@ -436,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return rt_se->on_rq; } +#ifdef CONFIG_UCLAMP_TASK +/* + * Verify the fitness of task @p to run on @cpu taking into account the uclamp + * settings. 
+ * + * This check is only important for heterogeneous systems where uclamp_min value + * is higher than the capacity of a @cpu. For non-heterogeneous system this + * function will always return true. + * + * The function will return true if the capacity of the @cpu is >= the + * uclamp_min and false otherwise. + * + * Note that uclamp_min will be clamped to uclamp_max if uclamp_min + * > uclamp_max. + */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned int min_cap; + unsigned int max_cap; + unsigned int cpu_cap; + + /* Only heterogeneous systems can benefit from this check */ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return true; + + min_cap = uclamp_eff_value(p, UCLAMP_MIN); + max_cap = uclamp_eff_value(p, UCLAMP_MAX); + + cpu_cap = capacity_orig_of(cpu); + + return cpu_cap >= min(min_cap, max_cap); +} +#else +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + return true; +} +#endif + #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) @@ -1390,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool test; /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1421,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. + * + * We take into account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on heterogeneous + * systems like big.LITTLE. */ - if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + test = curr && + unlikely(rt_task(curr)) && + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); /* @@ -1448,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) return; /* * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) + if (p->nr_cpus_allowed != 1 && + cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) return; /* @@ -1468,6 +1515,22 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. 
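
Aside: a standalone restatement of the rt_task_fits_capacity() check introduced above. Plain parameters stand in for uclamp_eff_value() and capacity_orig_of(); a CPU fits when its original capacity covers the task's effective uclamp minimum, with uclamp_min capped by uclamp_max:

#include <stdbool.h>

static bool fits_capacity(unsigned int uclamp_min, unsigned int uclamp_max,
                          unsigned int cpu_cap)
{
        unsigned int min_cap = uclamp_min < uclamp_max ? uclamp_min : uclamp_max;

        return cpu_cap >= min_cap;
}
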
+ */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1498,12 +1561,25 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); + + if (!first) + return; + + /* + * If prev task was rt, put_prev_task() has already updated the + * utilization. We only care of the case where we start to schedule a + * rt task + */ + if (rq->curr->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + rt_queue_push_tasks(rq); } static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -1537,58 +1613,15 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static struct task_struct *pick_next_task_rt(struct rq *rq) { struct task_struct *p; - struct rt_rq *rt_rq = &rq->rt; - - if (need_pull_rt_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_rt_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_rt_task() can drop (and re-acquire) rq->lock; this - * means a dl or stop task can slip in, in which case we need - * to re-start task selection. - */ - if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || - rq->dl.dl_nr_running)) - return RETRY_TASK; - } - - /* - * We may dequeue prev's rt_rq in put_prev_task(). - * So, we update time before rt_queued check. - */ - if (prev->sched_class == &rt_sched_class) - update_curr_rt(rq); - if (!rt_rq->rt_queued) + if (!sched_rt_runnable(rq)) return NULL; - put_prev_task(rq, prev); - p = _pick_next_task_rt(rq); - - set_next_task(rq, p); - - rt_queue_push_tasks(rq); - - /* - * If prev task was rt, put_prev_task() has already updated the - * utilization. 
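
Aside: find_lowest_rq() now passes a fitness callback (rt_task_fits_capacity) into cpupri_find(), so capacity-unfit CPUs are filtered out of the lowest-priority mask. A sketch of that callback-filter shape, with the mask simplified to a 64-bit word and a NULL callback keeping the old behaviour:

#include <stdbool.h>
#include <stdint.h>

struct task;    /* opaque for the sketch */

typedef bool (*fitness_fn)(struct task *p, int cpu);

static uint64_t filter_candidates(uint64_t candidates, struct task *p,
                                  fitness_fn fits, int nr_cpus)
{
        uint64_t out = 0;
        int cpu;

        if (!fits)
                return candidates;

        for (cpu = 0; cpu < nr_cpus && cpu < 64; cpu++) {
                if ((candidates & (1ULL << cpu)) && fits(p, cpu))
                        out |= 1ULL << cpu;
        }
        return out;
}
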
We only care of the case where we start to schedule a - * rt task - */ - if (rq->curr->sched_class != &rt_sched_class) - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); - + set_next_task_rt(rq, p, true); return p; } @@ -1614,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, p->cpus_ptr) && + rt_task_fits_capacity(p, cpu)) return 1; return 0; @@ -1656,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, + rt_task_fits_capacity)) return -1; /* No targets found */ /* @@ -2160,12 +2195,14 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) + bool need_to_push = !task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + (dl_task(rq->curr) || rt_task(rq->curr)) && + (rq->curr->nr_cpus_allowed < 2 || + rq->curr->prio <= p->prio); + + if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) push_rt_tasks(rq); } @@ -2237,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + bool need_to_push = rq->rt.overloaded || + !rt_task_fits_capacity(p, cpu_of(rq)); + + if (p->nr_cpus_allowed > 1 && need_to_push) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) @@ -2304,8 +2344,10 @@ static void watchdog(struct rq *rq, struct task_struct *p) } next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); - if (p->rt.timeout > next) - p->cputime_expires.sched_exp = p->se.sum_exec_runtime; + if (p->rt.timeout > next) { + posix_cputimers_rt_watchdog(&p->posix_cputimers, + p->se.sum_exec_runtime); + } } } #else @@ -2354,11 +2396,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) } } -static void set_curr_task_rt(struct rq *rq) -{ - set_next_task(rq, rq->curr); -} - static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* @@ -2380,10 +2417,11 @@ const struct sched_class rt_sched_class = { .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP + .balance = balance_rt, .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, @@ -2391,7 +2429,6 @@ const struct sched_class rt_sched_class = { .switched_from = switched_from_rt, #endif - .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, .get_rr_interval = get_rr_interval_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 802b1f3405f2..9ea647835fd6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -335,8 +335,6 @@ struct cfs_bandwidth { u64 quota; u64 runtime; s64 hierarchical_quota; - u64 runtime_expires; - int expires_seq; u8 idle; u8 period_active; @@ -393,6 +391,16 @@ struct task_group { #endif struct cfs_bandwidth cfs_bandwidth; + +#ifdef 
CONFIG_UCLAMP_TASK_GROUP + /* The two decimal precision [%] value requested from user-space */ + unsigned int uclamp_pct[UCLAMP_CNT]; + /* Clamp values requested for a task group */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a task group */ + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -483,7 +491,8 @@ struct cfs_rq { struct load_weight load; unsigned long runnable_weight; unsigned int nr_running; - unsigned int h_nr_running; + unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ u64 exec_clock; u64 min_vruntime; @@ -556,8 +565,6 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; - int expires_seq; - u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock; @@ -777,9 +784,6 @@ struct root_domain { struct perf_domain __rcu *pd; }; -extern struct root_domain def_root_domain; -extern struct mutex sched_domains_mutex; - extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); @@ -892,7 +896,7 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr; + struct task_struct __rcu *curr; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; @@ -907,6 +911,10 @@ struct rq { atomic_t nr_iowait; +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -1261,16 +1269,18 @@ enum numa_topology_type { extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); -#endif - -#ifdef CONFIG_NUMA extern void sched_init_numa(void); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); +extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); #else static inline void sched_init_numa(void) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + return nr_cpu_ids; +} #endif #ifdef CONFIG_NUMA_BALANCING @@ -1449,10 +1459,14 @@ static inline void unregister_sched_domain_sysctl(void) } #endif +extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + #else static inline void sched_ttwu_pending(void) { } +static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } + #endif /* CONFIG_SMP */ #include "stats.h" @@ -1699,20 +1713,13 @@ struct sched_class { void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); - /* - * It is the responsibility of the pick_next_task() method that will - * return the next task to call put_prev_task() on the @prev task or - * something equivalent. - * - * May return RETRY_TASK when it finds a higher prio class has runnable - * tasks. 
- */ - struct task_struct * (*pick_next_task)(struct rq *rq, - struct task_struct *prev, - struct rq_flags *rf); + struct task_struct *(*pick_next_task)(struct rq *rq); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -1725,7 +1732,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); #endif - void (*set_curr_task)(struct rq *rq); void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); void (*task_dead)(struct task_struct *p); @@ -1755,12 +1761,14 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { + WARN_ON_ONCE(rq->curr != prev); prev->sched_class->put_prev_task(rq, prev); } -static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +static inline void set_next_task(struct rq *rq, struct task_struct *next) { - curr->sched_class->set_curr_task(rq); + WARN_ON_ONCE(rq->curr != next); + next->sched_class->set_next_task(rq, next, false); } #ifdef CONFIG_SMP @@ -1768,8 +1776,12 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr) #else #define sched_class_highest (&dl_sched_class) #endif + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = class->next) + #define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) + for_class_range(class, sched_class_highest, NULL) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -1777,6 +1789,28 @@ extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; +static inline bool sched_stop_runnable(struct rq *rq) +{ + return rq->stop && task_on_rq_queued(rq->stop); +} + +static inline bool sched_dl_runnable(struct rq *rq) +{ + return rq->dl.dl_nr_running > 0; +} + +static inline bool sched_rt_runnable(struct rq *rq) +{ + return rq->rt.rt_queued > 0; +} + +static inline bool sched_fair_runnable(struct rq *rq) +{ + return rq->cfs.nr_running > 0; +} + +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +extern struct task_struct *pick_next_task_idle(struct rq *rq); #ifdef CONFIG_SMP @@ -1943,7 +1977,7 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -1995,7 +2029,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
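
Aside: the new for_class_range()/for_each_class() macros above walk a NULL-terminated, priority-ordered list of scheduling classes. A runnable analogue (the struct, the extra "highest" parameter and the names are illustrative, not the kernel's definitions):

#include <stdio.h>

struct klass {
        const char *name;
        const struct klass *next;
};

#define for_class_range(c, _from, _to) \
        for ((c) = (_from); (c) != (_to); (c) = (c)->next)

#define for_each_class(c, highest) for_class_range(c, highest, NULL)

static const struct klass fair = { "fair", NULL };
static const struct klass rt   = { "rt",   &fair };
static const struct klass dl   = { "dl",   &rt };

int main(void)
{
        const struct klass *c;

        for_each_class(c, &dl)
                printf("%s\n", c->name);        /* dl, rt, fair */
        return 0;
}
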
@@ -2266,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline -unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { - unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2290,18 +2324,10 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util, return clamp(util, min_util, max_util); } - -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) -{ - return uclamp_util_with(rq, util, NULL); -} #else /* CONFIG_UCLAMP_TASK */ -static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) -{ - return util; -} -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { return util; } @@ -2423,3 +2449,46 @@ static inline bool sched_energy_enabled(void) static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. 
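
Aside: uclamp_rq_util_with() above bounds a utilization estimate by the runqueue's aggregated clamps, raised by a specific task's effective clamps when one is passed in. A plain-integer restatement, assuming the task's max clamp is combined the same way as its min clamp:

static unsigned long clamp_util(unsigned long util,
                                unsigned long rq_min, unsigned long rq_max,
                                unsigned long task_min, unsigned long task_max)
{
        unsigned long min_util = rq_min > task_min ? rq_min : task_min;
        unsigned long max_util = rq_max > task_max ? rq_max : task_max;
        unsigned long v = util;

        /* clamp(util, min_util, max_util) */
        if (v < min_util)
                v = min_util;
        if (v > max_util)
                v = max_util;
        return v;
}
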
+ */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif + +#ifdef CONFIG_SMP +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} +#endif diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index aa0de240fb41..ba683fe81a6e 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { unsigned long long now = rq_clock(rq), delta = 0; - if (unlikely(sched_info_on())) + if (sched_info_on()) { if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; + } sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; @@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) */ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { - if (unlikely(sched_info_on())) { + if (sched_info_on()) { if (!t->sched_info.last_queued) t->sched_info.last_queued = rq_clock(rq); } @@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct static inline void sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - if (unlikely(sched_info_on())) + if (sched_info_on()) __sched_info_switch(rq, prev, next); } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c183b790ca54..4c9e9975684f 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -15,6 +15,12 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } + +static int +balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return sched_stop_runnable(rq); +} #endif /* CONFIG_SMP */ static void @@ -23,19 +29,18 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } -static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first) { - struct task_struct *stop = rq->stop; + stop->se.exec_start = rq_clock_task(rq); +} - if (!stop || !task_on_rq_queued(stop)) +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + if (!sched_stop_runnable(rq)) return NULL; - put_prev_task(rq, prev); - - stop->se.exec_start = rq_clock_task(rq); - - return stop; + set_next_task_stop(rq, rq->stop, true); + return rq->stop; } static void @@ -86,13 +91,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void set_curr_task_stop(struct rq *rq) -{ - struct task_struct *stop = rq->stop; - - stop->se.exec_start = rq_clock_task(rq); -} - static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ @@ -128,13 +126,14 @@ const struct sched_class stop_sched_class = { .pick_next_task = pick_next_task_stop, .put_prev_task = put_prev_task_stop, + 
.set_next_task = set_next_task_stop, #ifdef CONFIG_SMP + .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_stop, .task_tick = task_tick_stop, .get_rr_interval = get_rr_interval_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f751ce0b783e..dfb64c08a407 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1201,16 +1201,13 @@ static void set_domain_attribute(struct sched_domain *sd, if (!attr || attr->relax_domain_level < 0) { if (default_relax_domain_level < 0) return; - else - request = default_relax_domain_level; + request = default_relax_domain_level; } else request = attr->relax_domain_level; - if (request < sd->level) { + + if (sd->level > request) { /* Turn off idle balance on this domain: */ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* Turn on idle balance on this domain: */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } @@ -1284,6 +1281,7 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; +int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; #endif /* @@ -1402,7 +1400,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) { sd->flags &= ~(SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE); @@ -1724,6 +1722,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu) } } +/* + * sched_numa_find_closest() - given the NUMA topology, find the cpu + * closest to @cpu from @cpumask. + * cpumask: cpumask to find a cpu from + * cpu: cpu to be close to + * + * returns: cpu, or nr_cpu_ids when nothing found. + */ +int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + int i, j = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); + if (cpu < nr_cpu_ids) + return cpu; + } + return nr_cpu_ids; +} + #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -1862,6 +1880,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve } /* + * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for + * any two given CPUs at this (non-NUMA) topology level. + */ +static bool topology_span_sane(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, int cpu) +{ + int i; + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) + return true; + + /* + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. + */ + for_each_cpu(i, cpu_map) { + if (i == cpu) + continue; + /* + * We should 'and' all those masks with 'cpu_map' to exactly + * match the topology we're about to build, but that can only + * remove CPUs, which only lessens our ability to detect + * overlaps + */ + if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && + cpumask_intersects(tl->mask(cpu), tl->mask(i))) + return false; + } + + return true; +} + +/* * Find the sched_domain_topology_level where all CPU capacities are visible * for all CPUs. 
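
Aside: topology_span_sane() enforces that, at a non-NUMA level, the spans of any two CPUs are either identical or completely disjoint. The same invariant restated over 64-bit masks, one span per CPU (illustrative, not the kernel's cpumask API):

#include <stdbool.h>
#include <stdint.h>

static bool spans_sane(const uint64_t *span, int nr_cpus)
{
        int i, j;

        for (i = 0; i < nr_cpus; i++) {
                for (j = i + 1; j < nr_cpus; j++) {
                        if (span[i] != span[j] && (span[i] & span[j]))
                                return false;   /* partial overlap: broken topology */
                }
        }
        return true;
}
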
*/ @@ -1927,7 +1981,7 @@ next_level: static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - enum s_alloc alloc_state; + enum s_alloc alloc_state = sa_none; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; @@ -1935,6 +1989,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct sched_domain_topology_level *tl_asym; bool has_asym = false; + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; @@ -1954,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att has_asym = true; } + if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) + goto error; + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); if (tl == sched_domain_topology) @@ -2005,7 +2065,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att rcu_read_unlock(); if (has_asym) - static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + static_branch_inc_cpuslocked(&sched_asym_cpucapacity); if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", @@ -2100,8 +2160,12 @@ int sched_init_domains(const struct cpumask *cpu_map) */ static void detach_destroy_domains(const struct cpumask *cpu_map) { + unsigned int cpu = cpumask_any(cpu_map); int i; + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) + static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); @@ -2149,16 +2213,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. * - * Call with hotplug lock held + * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) +void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; int i, j, n; int new_topology; - mutex_lock(&sched_domains_mutex); + lockdep_assert_held(&sched_domains_mutex); /* Always unregister in case we don't destroy any domains: */ unregister_sched_domain_sysctl(); @@ -2183,8 +2247,19 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) + dattrs_equal(dattr_cur, i, dattr_new, j)) { + struct root_domain *rd; + + /* + * This domain won't be destroyed and as such + * its dl_bw->total_bw needs to be cleared. It + * will be recomputed in function + * update_tasks_root_domain(). 
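
Aside: switching sched_asym_cpucapacity from static_branch_enable_cpuslocked() to the inc/dec pair gives the key reference-count semantics: it stays enabled while at least one root domain still needs it and is dropped only when the last user detaches. A plain-counter sketch of that behaviour (not the static-key implementation):

struct refkey {
        int count;
};

static void refkey_inc(struct refkey *k)      { k->count++; }
static void refkey_dec(struct refkey *k)      { if (k->count > 0) k->count--; }
static int  refkey_enabled(const struct refkey *k) { return k->count > 0; }
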
+ */ + rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; + dl_clear_root_domain(rd); goto match1; + } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); @@ -2241,6 +2316,15 @@ match3: ndoms_cur = ndoms_new; register_sched_domain_sysctl(); +} +/* + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); mutex_unlock(&sched_domains_mutex); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c1e566a114ca..ba059fbfc53a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule @@ -183,26 +182,44 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * accessing the task state. */ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, void *key) + void *key) { - int wake_flags = 1; /* XXX WF_SYNC */ - if (unlikely(!wq_head)) return; - if (unlikely(nr_exclusive != 1)) - wake_flags = 0; - - __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key); + __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); +/** + * __wake_up_locked_sync_key - wake up a thread blocked on a locked waitqueue. + * @wq_head: the waitqueue + * @mode: which threads + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. + */ +void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, + unsigned int mode, void *key) +{ + __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key); + /* * __wake_up_sync - see __wake_up_sync_key() */ -void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) { - __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); + __wake_up_sync_key(wq_head, mode, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 45eba18a2898..02ce292b9bc0 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int .bit_nr = -1, }, .wq_entry = { + .flags = flags, .private = current, .func = var_wake_function, .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), |
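
Aside: partition_sched_domains() above becomes a thin wrapper that takes sched_domains_mutex and calls the new _locked variant, so callers that already hold the mutex can invoke the _locked function directly. A sketch of that wrapper pattern with pthreads and illustrative names:

#include <pthread.h>

static pthread_mutex_t domains_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Caller must hold domains_mutex. */
static void repartition_locked(int ndoms)
{
        (void)ndoms;
        /* ... rebuild the domain sets ... */
}

/* Plain entry point: take the lock, do the work, drop the lock. */
static void repartition(int ndoms)
{
        pthread_mutex_lock(&domains_mutex);
        repartition_locked(ndoms);
        pthread_mutex_unlock(&domains_mutex);
}
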