diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/autogroup.c | 5 | ||||
-rw-r--r-- | kernel/sched/core.c | 152 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 93 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 143 | ||||
-rw-r--r-- | kernel/sched/fair.c | 144 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 177 | ||||
-rw-r--r-- | kernel/sched/rt.c | 31 | ||||
-rw-r--r-- | kernel/sched/sched.h | 114 | ||||
-rw-r--r-- | kernel/sched/stats.h | 6 | ||||
-rw-r--r-- | kernel/sched/topology.c | 13 |
10 files changed, 617 insertions, 261 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index a43df5193538..bb4b9fe026a1 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,13 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/kallsyms.h> #include <linux/utsname.h> #include <linux/security.h> #include <linux/export.h> +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 644fa2e3d993..bf724c1952ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -508,7 +508,8 @@ void resched_cpu(int cpu) unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); - resched_curr(rq); + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1629,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { - schedstat_inc(rq->ttwu_local); - schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(rq->ttwu_local); + __schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd->ttwu_wake_remote); + __schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1646,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq->ttwu_count); - schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(rq->ttwu_count); + __schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->se.statistics.nr_wakeups_sync); } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2045,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * - * Pairs with the smp_store_release() in finish_lock_switch(). + * Pairs with the smp_store_release() in finish_task(). * * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. @@ -2056,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->state = TASK_WAKING; if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #else /* CONFIG_SMP */ if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2122,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) if (!task_on_rq_queued(p)) { if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&rq->nr_iowait); } ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); @@ -2460,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p) * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. */ + p->recent_used_cpu = task_cpu(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); @@ -2571,6 +2573,50 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, #endif /* CONFIG_PREEMPT_NOTIFIERS */ +static inline void prepare_task(struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * Claim the task as running, we do this before switching to it + * such that any running task will have this set. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_task(struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -2591,7 +2637,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); + prepare_task(next); prepare_arch_switch(next); } @@ -2646,29 +2692,34 @@ static struct rq *finish_task_switch(struct task_struct *prev) * the scheduled task must drop that reference. * * We must observe prev->state before clearing prev->on_cpu (in - * finish_lock_switch), otherwise a concurrent wakeup can get prev + * finish_task), otherwise a concurrent wakeup can get prev * running on another CPU and we could rave with its RUNNING -> DEAD * transition, resulting in a double drop. */ prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); - /* - * The membarrier system call requires a full memory barrier - * after storing to rq->curr, before going back to user-space. - * - * TODO: This smp_mb__after_unlock_lock can go away if PPC end - * up adding a full barrier to switch_mm(), or we should figure - * out if a smp_mb__after_unlock_lock is really the proper API - * to use. - */ - smp_mb__after_unlock_lock(); - finish_lock_switch(rq, prev); + finish_task(prev); + finish_lock_switch(rq); finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); - if (mm) + /* + * When switching through a kernel thread, the loop in + * membarrier_{private,global}_expedited() may have observed that + * kernel thread and not issued an IPI. It is therefore possible to + * schedule between user->kernel->user threads without passing though + * switch_mm(). Membarrier requires a barrier after storing to + * rq->curr, before returning to userspace, so provide them here: + * + * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly + * provided by mmdrop(), + * - a sync_core for SYNC_CORE. + */ + if (mm) { + membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); + } if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -2772,6 +2823,13 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); + /* + * If mm is non-NULL, we pass through switch_mm(). If mm is + * NULL, we will pass through mmdrop() in finish_task_switch(). + * Both of these contain the full memory barrier required by + * membarrier after storing to rq->curr, before returning to + * user-space. + */ if (!mm) { next->active_mm = oldmm; mmgrab(oldmm); @@ -3308,6 +3366,9 @@ static void __sched notrace __schedule(bool preempt) * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). + * + * The membarrier system call requires a full memory barrier + * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); smp_mb__after_spinlock(); @@ -3355,17 +3416,16 @@ static void __sched notrace __schedule(bool preempt) /* * The membarrier system call requires each architecture * to have a full memory barrier after updating - * rq->curr, before returning to user-space. For TSO - * (e.g. x86), the architecture must provide its own - * barrier in switch_mm(). For weakly ordered machines - * for which spin_unlock() acts as a full memory - * barrier, finish_lock_switch() in common code takes - * care of this barrier. For weakly ordered machines for - * which spin_unlock() acts as a RELEASE barrier (only - * arm64 and PowerPC), arm64 has a full barrier in - * switch_to(), and PowerPC has - * smp_mb__after_unlock_lock() before - * finish_lock_switch(). + * rq->curr, before returning to user-space. + * + * Here are the schemes providing that barrier on the + * various architectures: + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. + * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - finish_lock_switch() for weakly-ordered + * architectures where spin_unlock is a full barrier, + * - switch_to() for arm64 (weakly-ordered, spin_unlock + * is a RELEASE barrier), */ ++*switch_count; @@ -4040,8 +4100,7 @@ recheck: return -EINVAL; } - if (attr->sched_flags & - ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) + if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) return -EINVAL; /* @@ -4108,6 +4167,9 @@ recheck: } if (user) { + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return -EINVAL; + retval = security_task_setscheduler(p); if (retval) return retval; @@ -4163,7 +4225,8 @@ change: } #endif #ifdef CONFIG_SMP - if (dl_bandwidth_enabled() && dl_policy(policy)) { + if (dl_bandwidth_enabled() && dl_policy(policy) && + !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; /* @@ -4293,6 +4356,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr) } EXPORT_SYMBOL_GPL(sched_setattr); +int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, false, true); +} + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -4799,7 +4867,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); + unsigned int retlen = min(len, cpumask_size()); if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d6717a3331a1..dd062a1c8cf0 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -60,7 +60,8 @@ struct sugov_cpu { u64 last_update; /* The fields below are only needed when sharing a policy. */ - unsigned long util; + unsigned long util_cfs; + unsigned long util_dl; unsigned long max; unsigned int flags; @@ -176,21 +177,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long cfs_max; + struct rq *rq = cpu_rq(sg_cpu->cpu); - cfs_max = arch_scale_cpu_capacity(NULL, cpu); + sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->util_cfs = cpu_util_cfs(rq); + sg_cpu->util_dl = cpu_util_dl(rq); +} - *util = min(rq->cfs.avg.util_avg, cfs_max); - *max = cfs_max; +static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) +{ + /* + * Ideally we would like to set util_dl as min/guaranteed freq and + * util_cfs + util_dl as requested freq. However, cpufreq is not yet + * ready for such an interface. So, we only do the latter for now. + */ + return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, - unsigned int flags) +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) { - if (flags & SCHED_CPUFREQ_IOWAIT) { + if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -264,7 +272,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (!sugov_should_update_freq(sg_policy, time)) @@ -272,10 +280,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - if (flags & SCHED_CPUFREQ_RT_DL) { + if (flags & SCHED_CPUFREQ_RT) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max, sg_cpu->cpu); + sugov_get_util(sg_cpu); + max = sg_cpu->max; + util = sugov_aggregate_util(sg_cpu); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -305,23 +315,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) s64 delta_ns; /* - * If the CPU utilization was last updated before the previous - * frequency update and the time elapsed between the last update - * of the CPU utilization and the last frequency update is long - * enough, don't take the CPU into account as it probably is - * idle now (and clear iowait_boost for it). + * If the CFS CPU utilization was last updated before the + * previous frequency update and the time elapsed between the + * last update of the CPU utilization and the last frequency + * update is long enough, reset iowait_boost and util_cfs, as + * they are now probably stale. However, still consider the + * CPU contribution if it has some DEADLINE utilization + * (util_dl). */ delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; - continue; + j_sg_cpu->util_cfs = 0; + if (j_sg_cpu->util_dl == 0) + continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) + if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) return policy->cpuinfo.max_freq; - j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; + j_util = sugov_aggregate_util(j_sg_cpu); if (j_util * max > j_max * util) { util = j_util; max = j_max; @@ -338,22 +352,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max, sg_cpu->cpu); - raw_spin_lock(&sg_policy->update_lock); - sg_cpu->util = util; - sg_cpu->max = max; + sugov_get_util(sg_cpu); sg_cpu->flags = flags; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - if (flags & SCHED_CPUFREQ_RT_DL) + if (flags & SCHED_CPUFREQ_RT) next_f = sg_policy->policy->cpuinfo.max_freq; else next_f = sugov_next_freq_shared(sg_cpu, time); @@ -383,9 +393,9 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); /* - * For RT and deadline tasks, the schedutil governor shoots the - * frequency to maximum. Special care must be taken to ensure that this - * kthread doesn't result in the same behavior. + * For RT tasks, the schedutil governor shoots the frequency to maximum. + * Special care must be taken to ensure that this kthread doesn't result + * in the same behavior. * * This is (mostly) guaranteed by the work_in_progress flag. The flag is * updated only at the end of the sugov_work() function and before that @@ -470,7 +480,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) static int sugov_kthread_create(struct sugov_policy *sg_policy) { struct task_struct *thread; - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, + /* + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; struct cpufreq_policy *policy = sg_policy->policy; int ret; @@ -488,10 +511,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) return PTR_ERR(thread); } - ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + ret = sched_setattr_nocheck(thread, &attr); if (ret) { kthread_stop(thread); - pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); return ret; } @@ -655,7 +678,7 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->flags = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2473736c7616..9bb0e0c412ec 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i) #endif static inline -void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline -void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline -void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) } static inline -void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); } +static inline +void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_running_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_running_bw(dl_se->dl_bw, dl_rq); +} + void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; + BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + if (task_on_rq_queued(p)) return; rq = task_rq(p); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); - add_rq_bw(new_bw, &rq->dl); + __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __add_rq_bw(new_bw, &rq->dl); } /* @@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p) if (dl_se->dl_runtime == 0) return; + if (dl_entity_is_special(dl_se)) + return; + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); @@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p) */ if (zerolag_time < 0) { if (dl_task(p)) - sub_running_bw(dl_se->dl_bw, dl_rq); + sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); @@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) return; if (flags & ENQUEUE_MIGRATED) - add_rq_bw(dl_se->dl_bw, dl_rq); + add_rq_bw(dl_se, dl_rq); if (dl_se->dl_non_contending) { dl_se->dl_non_contending = 0; @@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * when the "inactive timer" fired). * So, add it back. */ - add_running_bw(dl_se->dl_bw, dl_rq); + add_running_bw(dl_se, dl_rq); } } @@ -1114,7 +1151,8 @@ static void update_curr_dl(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec; + u64 delta_exec, scaled_delta_exec; + int cpu = cpu_of(rq); if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1134,9 +1172,6 @@ static void update_curr_dl(struct rq *rq) return; } - /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, SCHED_CPUFREQ_DL); - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1148,13 +1183,39 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) - delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); - dl_se->runtime -= delta_exec; + if (dl_entity_is_special(dl_se)) + return; + + /* + * For tasks that participate in GRUB, we implement GRUB-PA: the + * spare reclaimed bandwidth is used to clock down frequency. + * + * For the others, we still need to scale reservation parameters + * according to current frequency and CPU maximum capacity. + */ + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { + scaled_delta_exec = grub_reclaim(delta_exec, + rq, + &curr->dl); + } else { + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + scaled_delta_exec = cap_scale(delta_exec, scale_freq); + scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); + } + + dl_se->runtime -= scaled_delta_exec; throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; + + /* If requested, inform the user about runtime overruns. */ + if (dl_runtime_exceeded(dl_se) && + (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) + dl_se->dl_overrun = 1; + __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); @@ -1204,8 +1265,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD && dl_se->dl_non_contending) { - sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); - sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); + sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; } @@ -1222,7 +1283,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); - sub_running_bw(dl_se->dl_bw, &rq->dl); + sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: task_rq_unlock(rq, p, &rf); @@ -1416,8 +1477,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) dl_check_constrained_dl(&p->dl); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(p->dl.dl_bw, &rq->dl); - add_running_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); + add_running_bw(&p->dl, &rq->dl); } /* @@ -1457,8 +1518,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) __dequeue_task_dl(rq, p, flags); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(p->dl.dl_bw, &rq->dl); - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); } /* @@ -1564,7 +1625,7 @@ static void migrate_task_rq_dl(struct task_struct *p) */ raw_spin_lock(&rq->lock); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -1576,7 +1637,7 @@ static void migrate_task_rq_dl(struct task_struct *p) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_unlock(&rq->lock); } @@ -2019,11 +2080,11 @@ retry: } deactivate_task(rq, next_task, 0); - sub_running_bw(next_task->dl.dl_bw, &rq->dl); - sub_rq_bw(next_task->dl.dl_bw, &rq->dl); + sub_running_bw(&next_task->dl, &rq->dl); + sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); - add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); - add_running_bw(next_task->dl.dl_bw, &later_rq->dl); + add_rq_bw(&next_task->dl, &later_rq->dl); + add_running_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -2111,11 +2172,11 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); - sub_running_bw(p->dl.dl_bw, &src_rq->dl); - sub_rq_bw(p->dl.dl_bw, &src_rq->dl); + sub_running_bw(&p->dl, &src_rq->dl); + sub_rq_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); - add_rq_bw(p->dl.dl_bw, &this_rq->dl); - add_running_bw(p->dl.dl_bw, &this_rq->dl); + add_rq_bw(&p->dl, &this_rq->dl); + add_running_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -2224,7 +2285,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) task_non_contending(p); if (!task_on_rq_queued(p)) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); /* * We cannot use inactive_task_timer() to invoke sub_running_bw() @@ -2256,7 +2317,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { - add_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); return; } @@ -2435,6 +2496,9 @@ int sched_dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return 0; + /* !deadline task may carry old deadline bandwidth */ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; @@ -2521,6 +2585,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + /* special dl tasks don't actually use any parameter */ + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return true; + /* deadline != 0 */ if (attr->sched_deadline == 0) return false; @@ -2566,6 +2634,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; + dl_se->dl_overrun = 0; } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2fe3aa853e4d..5eb3ffc9be84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) likely(wait_start > prev_wait_start)) wait_start -= prev_wait_start; - schedstat_set(se->statistics.wait_start, wait_start); + __schedstat_set(se->statistics.wait_start, wait_start); } static inline void @@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. */ - schedstat_set(se->statistics.wait_start, delta); + __schedstat_set(se->statistics.wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - schedstat_set(se->statistics.wait_max, + __schedstat_set(se->statistics.wait_max, max(schedstat_val(se->statistics.wait_max), delta)); - schedstat_inc(se->statistics.wait_count); - schedstat_add(se->statistics.wait_sum, delta); - schedstat_set(se->statistics.wait_start, 0); + __schedstat_inc(se->statistics.wait_count); + __schedstat_add(se->statistics.wait_sum, delta); + __schedstat_set(se->statistics.wait_start, 0); } static inline void @@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta = 0; if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - schedstat_set(se->statistics.sleep_max, delta); + __schedstat_set(se->statistics.sleep_max, delta); - schedstat_set(se->statistics.sleep_start, 0); - schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(se->statistics.sleep_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); @@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta = 0; if (unlikely(delta > schedstat_val(se->statistics.block_max))) - schedstat_set(se->statistics.block_max, delta); + __schedstat_set(se->statistics.block_max, delta); - schedstat_set(se->statistics.block_start, 0); - schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(se->statistics.block_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { if (tsk->in_iowait) { - schedstat_add(se->statistics.iowait_sum, delta); - schedstat_inc(se->statistics.iowait_count); + __schedstat_add(se->statistics.iowait_sum, delta); + __schedstat_inc(se->statistics.iowait_count); trace_sched_stat_iowait(tsk, delta); } @@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - schedstat_set(se->statistics.sleep_start, + __schedstat_set(se->statistics.sleep_start, rq_clock(rq_of(cfs_rq))); if (tsk->state & TASK_UNINTERRUPTIBLE) - schedstat_set(se->statistics.block_start, + __schedstat_set(se->statistics.block_start, rq_clock(rq_of(cfs_rq))); } } @@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. + * a real problem. * * It will not get called when we go idle, because the idle * thread is a different class (!fair), nor will the utilization @@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 @@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_freq = arch_scale_freq_capacity(cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); delta += sa->period_contrib; @@ -4365,12 +4361,12 @@ static inline bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) { - static_key_slow_inc(&__cfs_bandwidth_used); + static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); } void cfs_bandwidth_usage_dec(void) { - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -5689,28 +5685,38 @@ static int wake_wide(struct task_struct *p) * soonest. For the purpose of speed we only consider the waking and previous * CPU. * - * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or - * will be) idle. + * wake_affine_idle() - only considers 'now', it check if the waking CPU is + * cache-affine and is (or will be) idle. * * wake_affine_weight() - considers the weight to reflect the average * scheduling latency of the CPUs. This seems to work * for the overloaded case. */ - -static bool -wake_affine_idle(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int prev_cpu, int sync) +static int +wake_affine_idle(int this_cpu, int prev_cpu, int sync) { - if (idle_cpu(this_cpu)) - return true; + /* + * If this_cpu is idle, it implies the wakeup is from interrupt + * context. Only allow the move if cache is shared. Otherwise an + * interrupt intensive workload could force all tasks onto one + * node depending on the IO topology or IRQ affinity settings. + * + * If the prev_cpu is idle and cache affine then avoid a migration. + * There is no guarantee that the cache hot data from an interrupt + * is more important than cache hot data on the prev_cpu and from + * a cpufreq perspective, it's better to have higher utilisation + * on one CPU. + */ + if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) + return idle_cpu(prev_cpu) ? prev_cpu : this_cpu; if (sync && cpu_rq(this_cpu)->nr_running == 1) - return true; + return this_cpu; - return false; + return nr_cpumask_bits; } -static bool +static int wake_affine_weight(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5724,7 +5730,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, unsigned long current_load = task_h_load(current); if (current_load > this_eff_load) - return true; + return this_cpu; this_eff_load -= current_load; } @@ -5741,36 +5747,36 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; prev_eff_load *= capacity_of(this_cpu); - return this_eff_load <= prev_eff_load; + return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine = false; + int target = nr_cpumask_bits; - if (sched_feat(WA_IDLE) && !affine) - affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_IDLE)) + target = wake_affine_idle(this_cpu, prev_cpu, sync); - if (sched_feat(WA_WEIGHT) && !affine) - affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) + target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); - if (affine) { - schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); - } + if (target == nr_cpumask_bits) + return prev_cpu; - return affine; + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); + return target; } -static inline int task_util(struct task_struct *p); -static int cpu_util_wake(int cpu, struct task_struct *p); +static inline unsigned long task_util(struct task_struct *p); +static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { - return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); + return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0); } /* @@ -5950,7 +5956,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(cpu_rq(i)); - if (load < min_load || (load == min_load && i == this_cpu)) { + if (load < min_load) { min_load = load; least_loaded_cpu = i; } @@ -6191,7 +6197,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - int i; + int i, recent_used_cpu; if (idle_cpu(target)) return target; @@ -6202,6 +6208,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) return prev; + /* Check a recently used CPU as a potential idle candidate */ + recent_used_cpu = p->recent_used_cpu; + if (recent_used_cpu != prev && + recent_used_cpu != target && + cpus_share_cache(recent_used_cpu, target) && + idle_cpu(recent_used_cpu) && + cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { + /* + * Replace recent_used_cpu with prev as it is a potential + * candidate for the next wake. + */ + p->recent_used_cpu = prev; + return recent_used_cpu; + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; @@ -6247,7 +6268,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static int cpu_util(int cpu) +static unsigned long cpu_util(int cpu) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); @@ -6255,7 +6276,7 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } -static inline int task_util(struct task_struct *p) +static inline unsigned long task_util(struct task_struct *p) { return p->se.avg.util_avg; } @@ -6264,7 +6285,7 @@ static inline int task_util(struct task_struct *p) * cpu_util_wake: Compute cpu utilization with any contributions from * the waking task p removed. */ -static int cpu_util_wake(int cpu, struct task_struct *p) +static unsigned long cpu_util_wake(int cpu, struct task_struct *p) { unsigned long util, capacity; @@ -6355,8 +6376,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (cpu == prev_cpu) goto pick_cpu; - if (wake_affine(affine_sd, p, prev_cpu, sync)) - new_cpu = cpu; + new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); } if (sd && !(sd_flag & SD_BALANCE_FORK)) { @@ -6370,9 +6390,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (!sd) { pick_cpu: - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + if (want_affine) + current->recent_used_cpu = cpu; + } } else { new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } @@ -6449,8 +6472,7 @@ static void task_dead_fair(struct task_struct *p) } #endif /* CONFIG_SMP */ -static unsigned long -wakeup_gran(struct sched_entity *curr, struct sched_entity *se) +static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -6492,7 +6514,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) if (vdiff <= 0) return -1; - gran = wakeup_gran(curr, se); + gran = wakeup_gran(se); if (vdiff > gran) return 1; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 9bcbacba82a8..5d0762633639 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -26,24 +26,110 @@ * Bitmask made from a "or" of all commands within enum membarrier_cmd, * except MEMBARRIER_CMD_QUERY. */ +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) +#else +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ - (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) + (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. */ } -static int membarrier_private_expedited(void) +static int membarrier_global_expedited(void) { int cpu; bool fallback = false; cpumask_var_t tmpmask; - if (!(atomic_read(¤t->mm->membarrier_state) - & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) - return -EPERM; + if (num_online_cpus() == 1) + return 0; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ + return 0; +} + +static int membarrier_private_expedited(int flags) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) + return -EINVAL; + if (!(atomic_read(¤t->mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) + return -EPERM; + } else { + if (!(atomic_read(¤t->mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) + return -EPERM; + } if (num_online_cpus() == 1) return 0; @@ -105,21 +191,69 @@ static int membarrier_private_expedited(void) return 0; } -static void membarrier_register_private_expedited(void) +static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + if (atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) + return 0; + atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); + if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { + /* + * For single mm user, single threaded process, we can + * simply issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that + * no memory access following registration is reordered + * before registration. + */ + smp_mb(); + } else { + /* + * For multi-mm user threads, we need to ensure all + * future scheduler executions will observe the new + * thread flag state for this mm. + */ + synchronize_sched(); + } + atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, + &mm->membarrier_state); + return 0; +} + +static int membarrier_register_private_expedited(int flags) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + + if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) + return -EINVAL; + state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } + /* * We need to consider threads belonging to different thread * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if (atomic_read(&mm->membarrier_state) - & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) - return; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, - &mm->membarrier_state); + if (atomic_read(&mm->membarrier_state) & state) + return 0; + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); + if (flags & MEMBARRIER_FLAG_SYNC_CORE) + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, + &mm->membarrier_state); + if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { + /* + * Ensure all future scheduler executions will observe the + * new thread flag state for this process. + */ + synchronize_sched(); + } + atomic_or(state, &mm->membarrier_state); + return 0; } /** @@ -159,21 +293,28 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) int cmd_mask = MEMBARRIER_CMD_BITMASK; if (tick_nohz_full_enabled()) - cmd_mask &= ~MEMBARRIER_CMD_SHARED; + cmd_mask &= ~MEMBARRIER_CMD_GLOBAL; return cmd_mask; } - case MEMBARRIER_CMD_SHARED: - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + case MEMBARRIER_CMD_GLOBAL: + /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */ if (tick_nohz_full_enabled()) return -EINVAL; if (num_online_cpus() > 1) synchronize_sched(); return 0; + case MEMBARRIER_CMD_GLOBAL_EXPEDITED: + return membarrier_global_expedited(); + case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: + return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(); + return membarrier_private_expedited(0); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: - membarrier_register_private_expedited(); - return 0; + return membarrier_register_private_expedited(0); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); default: return -EINVAL; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 665ace2fc558..663b2355a3aa 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; + u64 now = rq_clock_task(rq); u64 delta_exec; if (curr->sched_class != &rt_sched_class) return; - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; @@ -968,7 +969,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq_clock_task(rq); + curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1907,9 +1908,8 @@ static void push_rt_tasks(struct rq *rq) * the rt_loop_next will cause the iterator to perform another scan. * */ -static int rto_next_cpu(struct rq *rq) +static int rto_next_cpu(struct root_domain *rd) { - struct root_domain *rd = rq->rd; int next; int cpu; @@ -1985,19 +1985,24 @@ static void tell_cpu_to_push(struct rq *rq) * Otherwise it is finishing up and an ipi needs to be sent. */ if (rq->rd->rto_cpu < 0) - cpu = rto_next_cpu(rq); + cpu = rto_next_cpu(rq->rd); raw_spin_unlock(&rq->rd->rto_lock); rto_start_unlock(&rq->rd->rto_loop_start); - if (cpu >= 0) + if (cpu >= 0) { + /* Make sure the rd does not get freed while pushing */ + sched_get_rd(rq->rd); irq_work_queue_on(&rq->rd->rto_push_work, cpu); + } } /* Called from hardirq context */ void rto_push_irq_work_func(struct irq_work *work) { + struct root_domain *rd = + container_of(work, struct root_domain, rto_push_work); struct rq *rq; int cpu; @@ -2013,18 +2018,20 @@ void rto_push_irq_work_func(struct irq_work *work) raw_spin_unlock(&rq->lock); } - raw_spin_lock(&rq->rd->rto_lock); + raw_spin_lock(&rd->rto_lock); /* Pass the IPI to the next rt overloaded queue */ - cpu = rto_next_cpu(rq); + cpu = rto_next_cpu(rd); - raw_spin_unlock(&rq->rd->rto_lock); + raw_spin_unlock(&rd->rto_lock); - if (cpu < 0) + if (cpu < 0) { + sched_put_rd(rd); return; + } /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rq->rd->rto_push_work, cpu); + irq_work_queue_on(&rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ @@ -2212,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b19552a212de..fb5fc458547f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. + * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV 0x10000000 + +static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else + return false; +#endif +} + /* * Tells if entity @a should preempt entity @b. */ static inline bool dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) { - return dl_time_before(a->deadline, b->deadline); + return dl_entity_is_special(a) || + dl_time_before(a->deadline, b->deadline); } /* @@ -665,6 +691,8 @@ extern struct mutex sched_domains_mutex; extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); @@ -1328,47 +1356,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - * - * In particular, the load of prev->state in finish_task_switch() must - * happen before this. - * - * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). - */ - smp_store_release(&prev->on_cpu, 0); -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - raw_spin_unlock_irq(&rq->lock); -} - /* * wake flags */ @@ -1687,17 +1674,17 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); - #ifndef arch_scale_freq_capacity static __always_inline -unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +unsigned long arch_scale_freq_capacity(int cpu) { return SCHED_CAPACITY_SCALE; } #endif +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); + #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1711,10 +1698,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); + rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); sched_avg_update(rq); } #else +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif @@ -2096,14 +2090,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); * The way cpufreq is currently arranged requires it to evaluate the CPU * performance state (frequency/voltage) on a regular basis to prevent it from * being stuck in a completely inadequate performance level for too long. - * That is not guaranteed to happen if the updates are only triggered from CFS, - * though, because they may not be coming in if RT or deadline tasks are active - * all the time (or there are RT and DL tasks only). + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only). * - * As a workaround for that issue, this function is called by the RT and DL - * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling, * but that really is a band-aid. Going forward it should be replaced with - * solutions targeted more specifically at RT and DL tasks. + * solutions targeted more specifically at RT tasks. */ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { @@ -2125,3 +2119,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + return rq->cfs.avg.util_avg; +} + +#endif diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index baf500d12b7c..8e7b58de61e7 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -31,8 +31,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) rq->rq_sched_info.run_delay += delta; } #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define __schedstat_inc(var) do { var++; } while (0) #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define __schedstat_add(var, amt) do { var += (amt); } while (0) #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) @@ -48,8 +51,11 @@ static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} #define schedstat_enabled() 0 +#define __schedstat_inc(var) do { } while (0) #define schedstat_inc(var) do { } while (0) +#define __schedstat_add(var, amt) do { } while (0) #define schedstat_add(var, amt) do { } while (0) +#define __schedstat_set(var, val) do { } while (0) #define schedstat_set(var, val) do { } while (0) #define schedstat_val(var) 0 #define schedstat_val_or_zero(var) 0 diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 034cbed7f88b..519b024f4e94 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -259,6 +259,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); } +void sched_get_rd(struct root_domain *rd) +{ + atomic_inc(&rd->refcount); +} + +void sched_put_rd(struct root_domain *rd) +{ + if (!atomic_dec_and_test(&rd->refcount)) + return; + + call_rcu_sched(&rd->rcu, free_rootdomain); +} + static int init_rootdomain(struct root_domain *rd) { if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) |