summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/autogroup.c5
-rw-r--r--kernel/sched/core.c152
-rw-r--r--kernel/sched/cpufreq_schedutil.c93
-rw-r--r--kernel/sched/deadline.c143
-rw-r--r--kernel/sched/fair.c144
-rw-r--r--kernel/sched/membarrier.c177
-rw-r--r--kernel/sched/rt.c31
-rw-r--r--kernel/sched/sched.h114
-rw-r--r--kernel/sched/stats.h6
-rw-r--r--kernel/sched/topology.c13
10 files changed, 617 insertions, 261 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index a43df5193538..bb4b9fe026a1 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,13 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
-#include "sched.h"
-
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
#include <linux/utsname.h>
#include <linux/security.h>
#include <linux/export.h>
+#include "sched.h"
+
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 644fa2e3d993..bf724c1952ea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -508,7 +508,8 @@ void resched_cpu(int cpu)
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
- resched_curr(rq);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1629,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
- schedstat_inc(rq->ttwu_local);
- schedstat_inc(p->se.statistics.nr_wakeups_local);
+ __schedstat_inc(rq->ttwu_local);
+ __schedstat_inc(p->se.statistics.nr_wakeups_local);
} else {
struct sched_domain *sd;
- schedstat_inc(p->se.statistics.nr_wakeups_remote);
+ __schedstat_inc(p->se.statistics.nr_wakeups_remote);
rcu_read_lock();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd->ttwu_wake_remote);
+ __schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
@@ -1646,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
if (wake_flags & WF_MIGRATED)
- schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+ __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
- schedstat_inc(rq->ttwu_count);
- schedstat_inc(p->se.statistics.nr_wakeups);
+ __schedstat_inc(rq->ttwu_count);
+ __schedstat_inc(p->se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p->se.statistics.nr_wakeups_sync);
+ __schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -2045,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
- * Pairs with the smp_store_release() in finish_lock_switch().
+ * Pairs with the smp_store_release() in finish_task().
*
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
@@ -2056,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->state = TASK_WAKING;
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#else /* CONFIG_SMP */
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2122,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
if (!task_on_rq_queued(p)) {
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&rq->nr_iowait);
}
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
@@ -2460,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p)
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
+ p->recent_used_cpu = task_cpu(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
@@ -2571,6 +2573,50 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
#endif /* CONFIG_PREEMPT_NOTIFIERS */
+static inline void prepare_task(struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Claim the task as running, we do this before switching to it
+ * such that any running task will have this set.
+ */
+ next->on_cpu = 1;
+#endif
+}
+
+static inline void finish_task(struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ *
+ * In particular, the load of prev->state in finish_task_switch() must
+ * happen before this.
+ *
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+ */
+ smp_store_release(&prev->on_cpu, 0);
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
+ raw_spin_unlock_irq(&rq->lock);
+}
+
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
@@ -2591,7 +2637,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
- prepare_lock_switch(rq, next);
+ prepare_task(next);
prepare_arch_switch(next);
}
@@ -2646,29 +2692,34 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* the scheduled task must drop that reference.
*
* We must observe prev->state before clearing prev->on_cpu (in
- * finish_lock_switch), otherwise a concurrent wakeup can get prev
+ * finish_task), otherwise a concurrent wakeup can get prev
* running on another CPU and we could rave with its RUNNING -> DEAD
* transition, resulting in a double drop.
*/
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
- /*
- * The membarrier system call requires a full memory barrier
- * after storing to rq->curr, before going back to user-space.
- *
- * TODO: This smp_mb__after_unlock_lock can go away if PPC end
- * up adding a full barrier to switch_mm(), or we should figure
- * out if a smp_mb__after_unlock_lock is really the proper API
- * to use.
- */
- smp_mb__after_unlock_lock();
- finish_lock_switch(rq, prev);
+ finish_task(prev);
+ finish_lock_switch(rq);
finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
- if (mm)
+ /*
+ * When switching through a kernel thread, the loop in
+ * membarrier_{private,global}_expedited() may have observed that
+ * kernel thread and not issued an IPI. It is therefore possible to
+ * schedule between user->kernel->user threads without passing though
+ * switch_mm(). Membarrier requires a barrier after storing to
+ * rq->curr, before returning to userspace, so provide them here:
+ *
+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+ * provided by mmdrop(),
+ * - a sync_core for SYNC_CORE.
+ */
+ if (mm) {
+ membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
+ }
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -2772,6 +2823,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
arch_start_context_switch(prev);
+ /*
+ * If mm is non-NULL, we pass through switch_mm(). If mm is
+ * NULL, we will pass through mmdrop() in finish_task_switch().
+ * Both of these contain the full memory barrier required by
+ * membarrier after storing to rq->curr, before returning to
+ * user-space.
+ */
if (!mm) {
next->active_mm = oldmm;
mmgrab(oldmm);
@@ -3308,6 +3366,9 @@ static void __sched notrace __schedule(bool preempt)
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
+ *
+ * The membarrier system call requires a full memory barrier
+ * after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@@ -3355,17 +3416,16 @@ static void __sched notrace __schedule(bool preempt)
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
- * rq->curr, before returning to user-space. For TSO
- * (e.g. x86), the architecture must provide its own
- * barrier in switch_mm(). For weakly ordered machines
- * for which spin_unlock() acts as a full memory
- * barrier, finish_lock_switch() in common code takes
- * care of this barrier. For weakly ordered machines for
- * which spin_unlock() acts as a RELEASE barrier (only
- * arm64 and PowerPC), arm64 has a full barrier in
- * switch_to(), and PowerPC has
- * smp_mb__after_unlock_lock() before
- * finish_lock_switch().
+ * rq->curr, before returning to user-space.
+ *
+ * Here are the schemes providing that barrier on the
+ * various architectures:
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+ * - finish_lock_switch() for weakly-ordered
+ * architectures where spin_unlock is a full barrier,
+ * - switch_to() for arm64 (weakly-ordered, spin_unlock
+ * is a RELEASE barrier),
*/
++*switch_count;
@@ -4040,8 +4100,7 @@ recheck:
return -EINVAL;
}
- if (attr->sched_flags &
- ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
return -EINVAL;
/*
@@ -4108,6 +4167,9 @@ recheck:
}
if (user) {
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return -EINVAL;
+
retval = security_task_setscheduler(p);
if (retval)
return retval;
@@ -4163,7 +4225,8 @@ change:
}
#endif
#ifdef CONFIG_SMP
- if (dl_bandwidth_enabled() && dl_policy(policy)) {
+ if (dl_bandwidth_enabled() && dl_policy(policy) &&
+ !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
/*
@@ -4293,6 +4356,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
}
EXPORT_SYMBOL_GPL(sched_setattr);
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, false, true);
+}
+
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
* @p: the task in question.
@@ -4799,7 +4867,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- size_t retlen = min_t(size_t, len, cpumask_size());
+ unsigned int retlen = min(len, cpumask_size());
if (copy_to_user(user_mask_ptr, mask, retlen))
ret = -EFAULT;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index d6717a3331a1..dd062a1c8cf0 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -60,7 +60,8 @@ struct sugov_cpu {
u64 last_update;
/* The fields below are only needed when sharing a policy. */
- unsigned long util;
+ unsigned long util_cfs;
+ unsigned long util_dl;
unsigned long max;
unsigned int flags;
@@ -176,21 +177,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
+static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long cfs_max;
+ struct rq *rq = cpu_rq(sg_cpu->cpu);
- cfs_max = arch_scale_cpu_capacity(NULL, cpu);
+ sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+ sg_cpu->util_cfs = cpu_util_cfs(rq);
+ sg_cpu->util_dl = cpu_util_dl(rq);
+}
- *util = min(rq->cfs.avg.util_avg, cfs_max);
- *max = cfs_max;
+static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
+{
+ /*
+ * Ideally we would like to set util_dl as min/guaranteed freq and
+ * util_cfs + util_dl as requested freq. However, cpufreq is not yet
+ * ready for such an interface. So, we only do the latter for now.
+ */
+ return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);
}
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
- unsigned int flags)
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)
{
- if (flags & SCHED_CPUFREQ_IOWAIT) {
+ if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {
if (sg_cpu->iowait_boost_pending)
return;
@@ -264,7 +272,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int next_f;
bool busy;
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_set_iowait_boost(sg_cpu, time);
sg_cpu->last_update = time;
if (!sugov_should_update_freq(sg_policy, time))
@@ -272,10 +280,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
busy = sugov_cpu_is_busy(sg_cpu);
- if (flags & SCHED_CPUFREQ_RT_DL) {
+ if (flags & SCHED_CPUFREQ_RT) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max, sg_cpu->cpu);
+ sugov_get_util(sg_cpu);
+ max = sg_cpu->max;
+ util = sugov_aggregate_util(sg_cpu);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
@@ -305,23 +315,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
s64 delta_ns;
/*
- * If the CPU utilization was last updated before the previous
- * frequency update and the time elapsed between the last update
- * of the CPU utilization and the last frequency update is long
- * enough, don't take the CPU into account as it probably is
- * idle now (and clear iowait_boost for it).
+ * If the CFS CPU utilization was last updated before the
+ * previous frequency update and the time elapsed between the
+ * last update of the CPU utilization and the last frequency
+ * update is long enough, reset iowait_boost and util_cfs, as
+ * they are now probably stale. However, still consider the
+ * CPU contribution if it has some DEADLINE utilization
+ * (util_dl).
*/
delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
j_sg_cpu->iowait_boost_pending = false;
- continue;
+ j_sg_cpu->util_cfs = 0;
+ if (j_sg_cpu->util_dl == 0)
+ continue;
}
- if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
+ if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
return policy->cpuinfo.max_freq;
- j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
+ j_util = sugov_aggregate_util(j_sg_cpu);
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
@@ -338,22 +352,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- unsigned long util, max;
unsigned int next_f;
- sugov_get_util(&util, &max, sg_cpu->cpu);
-
raw_spin_lock(&sg_policy->update_lock);
- sg_cpu->util = util;
- sg_cpu->max = max;
+ sugov_get_util(sg_cpu);
sg_cpu->flags = flags;
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_set_iowait_boost(sg_cpu, time);
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- if (flags & SCHED_CPUFREQ_RT_DL)
+ if (flags & SCHED_CPUFREQ_RT)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
next_f = sugov_next_freq_shared(sg_cpu, time);
@@ -383,9 +393,9 @@ static void sugov_irq_work(struct irq_work *irq_work)
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
/*
- * For RT and deadline tasks, the schedutil governor shoots the
- * frequency to maximum. Special care must be taken to ensure that this
- * kthread doesn't result in the same behavior.
+ * For RT tasks, the schedutil governor shoots the frequency to maximum.
+ * Special care must be taken to ensure that this kthread doesn't result
+ * in the same behavior.
*
* This is (mostly) guaranteed by the work_in_progress flag. The flag is
* updated only at the end of the sugov_work() function and before that
@@ -470,7 +480,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy)
static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
struct task_struct *thread;
- struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ .sched_policy = SCHED_DEADLINE,
+ .sched_flags = SCHED_FLAG_SUGOV,
+ .sched_nice = 0,
+ .sched_priority = 0,
+ /*
+ * Fake (unused) bandwidth; workaround to "fix"
+ * priority inheritance.
+ */
+ .sched_runtime = 1000000,
+ .sched_deadline = 10000000,
+ .sched_period = 10000000,
+ };
struct cpufreq_policy *policy = sg_policy->policy;
int ret;
@@ -488,10 +511,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
return PTR_ERR(thread);
}
- ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ ret = sched_setattr_nocheck(thread, &attr);
if (ret) {
kthread_stop(thread);
- pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
return ret;
}
@@ -655,7 +678,7 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
- sg_cpu->flags = SCHED_CPUFREQ_RT;
+ sg_cpu->flags = 0;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2473736c7616..9bb0e0c412ec 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i)
#endif
static inline
-void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
@@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
}
static inline
-void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
@@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
}
static inline
-void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
@@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
}
static inline
-void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
@@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
}
+static inline
+void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_running_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_running_bw(dl_se->dl_bw, dl_rq);
+}
+
void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
struct rq *rq;
+ BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+
if (task_on_rq_queued(p))
return;
rq = task_rq(p);
if (p->dl.dl_non_contending) {
- sub_running_bw(p->dl.dl_bw, &rq->dl);
+ sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
@@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
}
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
- add_rq_bw(new_bw, &rq->dl);
+ __sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ __add_rq_bw(new_bw, &rq->dl);
}
/*
@@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p)
if (dl_se->dl_runtime == 0)
return;
+ if (dl_entity_is_special(dl_se))
+ return;
+
WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
@@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p)
*/
if (zerolag_time < 0) {
if (dl_task(p))
- sub_running_bw(dl_se->dl_bw, dl_rq);
+ sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (p->state == TASK_DEAD)
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
@@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
return;
if (flags & ENQUEUE_MIGRATED)
- add_rq_bw(dl_se->dl_bw, dl_rq);
+ add_rq_bw(dl_se, dl_rq);
if (dl_se->dl_non_contending) {
dl_se->dl_non_contending = 0;
@@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
* when the "inactive timer" fired).
* So, add it back.
*/
- add_running_bw(dl_se->dl_bw, dl_rq);
+ add_running_bw(dl_se, dl_rq);
}
}
@@ -1114,7 +1151,8 @@ static void update_curr_dl(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec;
+ u64 delta_exec, scaled_delta_exec;
+ int cpu = cpu_of(rq);
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
@@ -1134,9 +1172,6 @@ static void update_curr_dl(struct rq *rq)
return;
}
- /* kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
-
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1148,13 +1183,39 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
- if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
- delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
- dl_se->runtime -= delta_exec;
+ if (dl_entity_is_special(dl_se))
+ return;
+
+ /*
+ * For tasks that participate in GRUB, we implement GRUB-PA: the
+ * spare reclaimed bandwidth is used to clock down frequency.
+ *
+ * For the others, we still need to scale reservation parameters
+ * according to current frequency and CPU maximum capacity.
+ */
+ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
+ scaled_delta_exec = grub_reclaim(delta_exec,
+ rq,
+ &curr->dl);
+ } else {
+ unsigned long scale_freq = arch_scale_freq_capacity(cpu);
+ unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ scaled_delta_exec = cap_scale(delta_exec, scale_freq);
+ scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
+ }
+
+ dl_se->runtime -= scaled_delta_exec;
throttle:
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1;
+
+ /* If requested, inform the user about runtime overruns. */
+ if (dl_runtime_exceeded(dl_se) &&
+ (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
+ dl_se->dl_overrun = 1;
+
__dequeue_task_dl(rq, curr, 0);
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
@@ -1204,8 +1265,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
- sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
- sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+ sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
+ sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
dl_se->dl_non_contending = 0;
}
@@ -1222,7 +1283,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
- sub_running_bw(dl_se->dl_bw, &rq->dl);
+ sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0;
unlock:
task_rq_unlock(rq, p, &rf);
@@ -1416,8 +1477,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
dl_check_constrained_dl(&p->dl);
if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
- add_rq_bw(p->dl.dl_bw, &rq->dl);
- add_running_bw(p->dl.dl_bw, &rq->dl);
+ add_rq_bw(&p->dl, &rq->dl);
+ add_running_bw(&p->dl, &rq->dl);
}
/*
@@ -1457,8 +1518,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
__dequeue_task_dl(rq, p, flags);
if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
- sub_running_bw(p->dl.dl_bw, &rq->dl);
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
}
/*
@@ -1564,7 +1625,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
*/
raw_spin_lock(&rq->lock);
if (p->dl.dl_non_contending) {
- sub_running_bw(p->dl.dl_bw, &rq->dl);
+ sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
@@ -1576,7 +1637,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
}
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
raw_spin_unlock(&rq->lock);
}
@@ -2019,11 +2080,11 @@ retry:
}
deactivate_task(rq, next_task, 0);
- sub_running_bw(next_task->dl.dl_bw, &rq->dl);
- sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
+ sub_running_bw(&next_task->dl, &rq->dl);
+ sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
- add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
- add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
+ add_rq_bw(&next_task->dl, &later_rq->dl);
+ add_running_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0);
ret = 1;
@@ -2111,11 +2172,11 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
- sub_running_bw(p->dl.dl_bw, &src_rq->dl);
- sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
+ sub_running_bw(&p->dl, &src_rq->dl);
+ sub_rq_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu);
- add_rq_bw(p->dl.dl_bw, &this_rq->dl);
- add_running_bw(p->dl.dl_bw, &this_rq->dl);
+ add_rq_bw(&p->dl, &this_rq->dl);
+ add_running_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -2224,7 +2285,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
task_non_contending(p);
if (!task_on_rq_queued(p))
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
/*
* We cannot use inactive_task_timer() to invoke sub_running_bw()
@@ -2256,7 +2317,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) {
- add_rq_bw(p->dl.dl_bw, &rq->dl);
+ add_rq_bw(&p->dl, &rq->dl);
return;
}
@@ -2435,6 +2496,9 @@ int sched_dl_overflow(struct task_struct *p, int policy,
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1;
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return 0;
+
/* !deadline task may carry old deadline bandwidth */
if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
return 0;
@@ -2521,6 +2585,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
+ /* special dl tasks don't actually use any parameter */
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return true;
+
/* deadline != 0 */
if (attr->sched_deadline == 0)
return false;
@@ -2566,6 +2634,7 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
+ dl_se->dl_overrun = 0;
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2fe3aa853e4d..5eb3ffc9be84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
likely(wait_start > prev_wait_start))
wait_start -= prev_wait_start;
- schedstat_set(se->statistics.wait_start, wait_start);
+ __schedstat_set(se->statistics.wait_start, wait_start);
}
static inline void
@@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
- schedstat_set(se->statistics.wait_start, delta);
+ __schedstat_set(se->statistics.wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}
- schedstat_set(se->statistics.wait_max,
+ __schedstat_set(se->statistics.wait_max,
max(schedstat_val(se->statistics.wait_max), delta));
- schedstat_inc(se->statistics.wait_count);
- schedstat_add(se->statistics.wait_sum, delta);
- schedstat_set(se->statistics.wait_start, 0);
+ __schedstat_inc(se->statistics.wait_count);
+ __schedstat_add(se->statistics.wait_sum, delta);
+ __schedstat_set(se->statistics.wait_start, 0);
}
static inline void
@@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
- schedstat_set(se->statistics.sleep_max, delta);
+ __schedstat_set(se->statistics.sleep_max, delta);
- schedstat_set(se->statistics.sleep_start, 0);
- schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ __schedstat_set(se->statistics.sleep_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
@@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.block_max)))
- schedstat_set(se->statistics.block_max, delta);
+ __schedstat_set(se->statistics.block_max, delta);
- schedstat_set(se->statistics.block_start, 0);
- schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ __schedstat_set(se->statistics.block_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
if (tsk->in_iowait) {
- schedstat_add(se->statistics.iowait_sum, delta);
- schedstat_inc(se->statistics.iowait_count);
+ __schedstat_add(se->statistics.iowait_sum, delta);
+ __schedstat_inc(se->statistics.iowait_count);
trace_sched_stat_iowait(tsk, delta);
}
@@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- schedstat_set(se->statistics.sleep_start,
+ __schedstat_set(se->statistics.sleep_start,
rq_clock(rq_of(cfs_rq)));
if (tsk->state & TASK_UNINTERRUPTIBLE)
- schedstat_set(se->statistics.block_start,
+ __schedstat_set(se->statistics.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
@@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
+ * a real problem.
*
* It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization
@@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3;
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_freq = arch_scale_freq_capacity(cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
delta += sa->period_contrib;
@@ -4365,12 +4361,12 @@ static inline bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void)
{
- static_key_slow_inc(&__cfs_bandwidth_used);
+ static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
}
void cfs_bandwidth_usage_dec(void)
{
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
@@ -5689,28 +5685,38 @@ static int wake_wide(struct task_struct *p)
* soonest. For the purpose of speed we only consider the waking and previous
* CPU.
*
- * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
- * will be) idle.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is
+ * cache-affine and is (or will be) idle.
*
* wake_affine_weight() - considers the weight to reflect the average
* scheduling latency of the CPUs. This seems to work
* for the overloaded case.
*/
-
-static bool
-wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int prev_cpu, int sync)
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
- if (idle_cpu(this_cpu))
- return true;
+ /*
+ * If this_cpu is idle, it implies the wakeup is from interrupt
+ * context. Only allow the move if cache is shared. Otherwise an
+ * interrupt intensive workload could force all tasks onto one
+ * node depending on the IO topology or IRQ affinity settings.
+ *
+ * If the prev_cpu is idle and cache affine then avoid a migration.
+ * There is no guarantee that the cache hot data from an interrupt
+ * is more important than cache hot data on the prev_cpu and from
+ * a cpufreq perspective, it's better to have higher utilisation
+ * on one CPU.
+ */
+ if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+ return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
- return true;
+ return this_cpu;
- return false;
+ return nr_cpumask_bits;
}
-static bool
+static int
wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
@@ -5724,7 +5730,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
unsigned long current_load = task_h_load(current);
if (current_load > this_eff_load)
- return true;
+ return this_cpu;
this_eff_load -= current_load;
}
@@ -5741,36 +5747,36 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
- return this_eff_load <= prev_eff_load;
+ return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine = false;
+ int target = nr_cpumask_bits;
- if (sched_feat(WA_IDLE) && !affine)
- affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_IDLE))
+ target = wake_affine_idle(this_cpu, prev_cpu, sync);
- if (sched_feat(WA_WEIGHT) && !affine)
- affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
+ target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
- if (affine) {
- schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
- }
+ if (target == nr_cpumask_bits)
+ return prev_cpu;
- return affine;
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ return target;
}
-static inline int task_util(struct task_struct *p);
-static int cpu_util_wake(int cpu, struct task_struct *p);
+static inline unsigned long task_util(struct task_struct *p);
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
{
- return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+ return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
}
/*
@@ -5950,7 +5956,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(cpu_rq(i));
- if (load < min_load || (load == min_load && i == this_cpu)) {
+ if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
}
@@ -6191,7 +6197,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- int i;
+ int i, recent_used_cpu;
if (idle_cpu(target))
return target;
@@ -6202,6 +6208,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
return prev;
+ /* Check a recently used CPU as a potential idle candidate */
+ recent_used_cpu = p->recent_used_cpu;
+ if (recent_used_cpu != prev &&
+ recent_used_cpu != target &&
+ cpus_share_cache(recent_used_cpu, target) &&
+ idle_cpu(recent_used_cpu) &&
+ cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+ /*
+ * Replace recent_used_cpu with prev as it is a potential
+ * candidate for the next wake.
+ */
+ p->recent_used_cpu = prev;
+ return recent_used_cpu;
+ }
+
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -6247,7 +6268,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* capacity_orig) as it useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*/
-static int cpu_util(int cpu)
+static unsigned long cpu_util(int cpu)
{
unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
@@ -6255,7 +6276,7 @@ static int cpu_util(int cpu)
return (util >= capacity) ? capacity : util;
}
-static inline int task_util(struct task_struct *p)
+static inline unsigned long task_util(struct task_struct *p)
{
return p->se.avg.util_avg;
}
@@ -6264,7 +6285,7 @@ static inline int task_util(struct task_struct *p)
* cpu_util_wake: Compute cpu utilization with any contributions from
* the waking task p removed.
*/
-static int cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
{
unsigned long util, capacity;
@@ -6355,8 +6376,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (cpu == prev_cpu)
goto pick_cpu;
- if (wake_affine(affine_sd, p, prev_cpu, sync))
- new_cpu = cpu;
+ new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
}
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6370,9 +6390,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (!sd) {
pick_cpu:
- if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+ if (want_affine)
+ current->recent_used_cpu = cpu;
+ }
} else {
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
@@ -6449,8 +6472,7 @@ static void task_dead_fair(struct task_struct *p)
}
#endif /* CONFIG_SMP */
-static unsigned long
-wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
+static unsigned long wakeup_gran(struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
@@ -6492,7 +6514,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
if (vdiff <= 0)
return -1;
- gran = wakeup_gran(curr, se);
+ gran = wakeup_gran(se);
if (vdiff > gran)
return 1;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 9bcbacba82a8..5d0762633639 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -26,24 +26,110 @@
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
* except MEMBARRIER_CMD_QUERY.
*/
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
+#endif
+
#define MEMBARRIER_CMD_BITMASK \
- (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
- | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
+ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
}
-static int membarrier_private_expedited(void)
+static int membarrier_global_expedited(void)
{
int cpu;
bool fallback = false;
cpumask_var_t tmpmask;
- if (!(atomic_read(&current->mm->membarrier_state)
- & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
- return -EPERM;
+ if (num_online_cpus() == 1)
+ return 0;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+ return 0;
+}
+
+static int membarrier_private_expedited(int flags)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ if (!(atomic_read(&current->mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+ return -EPERM;
+ } else {
+ if (!(atomic_read(&current->mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+ return -EPERM;
+ }
if (num_online_cpus() == 1)
return 0;
@@ -105,21 +191,69 @@ static int membarrier_private_expedited(void)
return 0;
}
-static void membarrier_register_private_expedited(void)
+static int membarrier_register_global_expedited(void)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ if (atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
+ return 0;
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+ if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
+ /*
+ * For single mm user, single threaded process, we can
+ * simply issue a memory barrier after setting
+ * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
+ * no memory access following registration is reordered
+ * before registration.
+ */
+ smp_mb();
+ } else {
+ /*
+ * For multi-mm user threads, we need to ensure all
+ * future scheduler executions will observe the new
+ * thread flag state for this mm.
+ */
+ synchronize_sched();
+ }
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ &mm->membarrier_state);
+ return 0;
+}
+
+static int membarrier_register_private_expedited(int flags)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ }
+
/*
* We need to consider threads belonging to different thread
* groups, which use the same mm. (CLONE_VM but not
* CLONE_THREAD).
*/
- if (atomic_read(&mm->membarrier_state)
- & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
- return;
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
- &mm->membarrier_state);
+ if (atomic_read(&mm->membarrier_state) & state)
+ return 0;
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
+ &mm->membarrier_state);
+ if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
+ /*
+ * Ensure all future scheduler executions will observe the
+ * new thread flag state for this process.
+ */
+ synchronize_sched();
+ }
+ atomic_or(state, &mm->membarrier_state);
+ return 0;
}
/**
@@ -159,21 +293,28 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
int cmd_mask = MEMBARRIER_CMD_BITMASK;
if (tick_nohz_full_enabled())
- cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+ cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
return cmd_mask;
}
- case MEMBARRIER_CMD_SHARED:
- /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ case MEMBARRIER_CMD_GLOBAL:
+ /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
if (tick_nohz_full_enabled())
return -EINVAL;
if (num_online_cpus() > 1)
synchronize_sched();
return 0;
+ case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ return membarrier_global_expedited();
+ case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- return membarrier_private_expedited();
+ return membarrier_private_expedited(0);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
- membarrier_register_private_expedited();
- return 0;
+ return membarrier_register_private_expedited(0);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
default:
return -EINVAL;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 665ace2fc558..663b2355a3aa 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
+ u64 now = rq_clock_task(rq);
u64 delta_exec;
if (curr->sched_class != &rt_sched_class)
return;
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ delta_exec = now - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
@@ -968,7 +969,7 @@ static void update_curr_rt(struct rq *rq)
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq_clock_task(rq);
+ curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -1907,9 +1908,8 @@ static void push_rt_tasks(struct rq *rq)
* the rt_loop_next will cause the iterator to perform another scan.
*
*/
-static int rto_next_cpu(struct rq *rq)
+static int rto_next_cpu(struct root_domain *rd)
{
- struct root_domain *rd = rq->rd;
int next;
int cpu;
@@ -1985,19 +1985,24 @@ static void tell_cpu_to_push(struct rq *rq)
* Otherwise it is finishing up and an ipi needs to be sent.
*/
if (rq->rd->rto_cpu < 0)
- cpu = rto_next_cpu(rq);
+ cpu = rto_next_cpu(rq->rd);
raw_spin_unlock(&rq->rd->rto_lock);
rto_start_unlock(&rq->rd->rto_loop_start);
- if (cpu >= 0)
+ if (cpu >= 0) {
+ /* Make sure the rd does not get freed while pushing */
+ sched_get_rd(rq->rd);
irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ }
}
/* Called from hardirq context */
void rto_push_irq_work_func(struct irq_work *work)
{
+ struct root_domain *rd =
+ container_of(work, struct root_domain, rto_push_work);
struct rq *rq;
int cpu;
@@ -2013,18 +2018,20 @@ void rto_push_irq_work_func(struct irq_work *work)
raw_spin_unlock(&rq->lock);
}
- raw_spin_lock(&rq->rd->rto_lock);
+ raw_spin_lock(&rd->rto_lock);
/* Pass the IPI to the next rt overloaded queue */
- cpu = rto_next_cpu(rq);
+ cpu = rto_next_cpu(rd);
- raw_spin_unlock(&rq->rd->rto_lock);
+ raw_spin_unlock(&rd->rto_lock);
- if (cpu < 0)
+ if (cpu < 0) {
+ sched_put_rd(rd);
return;
+ }
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
@@ -2212,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
#endif /* CONFIG_SMP */
- if (p->prio < rq->curr->prio)
+ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a212de..fb5fc458547f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy);
}
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * !! For sched_setattr_nocheck() (kernel) only !!
+ *
+ * This is actually gross. :(
+ *
+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
+ * tasks, but still be able to sleep. We need this on platforms that cannot
+ * atomically change clock frequency. Remove once fast switching will be
+ * available on such platforms.
+ *
+ * SUGOV stands for SchedUtil GOVernor.
+ */
+#define SCHED_FLAG_SUGOV 0x10000000
+
+static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+{
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
+#else
+ return false;
+#endif
+}
+
/*
* Tells if entity @a should preempt entity @b.
*/
static inline bool
dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
{
- return dl_time_before(a->deadline, b->deadline);
+ return dl_entity_is_special(a) ||
+ dl_time_before(a->deadline, b->deadline);
}
/*
@@ -665,6 +691,8 @@ extern struct mutex sched_domains_mutex;
extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+extern void sched_get_rd(struct root_domain *rd);
+extern void sched_put_rd(struct root_domain *rd);
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
@@ -1328,47 +1356,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
# define finish_arch_post_lock_switch() do { } while (0)
#endif
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
- /*
- * We can optimise this out completely for !SMP, because the
- * SMP rebalancing from interrupt is the only thing that cares
- * here.
- */
- next->on_cpu = 1;
-#endif
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
- /*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- *
- * In particular, the load of prev->state in finish_task_switch() must
- * happen before this.
- *
- * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
- */
- smp_store_release(&prev->on_cpu, 0);
-#endif
-#ifdef CONFIG_DEBUG_SPINLOCK
- /* this is a valid case when another task releases the spinlock */
- rq->lock.owner = current;
-#endif
- /*
- * If we are tracking spinlock dependencies then we have to
- * fix up the runqueue lock - which gets 'carried over' from
- * prev into current:
- */
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
- raw_spin_unlock_irq(&rq->lock);
-}
-
/*
* wake flags
*/
@@ -1687,17 +1674,17 @@ static inline int hrtick_enabled(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */
-#ifdef CONFIG_SMP
-extern void sched_avg_update(struct rq *rq);
-
#ifndef arch_scale_freq_capacity
static __always_inline
-unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+unsigned long arch_scale_freq_capacity(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
+#ifdef CONFIG_SMP
+extern void sched_avg_update(struct rq *rq);
+
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -1711,10 +1698,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
- rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
sched_avg_update(rq);
}
#else
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
@@ -2096,14 +2090,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
* The way cpufreq is currently arranged requires it to evaluate the CPU
* performance state (frequency/voltage) on a regular basis to prevent it from
* being stuck in a completely inadequate performance level for too long.
- * That is not guaranteed to happen if the updates are only triggered from CFS,
- * though, because they may not be coming in if RT or deadline tasks are active
- * all the time (or there are RT and DL tasks only).
+ * That is not guaranteed to happen if the updates are only triggered from CFS
+ * and DL, though, because they may not be coming in if only RT tasks are
+ * active all the time (or there are RT tasks only).
*
- * As a workaround for that issue, this function is called by the RT and DL
- * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * As a workaround for that issue, this function is called periodically by the
+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
* but that really is a band-aid. Going forward it should be replaced with
- * solutions targeted more specifically at RT and DL tasks.
+ * solutions targeted more specifically at RT tasks.
*/
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
@@ -2125,3 +2119,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+
+static inline unsigned long cpu_util_dl(struct rq *rq)
+{
+ return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+}
+
+static inline unsigned long cpu_util_cfs(struct rq *rq)
+{
+ return rq->cfs.avg.util_avg;
+}
+
+#endif
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index baf500d12b7c..8e7b58de61e7 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -31,8 +31,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
rq->rq_sched_info.run_delay += delta;
}
#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+#define __schedstat_inc(var) do { var++; } while (0)
#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
+#define __schedstat_add(var, amt) do { var += (amt); } while (0)
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val) do { var = (val); } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
@@ -48,8 +51,11 @@ static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
#define schedstat_enabled() 0
+#define __schedstat_inc(var) do { } while (0)
#define schedstat_inc(var) do { } while (0)
+#define __schedstat_add(var, amt) do { } while (0)
#define schedstat_add(var, amt) do { } while (0)
+#define __schedstat_set(var, val) do { } while (0)
#define schedstat_set(var, val) do { } while (0)
#define schedstat_val(var) 0
#define schedstat_val_or_zero(var) 0
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 034cbed7f88b..519b024f4e94 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -259,6 +259,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
+void sched_get_rd(struct root_domain *rd)
+{
+ atomic_inc(&rd->refcount);
+}
+
+void sched_put_rd(struct root_domain *rd)
+{
+ if (!atomic_dec_and_test(&rd->refcount))
+ return;
+
+ call_rcu_sched(&rd->rcu, free_rootdomain);
+}
+
static int init_rootdomain(struct root_domain *rd)
{
if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
OpenPOWER on IntegriCloud