summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/autogroup.h1
-rw-r--r--kernel/sched/clock.c2
-rw-r--r--kernel/sched/completion.c3
-rw-r--r--kernel/sched/core.c86
-rw-r--r--kernel/sched/cpudeadline.c4
-rw-r--r--kernel/sched/cpufreq_schedutil.c40
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/cputime.c6
-rw-r--r--kernel/sched/deadline.c98
-rw-r--r--kernel/sched/debug.c3
-rw-r--r--kernel/sched/fair.c31
-rw-r--r--kernel/sched/features.h5
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/loadavg.c21
-rw-r--r--kernel/sched/rt.c29
-rw-r--r--kernel/sched/sched.h24
-rw-r--r--kernel/sched/stats.h111
-rw-r--r--kernel/sched/swait.c2
-rw-r--r--kernel/sched/wait.c42
19 files changed, 274 insertions, 239 deletions
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 890c95f2587a..ce40c810cd5c 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -2,6 +2,7 @@
#include <linux/kref.h>
#include <linux/rwsem.h>
+#include <linux/sched/autogroup.h>
struct autogroup {
/*
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ad64efe41722..a08795e21628 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -58,6 +58,8 @@
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
+#include <linux/nmi.h>
+#include <linux/sched/clock.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
#include <linux/compiler.h>
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index f063a25d4449..53f9558fa925 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,7 +11,8 @@
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/completion.h>
/**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 34e2291a9a6c..3b31fc05a0f1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,10 +6,15 @@
* Copyright (C) 1991-2002 Linus Torvalds
*/
#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/hotplug.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
+#include <linux/rcupdate_wait.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
@@ -23,6 +28,9 @@
#include <asm/switch_to.h>
#include <asm/tlb.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#include "sched.h"
#include "../workqueue_internal.h"
@@ -978,7 +986,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
return rq;
/* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return rq;
rq = move_queued_task(rq, p, dest_cpu);
@@ -1087,6 +1095,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
int ret = 0;
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
if (p->flags & PF_KTHREAD) {
/*
@@ -1255,10 +1264,10 @@ static int migrate_swap_stop(void *data)
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
- if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+ if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
goto unlock;
- if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+ if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1299,10 +1308,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
- if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+ if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
goto out;
- if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+ if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1486,14 +1495,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+ for_each_cpu(dest_cpu, &p->cpus_allowed) {
if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
continue;
if (!cpu_online(dest_cpu))
@@ -1545,10 +1554,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
- cpu = cpumask_any(tsk_cpus_allowed(p));
+ cpu = cpumask_any(&p->cpus_allowed);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1560,7 +1569,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
* [ this allows ->select_task() to simply return task_cpu(p) and
* not worry about this generic constraint ]
*/
- if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
+ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
!cpu_online(cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
@@ -2844,7 +2853,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
if (!mm) {
next->active_mm = oldmm;
- atomic_inc(&oldmm->mm_count);
+ mmgrab(oldmm);
enter_lazy_tlb(oldmm, next);
} else
switch_mm_irqs_off(oldmm, mm, next);
@@ -3207,6 +3216,15 @@ static inline void preempt_latency_start(int val) { }
static inline void preempt_latency_stop(int val) { }
#endif
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ return p->preempt_disable_ip;
+#else
+ return 0;
+#endif
+}
+
/*
* Print scheduling while atomic bug:
*/
@@ -3269,10 +3287,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
/*
- * Optimization: we know that if all tasks are in
- * the fair class we can call that function directly:
+ * Optimization: we know that if all tasks are in the fair class we can
+ * call that function directly, but only if the @prev task wasn't of a
+ * higher scheduling class, because otherwise those loose the
+ * opportunity to pull in more work from other CPUs.
*/
- if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+ if (likely((prev->sched_class == &idle_sched_class ||
+ prev->sched_class == &fair_sched_class) &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
+
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto again;
@@ -5229,6 +5252,9 @@ void sched_show_task(struct task_struct *p)
int ppid;
unsigned long state = p->state;
+ /* Make sure the string lines up properly with the number of task states: */
+ BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
+
if (!try_get_task_stack(p))
return;
if (state)
@@ -5457,7 +5483,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (curr_cpu == target_cpu)
return 0;
- if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -5557,7 +5583,7 @@ static void migrate_tasks(struct rq *dead_rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct rq_flags rf, old_rf;
+ struct rq_flags rf;
int dest_cpu;
/*
@@ -5576,7 +5602,9 @@ static void migrate_tasks(struct rq *dead_rq)
* class method both need to have an up-to-date
* value of rq->clock[_task]
*/
+ rq_pin_lock(rq, &rf);
update_rq_clock(rq);
+ rq_unpin_lock(rq, &rf);
for (;;) {
/*
@@ -5589,7 +5617,7 @@ static void migrate_tasks(struct rq *dead_rq)
/*
* pick_next_task() assumes pinned rq->lock:
*/
- rq_pin_lock(rq, &rf);
+ rq_repin_lock(rq, &rf);
next = pick_next_task(rq, &fake_task, &rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5618,13 +5646,6 @@ static void migrate_tasks(struct rq *dead_rq)
continue;
}
- /*
- * __migrate_task() may return with a different
- * rq->lock held and a new cookie in 'rf', but we need
- * to preserve rf::clock_update_flags for 'dead_rq'.
- */
- old_rf = rf;
-
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
@@ -5633,7 +5654,6 @@ static void migrate_tasks(struct rq *dead_rq)
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
- rf = old_rf;
}
raw_spin_unlock(&next->pi_lock);
}
@@ -6095,7 +6115,7 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- atomic_inc(&init_mm.mm_count);
+ mmgrab(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
@@ -6816,11 +6836,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -7226,6 +7255,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index e73119013c53..fba235c7d026 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -128,10 +128,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
best_cpu = cpumask_any(later_mask);
goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
+ } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
best_cpu = cpudl_maximum(cp);
if (later_mask)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index fd4659313640..54c577578da6 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -13,6 +13,7 @@
#include <linux/cpufreq.h>
#include <linux/kthread.h>
+#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
#include <trace/events/power.h>
@@ -35,6 +36,7 @@ struct sugov_policy {
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
+ unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
@@ -51,7 +53,6 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
- unsigned int cached_raw_freq;
unsigned long iowait_boost;
unsigned long iowait_boost_max;
u64 last_update;
@@ -115,7 +116,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @sg_cpu: schedutil cpu object to compute the new frequency for.
+ * @sg_policy: schedutil policy object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
@@ -135,19 +136,18 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
* next_freq (as calculated above) is returned, subject to policy min/max and
* cpufreq driver limitations.
*/
-static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
- unsigned long max)
+static unsigned int get_next_freq(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
{
- struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
freq = (freq + (freq >> 2)) * util / max;
- if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
return sg_policy->next_freq;
- sg_cpu->cached_raw_freq = freq;
+ sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
@@ -212,7 +212,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
} else {
sugov_get_util(&util, &max);
sugov_iowait_boost(sg_cpu, &util, &max);
- next_f = get_next_freq(sg_cpu, util, max);
+ next_f = get_next_freq(sg_policy, util, max);
}
sugov_update_commit(sg_policy, time, next_f);
}
@@ -266,7 +266,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
sugov_iowait_boost(j_sg_cpu, &util, &max);
}
- return get_next_freq(sg_cpu, util, max);
+ return get_next_freq(sg_policy, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -579,25 +579,19 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->next_freq = UINT_MAX;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
+ sg_policy->cached_raw_freq = 0;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->sg_policy = sg_policy;
- if (policy_is_shared(policy)) {
- sg_cpu->util = 0;
- sg_cpu->max = 0;
- sg_cpu->flags = SCHED_CPUFREQ_RT;
- sg_cpu->last_update = 0;
- sg_cpu->cached_raw_freq = 0;
- sg_cpu->iowait_boost = 0;
- sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- sugov_update_shared);
- } else {
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- sugov_update_single);
- }
+ sg_cpu->flags = SCHED_CPUFREQ_RT;
+ sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ policy_is_shared(policy) ?
+ sugov_update_shared :
+ sugov_update_single);
}
return 0;
}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 11e9705bf937..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
- if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
- cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
+ cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
/*
* We have to ensure that we have at least one bit
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2ecec3a4f1ee..f3778e2b46c8 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,12 +4,8 @@
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
#include "sched.h"
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27737f34757d..a2ce59015642 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory++;
update_dl_migration(dl_rq);
@@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory--;
update_dl_migration(dl_rq);
@@ -252,7 +252,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online cpu.
*/
- cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
if (cpu >= nr_cpu_ids) {
/*
* Fail to find any suitable cpu.
@@ -445,13 +445,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*
* This function returns true if:
*
- * runtime / (deadline - t) > dl_runtime / dl_period ,
+ * runtime / (deadline - t) > dl_runtime / dl_deadline ,
*
* IOW we can't recycle current parameters.
*
- * Notice that the bandwidth check is done against the period. For
+ * Notice that the bandwidth check is done against the deadline. For
* task with deadline equal to period this is the same of using
- * dl_deadline instead of dl_period in the equation above.
+ * dl_period instead of dl_deadline in the equation above.
*/
static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se, u64 t)
@@ -476,7 +476,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
(pi_se->dl_runtime >> DL_SCALE);
@@ -505,10 +505,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
}
}
+static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
+{
+ return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
+}
+
/*
* If the entity depleted all its runtime, and if we want it to sleep
* while waiting for some new execution time to become available, we
- * set the bandwidth enforcement timer to the replenishment instant
+ * set the bandwidth replenishment timer to the replenishment instant
* and try to activate it.
*
* Notice that it is important for the caller to know if the timer
@@ -530,7 +535,7 @@ static int start_dl_timer(struct task_struct *p)
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
*/
- act = ns_to_ktime(dl_se->deadline);
+ act = ns_to_ktime(dl_next_period(dl_se));
now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -638,6 +643,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
rf.cookie = lockdep_pin_lock(&rq->lock);
+ update_rq_clock(rq);
/*
* Now that the task has been migrated to the new RQ and we
@@ -689,6 +695,37 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
timer->function = dl_task_timer;
}
+/*
+ * During the activation, CBS checks if it can reuse the current task's
+ * runtime and period. If the deadline of the task is in the past, CBS
+ * cannot use the runtime, and so it replenishes the task. This rule
+ * works fine for implicit deadline tasks (deadline == period), and the
+ * CBS was designed for implicit deadline tasks. However, a task with
+ * constrained deadline (deadine < period) might be awakened after the
+ * deadline, but before the next period. In this case, replenishing the
+ * task would allow it to run for runtime / deadline. As in this case
+ * deadline < period, CBS enables a task to run for more than the
+ * runtime / period. In a very loaded system, this can cause a domino
+ * effect, making other tasks miss their deadlines.
+ *
+ * To avoid this problem, in the activation of a constrained deadline
+ * task after the deadline but before the next period, throttle the
+ * task and set the replenishing timer to the begin of the next period,
+ * unless it is boosted.
+ */
+static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ return;
+ dl_se->dl_throttled = 1;
+ }
+}
+
static
int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
@@ -922,6 +959,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
__dequeue_dl_entity(dl_se);
}
+static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline < dl_se->dl_period;
+}
+
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -948,6 +990,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
}
/*
+ * Check if a constrained deadline task was activated
+ * after the deadline but before the next period.
+ * If that is the case, the task will be throttled and
+ * the replenishment timer will be set to the next period.
+ */
+ if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+ dl_check_constrained_dl(&p->dl);
+
+ /*
* If p is throttled, we do nothing. In fact, if it exhausted
* its budget it needs a replenishment and, since it now is on
* its rq, the bandwidth timer callback (which clearly has not
@@ -958,7 +1009,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
enqueue_dl_entity(&p->dl, pi_se, flags);
- if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1032,9 +1083,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* try to make it stay here, it might be important.
*/
if (unlikely(dl_task(curr)) &&
- (tsk_nr_cpus_allowed(curr) < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
!dl_entity_preempt(&p->dl, &curr->dl)) &&
- (tsk_nr_cpus_allowed(p) > 1)) {
+ (p->nr_cpus_allowed > 1)) {
int target = find_later_rq(p);
if (target != -1 &&
@@ -1055,7 +1106,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
+ if (rq->curr->nr_cpus_allowed == 1 ||
cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
return;
@@ -1063,7 +1114,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (tsk_nr_cpus_allowed(p) != 1 &&
+ if (p->nr_cpus_allowed != 1 &&
cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
return;
@@ -1178,7 +1229,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
- if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
+ if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1235,7 +1286,7 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
return 0;
}
@@ -1279,7 +1330,7 @@ static int find_later_rq(struct task_struct *task)
if (unlikely(!later_mask))
return -1;
- if (tsk_nr_cpus_allowed(task) == 1)
+ if (task->nr_cpus_allowed == 1)
return -1;
/*
@@ -1384,8 +1435,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu,
- tsk_cpus_allowed(task)) ||
+ !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
@@ -1425,7 +1475,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
BUG_ON(!dl_task(p));
@@ -1464,7 +1514,7 @@ retry:
*/
if (dl_task(rq->curr) &&
dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
- tsk_nr_cpus_allowed(rq->curr) > 1) {
+ rq->curr->nr_cpus_allowed > 1) {
resched_curr(rq);
return 0;
}
@@ -1611,9 +1661,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- tsk_nr_cpus_allowed(p) > 1 &&
+ p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
- (tsk_nr_cpus_allowed(rq->curr) < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
!dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
@@ -1727,7 +1777,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (rq->curr != p) {
#ifdef CONFIG_SMP
- if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
#endif
if (dl_task(rq->curr))
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 109adc0e9cb9..38f019324f1a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -11,7 +11,8 @@
*/
#include <linux/proc_fs.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 274c747a01ce..dea138964b91 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,9 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/topology.h>
+
#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
@@ -1551,7 +1553,7 @@ static void task_numa_compare(struct task_numa_env *env,
*/
if (cur) {
/* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
goto unlock;
/*
@@ -1661,7 +1663,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
continue;
env->dst_cpu = cpu;
@@ -5458,7 +5460,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_cpus(group),
- tsk_cpus_allowed(p)))
+ &p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
@@ -5578,7 +5580,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return cpumask_first(sched_group_cpus(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
+ for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5717,7 +5719,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
if (!test_idle_cores(target, false))
return -1;
- cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+ cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
for_each_cpu_wrap(core, cpus, target, wrap) {
bool idle = true;
@@ -5751,7 +5753,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
return cpu;
@@ -5797,13 +5799,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
* Due to large variance we need a large fuzz factor; hackbench in
* particularly is sensitive here.
*/
- if ((avg_idle / 512) < avg_cost)
+ if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
return -1;
time = local_clock();
for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
break;
@@ -5958,7 +5960,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ && cpumask_test_cpu(cpu, &p->cpus_allowed);
}
rcu_read_lock();
@@ -6698,7 +6700,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -6718,7 +6720,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Prevent to re-select dst_cpu via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
@@ -7252,7 +7254,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
/*
* Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to tsk_cpus_allowed() constraints.
+ * groups is inadequate due to ->cpus_allowed constraints.
*
* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
@@ -8211,8 +8213,7 @@ more_balance:
* if the curr task on busiest cpu can't be
* moved to this_cpu
*/
- if (!cpumask_test_cpu(this_cpu,
- tsk_cpus_allowed(busiest->curr))) {
+ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..1b3c8189b286 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -51,6 +51,11 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
*/
SCHED_FEAT(TTWU_QUEUE, true)
+/*
+ * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
+ */
+SCHED_FEAT(SIS_AVG_CPU, false)
+
#ifdef HAVE_RT_PUSH_IPI
/*
* In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6a4bae0a649d..ac6d5176463d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -2,6 +2,7 @@
* Generic entry point for the idle threads
*/
#include <linux/sched.h>
+#include <linux/sched/idle.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
#include <linux/cpuhotplug.h>
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a2d6eb71f06b..f15fb2bdbc0d 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -7,6 +7,7 @@
*/
#include <linux/export.h>
+#include <linux/sched/loadavg.h>
#include "sched.h"
@@ -168,7 +169,7 @@ static inline int calc_load_write_idx(void)
* If the folding window started, make sure we start writing in the
* next idle-delta.
*/
- if (!time_before(jiffies, calc_load_update))
+ if (!time_before(jiffies, READ_ONCE(calc_load_update)))
idx++;
return idx & 1;
@@ -201,8 +202,9 @@ void calc_load_exit_idle(void)
struct rq *this_rq = this_rq();
/*
- * If we're still before the sample window, we're done.
+ * If we're still before the pending sample window, we're done.
*/
+ this_rq->calc_load_update = READ_ONCE(calc_load_update);
if (time_before(jiffies, this_rq->calc_load_update))
return;
@@ -211,7 +213,6 @@ void calc_load_exit_idle(void)
* accounted through the nohz accounting, so skip the entire deal and
* sync up for the next window.
*/
- this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update + 10))
this_rq->calc_load_update += LOAD_FREQ;
}
@@ -307,13 +308,15 @@ calc_load_n(unsigned long load, unsigned long exp,
*/
static void calc_global_nohz(void)
{
+ unsigned long sample_window;
long delta, active, n;
- if (!time_before(jiffies, calc_load_update + 10)) {
+ sample_window = READ_ONCE(calc_load_update);
+ if (!time_before(jiffies, sample_window + 10)) {
/*
* Catch-up, fold however many we are behind still
*/
- delta = jiffies - calc_load_update - 10;
+ delta = jiffies - sample_window - 10;
n = 1 + (delta / LOAD_FREQ);
active = atomic_long_read(&calc_load_tasks);
@@ -323,7 +326,7 @@ static void calc_global_nohz(void)
avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- calc_load_update += n * LOAD_FREQ;
+ WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
}
/*
@@ -351,9 +354,11 @@ static inline void calc_global_nohz(void) { }
*/
void calc_global_load(unsigned long ticks)
{
+ unsigned long sample_window;
long active, delta;
- if (time_before(jiffies, calc_load_update + 10))
+ sample_window = READ_ONCE(calc_load_update);
+ if (time_before(jiffies, sample_window + 10))
return;
/*
@@ -370,7 +375,7 @@ void calc_global_load(unsigned long ticks)
avenrun[1] = calc_load(avenrun[1], EXP_5, active);
avenrun[2] = calc_load(avenrun[2], EXP_15, active);
- calc_load_update += LOAD_FREQ;
+ WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
/*
* In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e8836cfc4cdb..9f3e40226dec 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -335,7 +335,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -352,7 +352,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags);
- if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (tsk_nr_cpus_allowed(curr) < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
@@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
+ if (rq->curr->nr_cpus_allowed == 1 ||
!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
@@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (tsk_nr_cpus_allowed(p) != 1
+ if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1591,7 +1591,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
return 0;
}
@@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (tsk_nr_cpus_allowed(task) == 1)
+ if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1726,8 +1726,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu,
- tsk_cpus_allowed(task)) ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -1762,7 +1761,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
BUG_ON(!rt_task(p));
@@ -2122,9 +2121,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- tsk_nr_cpus_allowed(p) > 1 &&
+ p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
- (tsk_nr_cpus_allowed(rq->curr) < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -2197,7 +2196,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 71b10a9b73cf..5cbf92214ad8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,9 +1,26 @@
#include <linux/sched.h>
+#include <linux/sched/autogroup.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/topology.h>
#include <linux/sched/rt.h>
-#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/cpufreq.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/init.h>
+
+#include <linux/u64_stats_sync.h>
#include <linux/kernel_stat.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
@@ -13,6 +30,10 @@
#include <linux/tick.h>
#include <linux/slab.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
#include "cpupri.h"
#include "cpudeadline.h"
#include "cpuacct.h"
@@ -1817,7 +1838,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
extern void
print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
-
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index bf0da0aa0a14..d5710651043b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -164,114 +164,3 @@ sched_info_switch(struct rq *rq,
#define sched_info_arrive(rq, next) do { } while (0)
#define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
-
-/*
- * The following are functions that support scheduler-internal time accounting.
- * These functions are generally called at the timer tick. None of this depends
- * on CONFIG_SCHEDSTATS.
- */
-
-/**
- * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
- *
- * @tsk: Pointer to target task.
- */
-#ifdef CONFIG_POSIX_TIMERS
-static inline
-struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
-{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
- /* Check if cputimer isn't running. This is accessed without locking. */
- if (!READ_ONCE(cputimer->running))
- return NULL;
-
- /*
- * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
- * in __exit_signal(), we won't account to the signal struct further
- * cputime consumed by that task, even though the task can still be
- * ticking after __exit_signal().
- *
- * In order to keep a consistent behaviour between thread group cputime
- * and thread group cputimer accounting, lets also ignore the cputime
- * elapsing after __exit_signal() in any thread group timer running.
- *
- * This makes sure that POSIX CPU clocks and timers are synchronized, so
- * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
- * clock delta is behind the expiring timer value.
- */
- if (unlikely(!tsk->sighand))
- return NULL;
-
- return cputimer;
-}
-#else
-static inline
-struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
-{
- return NULL;
-}
-#endif
-
-/**
- * account_group_user_time - Maintain utime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @cputime: Time value by which to increment the utime field of the
- * thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
- u64 cputime)
-{
- struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
- if (!cputimer)
- return;
-
- atomic64_add(cputime, &cputimer->cputime_atomic.utime);
-}
-
-/**
- * account_group_system_time - Maintain stime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @cputime: Time value by which to increment the stime field of the
- * thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void account_group_system_time(struct task_struct *tsk,
- u64 cputime)
-{
- struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
- if (!cputimer)
- return;
-
- atomic64_add(cputime, &cputimer->cputime_atomic.stime);
-}
-
-/**
- * account_group_exec_runtime - Maintain exec runtime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @ns: Time value by which to increment the sum_exec_runtime field
- * of the thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void account_group_exec_runtime(struct task_struct *tsk,
- unsigned long long ns)
-{
- struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
- if (!cputimer)
- return;
-
- atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
-}
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 82f0dff90030..3d5610dcce11 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,4 +1,4 @@
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/swait.h>
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9453efe9b25a..b8c84c6dee64 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -5,7 +5,8 @@
*/
#include <linux/init.h>
#include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
@@ -241,6 +242,45 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_event);
+/*
+ * Note! These two wait functions are entered with the
+ * wait-queue lock held (and interrupts off in the _irq
+ * case), so there is no race with testing the wakeup
+ * condition in the caller before they add the wait
+ * entry to the wake queue.
+ */
+int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait)
+{
+ if (likely(list_empty(&wait->task_list)))
+ __add_wait_queue_tail(wq, wait);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ spin_unlock(&wq->lock);
+ schedule();
+ spin_lock(&wq->lock);
+ return 0;
+}
+EXPORT_SYMBOL(do_wait_intr);
+
+int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait)
+{
+ if (likely(list_empty(&wait->task_list)))
+ __add_wait_queue_tail(wq, wait);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ spin_unlock_irq(&wq->lock);
+ schedule();
+ spin_lock_irq(&wq->lock);
+ return 0;
+}
+EXPORT_SYMBOL(do_wait_intr_irq);
+
/**
* finish_wait - clean up after waiting in a queue
* @q: waitqueue waited on
OpenPOWER on IntegriCloud