summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c140
-rw-r--r--kernel/sched/cpufreq_schedutil.c14
-rw-r--r--kernel/sched/deadline.c8
-rw-r--r--kernel/sched/fair.c149
-rw-r--r--kernel/sched/idle.c5
-rw-r--r--kernel/sched/psi.c12
6 files changed, 221 insertions, 107 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f195473..7fa8e74ad2ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3486,8 +3486,36 @@ void scheduler_tick(void)
struct tick_work {
int cpu;
+ atomic_t state;
struct delayed_work work;
};
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE 0
+#define TICK_SCHED_REMOTE_OFFLINING 1
+#define TICK_SCHED_REMOTE_RUNNING 2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ * TICK_SCHED_REMOTE_OFFLINE
+ * | ^
+ * | |
+ * | | sched_tick_remote()
+ * | |
+ * | |
+ * +--TICK_SCHED_REMOTE_OFFLINING
+ * | ^
+ * | |
+ * sched_tick_start() | | sched_tick_stop()
+ * | |
+ * V |
+ * TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
static struct tick_work __percpu *tick_work_cpu;
@@ -3500,6 +3528,7 @@ static void sched_tick_remote(struct work_struct *work)
struct task_struct *curr;
struct rq_flags rf;
u64 delta;
+ int os;
/*
* Handle the tick only if it appears the remote CPU is running in full
@@ -3513,7 +3542,7 @@ static void sched_tick_remote(struct work_struct *work)
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr))
+ if (is_idle_task(curr) || cpu_is_offline(cpu))
goto out_unlock;
update_rq_clock(rq);
@@ -3533,13 +3562,18 @@ out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
- * to keep scheduler internal stats reasonably up to date.
+ * to keep scheduler internal stats reasonably up to date. But
+ * first update state to reflect hotplug activity if required.
*/
- queue_delayed_work(system_unbound_wq, dwork, HZ);
+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ if (os == TICK_SCHED_REMOTE_RUNNING)
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
{
+ int os;
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@ -3548,15 +3582,20 @@ static void sched_tick_start(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- twork->cpu = cpu;
- INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
+ int os;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
@@ -3564,7 +3603,10 @@ static void sched_tick_stop(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- cancel_delayed_work_sync(&twork->work);
+ /* There cannot be competing actions, but don't rely on stop-machine. */
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ /* Don't cancel, as this would mess up the state machine. */
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -3572,7 +3614,6 @@ int __init sched_tick_offload_init(void)
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
-
return 0;
}
@@ -3904,7 +3945,7 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state || tsk_is_pi_blocked(tsk))
+ if (!tsk->state)
return;
/*
@@ -3920,6 +3961,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
preempt_enable_no_resched();
}
+ if (tsk_is_pi_blocked(tsk))
+ return;
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -5102,37 +5146,40 @@ out_unlock:
return retval;
}
-static int sched_read_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr,
- unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
{
- int ret;
+ unsigned int ksize = sizeof(*kattr);
if (!access_ok(uattr, usize))
return -EFAULT;
/*
- * If we're handed a smaller struct than we know of,
- * ensure all the unknown bits are 0 - i.e. old
- * user-space does not get uncomplete information.
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * which part the kernel doesn't know about. Just ignore it - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
*/
- if (usize < sizeof(*attr)) {
- unsigned char *addr;
- unsigned char *end;
+ kattr->size = min(usize, ksize);
- addr = (void *)attr + usize;
- end = (void *)attr + sizeof(*attr);
-
- for (; addr < end; addr++) {
- if (*addr)
- return -EFBIG;
- }
-
- attr->size = usize;
- }
-
- ret = copy_to_user(uattr, attr, attr->size);
- if (ret)
+ if (copy_to_user(uattr, kattr, kattr->size))
return -EFAULT;
return 0;
@@ -5142,20 +5189,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
* @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size, unsigned int, flags)
+ unsigned int, usize, unsigned int, flags)
{
- struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- };
+ struct sched_attr kattr = { };
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0 || flags)
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
@@ -5168,25 +5213,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (retval)
goto out_unlock;
- attr.sched_policy = p->policy;
+ kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
if (task_has_dl_policy(p))
- __getparam_dl(p, &attr);
+ __getparam_dl(p, &kattr);
else if (task_has_rt_policy(p))
- attr.sched_priority = p->rt_priority;
+ kattr.sched_priority = p->rt_priority;
else
- attr.sched_nice = task_nice(p);
+ kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
- attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
- retval = sched_read_attr(uattr, &attr, size);
- return retval;
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
out_unlock:
rcu_read_unlock();
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 636ca6f88c8e..867b4bb6d4be 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -40,6 +40,7 @@ struct sugov_policy {
struct task_struct *thread;
bool work_in_progress;
+ bool limits_changed;
bool need_freq_update;
};
@@ -89,8 +90,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (unlikely(sg_policy->need_freq_update))
+ if (unlikely(sg_policy->limits_changed)) {
+ sg_policy->limits_changed = false;
+ sg_policy->need_freq_update = true;
return true;
+ }
delta_ns = time - sg_policy->last_freq_update_time;
@@ -437,7 +441,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- sg_policy->need_freq_update = true;
+ sg_policy->limits_changed = true;
}
static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -457,7 +461,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
- busy = sugov_cpu_is_busy(sg_cpu);
+ /* Limits may have changed, don't skip frequency update */
+ busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
@@ -831,6 +836,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
+ sg_policy->limits_changed = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
@@ -879,7 +885,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
mutex_unlock(&sg_policy->work_lock);
}
- sg_policy->need_freq_update = true;
+ sg_policy->limits_changed = true;
}
struct cpufreq_governor schedutil_gov = {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ef5b9f6b1d42..46122edd8552 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2088,17 +2088,13 @@ retry:
}
deactivate_task(rq, next_task, 0);
- sub_running_bw(&next_task->dl, &rq->dl);
- sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
- add_rq_bw(&next_task->dl, &later_rq->dl);
/*
* Update the later_rq clock here, because the clock is used
* by the cpufreq_update_util() inside __add_running_bw().
*/
update_rq_clock(later_rq);
- add_running_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
ret = 1;
@@ -2186,11 +2182,7 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
- sub_running_bw(&p->dl, &src_rq->dl);
- sub_rq_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu);
- add_rq_bw(&p->dl, &this_rq->dl);
- add_running_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 036be95a87e9..500f5db0de0b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1086,6 +1086,21 @@ struct numa_group {
unsigned long faults[0];
};
+/*
+ * For functions that can be called in multiple contexts that permit reading
+ * ->numa_group (see struct task_struct for locking rules).
+ */
+static struct numa_group *deref_task_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_check(p->numa_group, p == current ||
+ (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+}
+
+static struct numa_group *deref_curr_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_protected(p->numa_group, p == current);
+}
+
static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);
@@ -1129,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long period = smin;
+ struct numa_group *ng;
/* Scale the maximum scan period with the amount of shared memory. */
- if (p->numa_group) {
- struct numa_group *ng = p->numa_group;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
@@ -1140,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p)
period *= shared + 1;
period /= private + shared + 1;
}
+ rcu_read_unlock();
return max(smin, period);
}
@@ -1148,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long smax;
+ struct numa_group *ng;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
/* Scale the maximum scan period with the amount of shared memory. */
- if (p->numa_group) {
- struct numa_group *ng = p->numa_group;
+ ng = deref_curr_numa_group(p);
+ if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
@@ -1186,7 +1205,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
- p->numa_group = NULL;
+ RCU_INIT_POINTER(p->numa_group, NULL);
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
@@ -1233,7 +1252,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
pid_t task_numa_group_id(struct task_struct *p)
{
- return p->numa_group ? p->numa_group->gid : 0;
+ struct numa_group *ng;
+ pid_t gid = 0;
+
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng)
+ gid = ng->gid;
+ rcu_read_unlock();
+
+ return gid;
}
/*
@@ -1258,11 +1286,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
static inline unsigned long group_faults(struct task_struct *p, int nid)
{
- if (!p->numa_group)
+ struct numa_group *ng = deref_task_numa_group(p);
+
+ if (!ng)
return 0;
- return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
- p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+ return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
@@ -1400,12 +1430,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
static inline unsigned long group_weight(struct task_struct *p, int nid,
int dist)
{
+ struct numa_group *ng = deref_task_numa_group(p);
unsigned long faults, total_faults;
- if (!p->numa_group)
+ if (!ng)
return 0;
- total_faults = p->numa_group->total_faults;
+ total_faults = ng->total_faults;
if (!total_faults)
return 0;
@@ -1419,7 +1450,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
- struct numa_group *ng = p->numa_group;
+ struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
@@ -1600,13 +1631,14 @@ static bool load_too_imbalanced(long src_load, long dst_load,
static void task_numa_compare(struct task_numa_env *env,
long taskimp, long groupimp, bool maymove)
{
+ struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ long imp = p_ng ? groupimp : taskimp;
struct task_struct *cur;
long src_load, dst_load;
- long load;
- long imp = env->p->numa_group ? groupimp : taskimp;
- long moveimp = imp;
int dist = env->dist;
+ long moveimp = imp;
+ long load;
if (READ_ONCE(dst_rq->numa_migrate_on))
return;
@@ -1645,21 +1677,22 @@ static void task_numa_compare(struct task_numa_env *env,
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
- if (cur->numa_group == env->p->numa_group) {
+ cur_ng = rcu_dereference(cur->numa_group);
+ if (cur_ng == p_ng) {
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
*/
- if (cur->numa_group)
+ if (cur_ng)
imp -= imp / 16;
} else {
/*
* Compare the group weights. If a task is all by itself
* (not part of a group), use the task weight instead.
*/
- if (cur->numa_group && env->p->numa_group)
+ if (cur_ng && p_ng)
imp += group_weight(cur, env->src_nid, dist) -
group_weight(cur, env->dst_nid, dist);
else
@@ -1757,11 +1790,12 @@ static int task_numa_migrate(struct task_struct *p)
.best_imp = 0,
.best_cpu = -1,
};
+ unsigned long taskweight, groupweight;
struct sched_domain *sd;
+ long taskimp, groupimp;
+ struct numa_group *ng;
struct rq *best_rq;
- unsigned long taskweight, groupweight;
int nid, ret, dist;
- long taskimp, groupimp;
/*
* Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -1807,7 +1841,8 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
- if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
+ ng = deref_curr_numa_group(p);
+ if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -1840,7 +1875,7 @@ static int task_numa_migrate(struct task_struct *p)
* A task that migrated to a second choice node will be better off
* trying for a better one later. Do not set the preferred node here.
*/
- if (p->numa_group) {
+ if (ng) {
if (env.best_cpu == -1)
nid = env.src_nid;
else
@@ -2135,6 +2170,7 @@ static void task_numa_placement(struct task_struct *p)
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ struct numa_group *ng;
/*
* The p->mm->numa_scan_seq field gets updated without
@@ -2152,8 +2188,9 @@ static void task_numa_placement(struct task_struct *p)
runtime = numa_get_avg_runtime(p, &period);
/* If the task is part of a group prevent parallel updates to group stats */
- if (p->numa_group) {
- group_lock = &p->numa_group->lock;
+ ng = deref_curr_numa_group(p);
+ if (ng) {
+ group_lock = &ng->lock;
spin_lock_irq(group_lock);
}
@@ -2194,7 +2231,7 @@ static void task_numa_placement(struct task_struct *p)
p->numa_faults[cpu_idx] += f_diff;
faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
- if (p->numa_group) {
+ if (ng) {
/*
* safe because we can only change our own group
*
@@ -2202,14 +2239,14 @@ static void task_numa_placement(struct task_struct *p)
* nid and priv in a specific region because it
* is at the beginning of the numa_faults array.
*/
- p->numa_group->faults[mem_idx] += diff;
- p->numa_group->faults_cpu[mem_idx] += f_diff;
- p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[mem_idx];
+ ng->faults[mem_idx] += diff;
+ ng->faults_cpu[mem_idx] += f_diff;
+ ng->total_faults += diff;
+ group_faults += ng->faults[mem_idx];
}
}
- if (!p->numa_group) {
+ if (!ng) {
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
@@ -2220,8 +2257,8 @@ static void task_numa_placement(struct task_struct *p)
}
}
- if (p->numa_group) {
- numa_group_count_active_nodes(p->numa_group);
+ if (ng) {
+ numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_nid);
}
@@ -2255,7 +2292,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
int cpu = cpupid_to_cpu(cpupid);
int i;
- if (unlikely(!p->numa_group)) {
+ if (unlikely(!deref_curr_numa_group(p))) {
unsigned int size = sizeof(struct numa_group) +
4*nr_node_ids*sizeof(unsigned long);
@@ -2291,7 +2328,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
goto no_join;
- my_grp = p->numa_group;
+ my_grp = deref_curr_numa_group(p);
if (grp == my_grp)
goto no_join;
@@ -2353,13 +2390,24 @@ no_join:
return;
}
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
{
- struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults;
+ /* safe: p either is current or is being freed by current */
+ struct numa_group *grp = rcu_dereference_raw(p->numa_group);
+ unsigned long *numa_faults = p->numa_faults;
unsigned long flags;
int i;
+ if (!numa_faults)
+ return;
+
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2372,8 +2420,14 @@ void task_numa_free(struct task_struct *p)
put_numa_group(grp);
}
- p->numa_faults = NULL;
- kfree(numa_faults);
+ if (final) {
+ p->numa_faults = NULL;
+ kfree(numa_faults);
+ } else {
+ p->total_numa_faults = 0;
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ numa_faults[i] = 0;
+ }
}
/*
@@ -2426,7 +2480,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
- ng = p->numa_group;
+ ng = deref_curr_numa_group(p);
if (!priv && !local && ng && ng->active_nodes > 1 &&
numa_is_active_node(cpu_node, ng) &&
numa_is_active_node(mem_node, ng))
@@ -4416,6 +4470,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
if (likely(cfs_rq->runtime_remaining > 0))
return;
+ if (cfs_rq->throttled)
+ return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
@@ -4619,6 +4675,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
if (!cfs_rq_throttled(cfs_rq))
goto next;
+ /* By the above check, this should never be true */
+ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > remaining)
runtime = remaining;
@@ -10444,18 +10503,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
{
int node;
unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+ struct numa_group *ng;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
}
- if (p->numa_group) {
- gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
- gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ if (ng) {
+ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
}
print_numa_stats(m, node, tsf, tpf, gsf, gpf);
}
+ rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80940939b733..e4bc4aa739b8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -241,13 +241,14 @@ static void do_idle(void)
check_pgt_cache();
rmb();
+ local_irq_disable();
+
if (cpu_is_offline(cpu)) {
- tick_nohz_idle_stop_tick_protected();
+ tick_nohz_idle_stop_tick();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
- local_irq_disable();
arch_cpu_idle_enter();
/*
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7acc632c3b82..6e52b67b420e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1051,7 +1051,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
if (!rcu_access_pointer(group->poll_kworker)) {
struct sched_param param = {
- .sched_priority = MAX_RT_PRIO - 1,
+ .sched_priority = 1,
};
struct kthread_worker *kworker;
@@ -1061,7 +1061,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
mutex_unlock(&group->trigger_lock);
return ERR_CAST(kworker);
}
- sched_setscheduler(kworker->task, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
kthread_init_delayed_work(&group->poll_work,
psi_poll_work);
rcu_assign_pointer(group->poll_kworker, kworker);
@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref)
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
+ /*
+ * After the RCU grace period has expired, the worker
+ * can no longer be found through group->poll_kworker.
+ * But it might have been already scheduled before
+ * that - deschedule it cleanly before destroying it.
+ */
kthread_cancel_delayed_work_sync(&group->poll_work);
+ atomic_set(&group->poll_scheduled, 0);
+
kthread_destroy_worker(kworker_to_destroy);
}
kfree(t);
OpenPOWER on IntegriCloud