diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/autogroup.c | 7 | ||||
-rw-r--r-- | kernel/sched/core.c | 58 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 19 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 61 | ||||
-rw-r--r-- | kernel/sched/rt.c | 4 | ||||
-rw-r--r-- | kernel/sched/sched.h | 17 |
7 files changed, 56 insertions, 112 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 6be6c575b6cd..2d4ff5353ded 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -2,6 +2,7 @@ /* * Auto-group scheduling implementation: */ +#include <linux/nospec.h> #include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; @@ -209,7 +210,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; unsigned long shares; - int err; + int err, idx; if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; @@ -227,7 +228,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) next = HZ / 10 + jiffies; ag = autogroup_task_get(p); - shares = scale_load(sched_prio_to_weight[nice + 20]); + + idx = array_index_nospec(nice + 20, 40); + shares = scale_load(sched_prio_to_weight[idx]); down_write(&ag->lock); err = sched_group_set_shares(ag->tg, shares); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e8afd6086f23..092f7c4de903 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,9 @@ */ #include "sched.h" +#include <linux/kthread.h> +#include <linux/nospec.h> + #include <asm/switch_to.h> #include <asm/tlb.h> @@ -874,7 +877,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP @@ -2718,20 +2721,28 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state == TASK_DEAD)) { - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { + switch (prev_state) { + case TASK_DEAD: + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); + + /* Task is done with its stack. */ + put_task_stack(prev); - /* Task is done with its stack. */ - put_task_stack(prev); + put_task_struct(prev); + break; - put_task_struct(prev); + case TASK_PARKED: + kthread_park_complete(prev); + break; + } } tick_nohz_task_switch(); @@ -3498,23 +3509,8 @@ static void __sched notrace __schedule(bool preempt) void __noreturn do_task_dead(void) { - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - raw_spin_lock_irq(¤t->pi_lock); - raw_spin_unlock_irq(¤t->pi_lock); - /* Causes final put_task_struct in finish_task_switch(): */ - __set_current_state(TASK_DEAD); + set_special_state(TASK_DEAD); /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; @@ -6928,11 +6924,15 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; + int idx; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; - weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; + idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO; + idx = array_index_nospec(idx, 40); + weight = sched_prio_to_weight[idx]; + return sched_group_set_shares(css_tg(css), scale_load(weight)); } #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2b124811947d..e13df951aca7 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -305,7 +305,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. */ - if (busy && next_f < sg_policy->next_freq) { + if (busy && next_f < sg_policy->next_freq && + sg_policy->next_freq != UINT_MAX) { next_f = sg_policy->next_freq; /* Reset cached freq as next_freq has changed */ @@ -396,19 +397,6 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - /* - * For RT tasks, the schedutil governor shoots the frequency to maximum. - * Special care must be taken to ensure that this kthread doesn't result - * in the same behavior. - * - * This is (mostly) guaranteed by the work_in_progress flag. The flag is - * updated only at the end of the sugov_work() function and before that - * the schedutil governor rejects all other frequency scaling requests. - * - * There is a very rare case though, where the RT thread yields right - * after the work_in_progress flag is cleared. The effects of that are - * neglected for now. - */ kthread_queue_work(&sg_policy->worker, &sg_policy->work); } @@ -631,10 +619,9 @@ fail: stop_kthread: sugov_kthread_stop(sg_policy); - -free_sg_policy: mutex_unlock(&global_tunables_lock); +free_sg_policy: sugov_policy_free(sg_policy); disable_fast_switch: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d1c7bf7c7e5b..e7b3008b85bb 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1560,7 +1560,7 @@ static void yield_task_dl(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0951d1c58d2f..79f574dba096 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1854,7 +1854,6 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { unsigned long interval = HZ; - unsigned long numa_migrate_retry; /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) @@ -1862,18 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p) /* Periodically retry migrating the task to the preferred node */ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); - numa_migrate_retry = jiffies + interval; - - /* - * Check that the new retry threshold is after the current one. If - * the retry is in the future, it implies that wake_affine has - * temporarily asked NUMA balancing to backoff from placement. - */ - if (numa_migrate_retry > p->numa_migrate_retry) - return; - - /* Safe to try placing the task on the preferred node */ - p->numa_migrate_retry = numa_migrate_retry; + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) @@ -5922,48 +5910,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; } -#ifdef CONFIG_NUMA_BALANCING -static void -update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) -{ - unsigned long interval; - - if (!static_branch_likely(&sched_numa_balancing)) - return; - - /* If balancing has no preference then continue gathering data */ - if (p->numa_preferred_nid == -1) - return; - - /* - * If the wakeup is not affecting locality then it is neutral from - * the perspective of NUMA balacing so continue gathering data. - */ - if (cpu_to_node(prev_cpu) == cpu_to_node(target)) - return; - - /* - * Temporarily prevent NUMA balancing trying to place waker/wakee after - * wakee has been moved by wake_affine. This will potentially allow - * related tasks to converge and update their data placement. The - * 4 * numa_scan_period is to allow the two-pass filter to migrate - * hot data to the wakers node. - */ - interval = max(sysctl_numa_balancing_scan_delay, - p->numa_scan_period << 2); - p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); - - interval = max(sysctl_numa_balancing_scan_delay, - current->numa_scan_period << 2); - current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); -} -#else -static void -update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) -{ -} -#endif - static int wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5979,7 +5925,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (target == nr_cpumask_bits) return prev_cpu; - update_wa_numa_placement(p, prev_cpu, target); schedstat_inc(sd->ttwu_move_affine); schedstat_inc(p->se.statistics.nr_wakeups_affine); return target; @@ -7089,7 +7034,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } set_skip_buddy(se); @@ -9847,6 +9792,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; +out: /* * While browsing the domains, we released the rq lock, a task could * have been enqueued in the meantime. Since we're not going idle, @@ -9855,7 +9801,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; -out: /* Move the next balance forward */ if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 86b77987435e..7aef6b4e885a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -839,6 +839,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) continue; raw_spin_lock(&rq->lock); + update_rq_clock(rq); + if (rt_rq->rt_time) { u64 runtime; @@ -859,7 +861,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * 'runtime'. */ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq_clock_skip_update(rq, false); + rq_clock_cancel_skipupdate(rq); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c3deaee7a7a2..15750c222ca2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -976,13 +976,20 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -static inline void rq_clock_skip_update(struct rq *rq, bool skip) +static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_held(&rq->lock); - if (skip) - rq->clock_update_flags |= RQCF_REQ_SKIP; - else - rq->clock_update_flags &= ~RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttoling, which is the only time a skip + * request is cancelled. + */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; } struct rq_flags { |