diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 43 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 2 | ||||
-rw-r--r-- | kernel/sched/debug.c | 6 | ||||
-rw-r--r-- | kernel/sched/fair.c | 341 | ||||
-rw-r--r-- | kernel/sched/features.h | 2 | ||||
-rw-r--r-- | kernel/sched/idle.c | 15 | ||||
-rw-r--r-- | kernel/sched/loadavg.c | 139 | ||||
-rw-r--r-- | kernel/sched/pelt.c | 8 | ||||
-rw-r--r-- | kernel/sched/pelt.h | 2 | ||||
-rw-r--r-- | kernel/sched/psi.c | 759 | ||||
-rw-r--r-- | kernel/sched/sched.h | 207 | ||||
-rw-r--r-- | kernel/sched/stats.h | 86 | ||||
-rw-r--r-- | kernel/sched/topology.c | 111 |
14 files changed, 1419 insertions, 303 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c38..21fb5a5662b5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_PSI) += psi.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 625bc9897f62..fd2fce8a001b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -135,9 +135,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) * In theory, the compile should just see 0 here, and optimize out the call * to sched_rt_avg_update. But I don't trust it... */ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif + s64 __maybe_unused steal = 0, irq_delta = 0; + #ifdef CONFIG_IRQ_TIME_ACCOUNTING irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; @@ -177,7 +176,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif @@ -701,6 +700,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; + p->se.runnable_weight = load->weight; return; } @@ -713,6 +713,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; + p->se.runnable_weight = load->weight; } } @@ -721,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) + if (!(flags & ENQUEUE_RESTORE)) { sched_info_queued(rq, p); + psi_enqueue(p, flags & ENQUEUE_WAKEUP); + } p->sched_class->enqueue_task(rq, p, flags); } @@ -732,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) + if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeued(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); + } p->sched_class->dequeue_task(rq, p, flags); } @@ -1167,7 +1172,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p); + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); @@ -2036,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } @@ -2915,10 +2921,10 @@ unsigned long nr_iowait(void) } /* - * Consumers of these two interfaces, like for example the cpufreq menu - * governor are using nonsensical data. Boosting frequency for a CPU that has - * IO-wait which might not even end up running the task when it does become - * runnable. + * Consumers of these two interfaces, like for example the cpuidle menu + * governor, are using nonsensical data. Preferring shallow idle state selection + * for a CPU that has IO-wait which might not even end up running the task when + * it does become runnable. */ unsigned long nr_iowait_cpu(int cpu) @@ -3050,6 +3056,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); + psi_task_tick(rq); rq_unlock(rq, &rf); @@ -4932,9 +4939,7 @@ static void do_sched_yield(void) struct rq_flags rf; struct rq *rq; - local_irq_disable(); - rq = this_rq(); - rq_lock(rq, &rf); + rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); @@ -5243,7 +5248,7 @@ out_unlock: * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) + struct __kernel_timespec __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5254,16 +5259,16 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return retval; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, - struct compat_timespec __user *, interval) + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); if (retval == 0) - retval = compat_put_timespec64(&t, interval); + retval = put_old_timespec32(&t, interval); return retval; } #endif @@ -6068,6 +6073,8 @@ void __init sched_init(void) init_schedstats(); + psi_init(); + scheduler_running = 1; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 997ea7b839fa..91e4202b0634 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1607,7 +1607,7 @@ out: return cpu; } -static void migrate_task_rq_dl(struct task_struct *p) +static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { struct rq *rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 60caf1fb94e0..6383aa6a60ca 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -89,12 +89,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - static_key_disable(&sched_feat_keys[i]); + static_key_disable_cpuslocked(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - static_key_enable(&sched_feat_keys[i]); + static_key_enable_cpuslocked(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -146,9 +146,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); + cpus_read_lock(); inode_lock(inode); ret = sched_feat_set(cmp); inode_unlock(inode); + cpus_read_unlock(); if (ret < 0) return ret; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b39fb596f6c1..ee271bb661cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); +static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -1392,6 +1393,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int last_cpupid, this_cpupid; this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + + /* + * Allow first faults or private faults to migrate immediately early in + * the lifetime of a task. The magic number 4 is based on waiting for + * two full passes of the "multi-stage node selection" test that is + * executed below. + */ + if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) + return true; /* * Multi-stage node selection is used in conjunction with a periodic @@ -1410,7 +1422,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation. */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != dst_nid) return false; @@ -1446,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -1454,8 +1464,6 @@ struct numa_stats { /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - - unsigned int nr_running; }; /* @@ -1463,36 +1471,16 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int smt, cpu, cpus = 0; - unsigned long capacity; + int cpu; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { struct rq *rq = cpu_rq(cpu); - ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); - - cpus++; } - /* - * If we raced with hotplug and there are no CPUs left in our mask - * the @ns structure is NULL'ed and task_numa_compare() will - * not find this node attractive. - * - * We'll detect a huge imbalance and bail there. - */ - if (!cpus) - return; - - /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); - capacity = cpus / smt; /* cores */ - - capacity = min_t(unsigned, capacity, - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); } struct task_numa_env { @@ -1514,6 +1502,21 @@ struct task_numa_env { static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { + struct rq *rq = cpu_rq(env->dst_cpu); + + /* Bail out if run-queue part of active NUMA balance. */ + if (xchg(&rq->numa_migrate_on, 1)) + return; + + /* + * Clear previous best_cpu/rq numa-migrate flag, since task now + * found a better CPU to move/swap. + */ + if (env->best_cpu != -1) { + rq = cpu_rq(env->best_cpu); + WRITE_ONCE(rq->numa_migrate_on, 0); + } + if (env->best_task) put_task_struct(env->best_task); if (p) @@ -1553,6 +1556,13 @@ static bool load_too_imbalanced(long src_load, long dst_load, } /* + * Maximum NUMA importance can be 1998 (2*999); + * SMALLIMP @ 30 would be close to 1998/64. + * Used to deter task migration. + */ +#define SMALLIMP 30 + +/* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking * into account that it might be best if task running on the dst_cpu should @@ -1569,6 +1579,9 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; int dist = env->dist; + if (READ_ONCE(dst_rq->numa_migrate_on)) + return; + rcu_read_lock(); cur = task_rcu_dereference(&dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) @@ -1582,7 +1595,7 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; if (!cur) { - if (maymove || imp > env->best_imp) + if (maymove && moveimp >= env->best_imp) goto assign; else goto unlock; @@ -1625,16 +1638,22 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp) - goto unlock; - if (maymove && moveimp > imp && moveimp > env->best_imp) { - imp = moveimp - 1; + imp = moveimp; cur = NULL; goto assign; } /* + * If the NUMA importance is less than SMALLIMP, + * task migration might only result in ping pong + * of tasks and also hurt performance due to cache + * misses. + */ + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) + goto unlock; + + /* * In the overloaded case, try and keep the load balanced. */ load = task_h_load(env->p) - task_h_load(cur); @@ -1710,6 +1729,7 @@ static int task_numa_migrate(struct task_struct *p) .best_cpu = -1, }; struct sched_domain *sd; + struct rq *best_rq; unsigned long taskweight, groupweight; int nid, ret, dist; long taskimp, groupimp; @@ -1805,20 +1825,17 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - /* - * Reset the scan period if the task is being rescheduled on an - * alternative node to recheck if the tasks is now properly placed. - */ - p->numa_scan_period = task_scan_start(p); - + best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); @@ -2596,6 +2613,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } +static void update_scan_period(struct task_struct *p, int new_cpu) +{ + int src_nid = cpu_to_node(task_cpu(p)); + int dst_nid = cpu_to_node(new_cpu); + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) + return; + + if (src_nid == dst_nid) + return; + + /* + * Allow resets if faults have been trapped before one scan + * has completed. This is most likely due to a new task that + * is pulled cross-node due to wakeups or load balancing. + */ + if (p->numa_scan_seq) { + /* + * Avoid scan adjustments if moving to the preferred + * node or if the task was not previously running on + * the preferred node. + */ + if (dst_nid == p->numa_preferred_nid || + (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + return; + } + + p->numa_scan_period = task_scan_start(p); +} + #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2609,6 +2659,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void update_scan_period(struct task_struct *p, int new_cpu) +{ +} + #endif /* CONFIG_NUMA_BALANCING */ static void @@ -3362,6 +3416,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach + * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. @@ -3646,6 +3701,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) WRITE_ONCE(p->se.avg.util_est, ue); } +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ + return capacity * 1024 > task_util_est(p) * capacity_margin; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + rq->misfit_task_load = task_h_load(p); +} + #else /* CONFIG_SMP */ #define UPDATE_TG 0x0 @@ -3675,6 +3753,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -3924,7 +4003,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * put back on, and if we advance min_vruntime, we'll be placed back * further than we started -- ie. we'll be penalized. */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); } @@ -4399,9 +4478,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) /* * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us + * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is + * not running add to the tail so that later runqueues don't get starved. */ - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (cfs_b->distribute_running) + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + else + list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); /* * If we're the first throttled task, make sure the bandwidth @@ -4545,14 +4628,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * in us over-using our runtime if it is all used during this loop, but * only by limited amounts in that extreme case. */ - while (throttled && cfs_b->runtime > 0) { + while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; + cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); raw_spin_lock(&cfs_b->lock); + cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->runtime -= min(runtime, cfs_b->runtime); @@ -4663,6 +4748,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); + if (cfs_b->distribute_running) { + raw_spin_unlock(&cfs_b->lock); + return; + } + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock(&cfs_b->lock); return; @@ -4672,6 +4762,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) runtime = cfs_b->runtime; expires = cfs_b->runtime_expires; + if (runtime) + cfs_b->distribute_running = 1; + raw_spin_unlock(&cfs_b->lock); if (!runtime) @@ -4682,6 +4775,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -4790,6 +4884,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; + cfs_b->distribute_running = 0; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -6187,6 +6282,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 0; + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; @@ -6197,7 +6295,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) /* Bring task utilization in sync with prev_cpu */ sync_entity_load_avg(&p->se); - return min_cap * 1024 < task_util(p) * capacity_margin; + return !task_fits_capacity(p, min_cap); } /* @@ -6274,7 +6372,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ -static void migrate_task_rq_fair(struct task_struct *p) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { /* * As blocked tasks retain absolute vruntime the migration needs to @@ -6327,6 +6425,8 @@ static void migrate_task_rq_fair(struct task_struct *p) /* We have migrated, no longer consider this task hot */ p->se.exec_start = 0; + + update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) @@ -6614,9 +6714,12 @@ done: __maybe_unused; if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + update_misfit_status(p, rq); + return p; idle: + update_misfit_status(NULL, rq); new_tasks = idle_balance(rq, rf); /* @@ -6822,6 +6925,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6852,6 +6962,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type src_grp_type; struct list_head tasks; }; @@ -7232,7 +7343,7 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; #endif @@ -7263,6 +7374,7 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; + const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7299,8 +7411,10 @@ static void update_blocked_averages(int cpu) if (cfs_rq_has_blocked(cfs_rq)) done = false; } - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ if (others_have_blocked(rq)) @@ -7365,13 +7479,16 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; + const struct sched_class *curr_class; struct rq_flags rf; rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -7389,12 +7506,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -7410,6 +7521,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -7482,10 +7594,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long scale_rt_capacity(int cpu) +static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long max = arch_scale_cpu_capacity(sd, cpu); unsigned long used, free; unsigned long irq; @@ -7507,7 +7619,7 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(cpu); + unsigned long capacity = scale_rt_capacity(sd, cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); @@ -7518,13 +7630,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, min_capacity; + unsigned long capacity, min_capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7538,6 +7651,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; min_capacity = ULONG_MAX; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -7568,6 +7682,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } min_capacity = min(capacity, min_capacity); + max_capacity = max(capacity, max_capacity); } } else { /* @@ -7581,12 +7696,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; min_capacity = min(sgc->min_capacity, min_capacity); + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = min_capacity; + sdg->sgc->max_capacity = max_capacity; } /* @@ -7682,16 +7799,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* - * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool -group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * capacity_margin < ref->sgc->min_capacity * 1024; } +/* + * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity_orig than sched_group ref. + */ +static inline bool +group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity * capacity_margin < + ref->sgc->max_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7702,6 +7830,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task_load) + return group_misfit_task; + return group_other; } @@ -7734,7 +7865,7 @@ static bool update_nohz_stats(struct rq *rq, bool force) * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate more than one runnable task for any CPU. + * @overload: Indicate pullable load (e.g. >1 runnable task). */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, @@ -7776,6 +7907,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; + *overload = 1; + } } /* Adjust by relative CPU capacity of the group */ @@ -7811,6 +7948,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* + * Don't try to pull misfit tasks we can't help. + * We can use max_capacity here as reduction in capacity on some + * CPUs in the group should either be possible to resolve + * internally or be covered by avg_load imbalance (eventually). + */ + if (sgs->group_type == group_misfit_task && + (!group_smaller_max_cpu_capacity(sg, sds->local) || + !group_has_capacity(env, &sds->local_stat))) + return false; + if (sgs->group_type > busiest->group_type) return true; @@ -7830,7 +7978,14 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) + group_smaller_min_cpu_capacity(sds->local, sg)) + return false; + + /* + * If we have more than one misfit sg go with the biggest misfit. + */ + if (sgs->group_type == group_misfit_task && + sgs->group_misfit_task_load < busiest->group_misfit_task_load) return false; asym_packing: @@ -7901,11 +8056,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx, prefer_sibling = 0; + int load_idx; bool overload = false; - - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; + bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) @@ -7979,8 +8132,8 @@ next_group: if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (env->dst_rq->rd->overload != overload) - env->dst_rq->rd->overload = overload; + if (READ_ONCE(env->dst_rq->rd->overload) != overload) + WRITE_ONCE(env->dst_rq->rd->overload, overload); } } @@ -8130,8 +8283,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * factors in sg capacity and sgs with smaller group_type are * skipped when updating the busiest sg: */ - if (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load) { + if (busiest->group_type != group_misfit_task && + (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load)) { env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -8165,6 +8319,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task_load); + } + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -8231,6 +8391,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -8268,8 +8432,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ + env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); - return sds.busiest; + return env->imbalance ? sds.busiest : NULL; out_balanced: env->imbalance = 0; @@ -8315,8 +8480,32 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we simply + * seek the "biggest" misfit task. + */ + if (env->src_grp_type == group_misfit_task) { + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + continue; + } + capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a CPU that could + * eventually lead to active_balancing high->low capacity. + * Higher per-CPU capacity is considered better than balancing + * average load. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(rq); /* @@ -8384,6 +8573,9 @@ static int need_active_balance(struct lb_env *env) return 1; } + if (env->src_grp_type == group_misfit_task) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -9026,7 +9218,7 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2) { + if (rq->nr_running >= 2 || rq->misfit_task_load) { flags = NOHZ_KICK_MASK; goto out; } @@ -9395,7 +9587,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); @@ -9557,6 +9749,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); } /* @@ -9638,7 +9832,8 @@ static inline bool vruntime_normalized(struct task_struct *p) * - A task which has been woken up by try_to_wake_up() and * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!se->sum_exec_runtime || p->state == TASK_WAKING) + if (!se->sum_exec_runtime || + (p->state == TASK_WAKING && p->sched_remote_wakeup)) return true; return false; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 85ae8488039c..858589b83377 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,7 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) -SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(LB_BIAS, false) /* * Decrement CPU capacity based on time not spent running tasks diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 16f84142f2f4..f5516bae0c1b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle); void cpu_startup_entry(enum cpuhp_state state) { - /* - * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (ARM and SH have never invoked the canary - * init for the non boot CPUs!). Will be fixed in 3.11 - */ -#ifdef CONFIG_X86 - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. The boot CPU already has it initialized but no harm - * in doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); -#endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); while (1) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a171c1258109..28a516575c18 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,19 +91,73 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } -/* - * a1 = a0 * e + a * (1 - e) +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. */ static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) { - unsigned long newload; + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; + return result; +} - return newload / FIXED_1; +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); } #ifdef CONFIG_NO_HZ_COMMON @@ -225,75 +279,6 @@ static long calc_load_nohz_fold(void) return delta; } -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) { - for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - /* * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 35475c0c5419..90fb5bc12ad4 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -269,9 +269,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; @@ -282,9 +279,6 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, cfs_rq->curr == se)) { @@ -358,7 +352,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: * diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index d2894db28955..7e56b489ff32 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -6,7 +6,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else static inline int diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c new file mode 100644 index 000000000000..7cdecfc010af --- /dev/null +++ b/kernel/sched/psi.c @@ -0,0 +1,759 @@ +/* + * Pressure stall information for CPU, memory and IO + * + * Copyright (c) 2018 Facebook, Inc. + * Author: Johannes Weiner <hannes@cmpxchg.org> + * + * When CPU, memory and IO are contended, tasks experience delays that + * reduce throughput and introduce latencies into the workload. Memory + * and IO contention, in addition, can cause a full loss of forward + * progress in which the CPU goes idle. + * + * This code aggregates individual task delays into resource pressure + * metrics that indicate problems with both workload health and + * resource utilization. + * + * Model + * + * The time in which a task can execute on a CPU is our baseline for + * productivity. Pressure expresses the amount of time in which this + * potential cannot be realized due to resource contention. + * + * This concept of productivity has two components: the workload and + * the CPU. To measure the impact of pressure on both, we define two + * contention states for a resource: SOME and FULL. + * + * In the SOME state of a given resource, one or more tasks are + * delayed on that resource. This affects the workload's ability to + * perform work, but the CPU may still be executing other tasks. + * + * In the FULL state of a given resource, all non-idle tasks are + * delayed on that resource such that nobody is advancing and the CPU + * goes idle. This leaves both workload and CPU unproductive. + * + * (Naturally, the FULL state doesn't exist for the CPU resource.) + * + * SOME = nr_delayed_tasks != 0 + * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * + * The percentage of wallclock time spent in those compound stall + * states gives pressure numbers between 0 and 100 for each resource, + * where the SOME percentage indicates workload slowdowns and the FULL + * percentage indicates reduced CPU utilization: + * + * %SOME = time(SOME) / period + * %FULL = time(FULL) / period + * + * Multiple CPUs + * + * The more tasks and available CPUs there are, the more work can be + * performed concurrently. This means that the potential that can go + * unrealized due to resource contention *also* scales with non-idle + * tasks and CPUs. + * + * Consider a scenario where 257 number crunching tasks are trying to + * run concurrently on 256 CPUs. If we simply aggregated the task + * states, we would have to conclude a CPU SOME pressure number of + * 100%, since *somebody* is waiting on a runqueue at all + * times. However, that is clearly not the amount of contention the + * workload is experiencing: only one out of 256 possible exceution + * threads will be contended at any given time, or about 0.4%. + * + * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any + * given time *one* of the tasks is delayed due to a lack of memory. + * Again, looking purely at the task state would yield a memory FULL + * pressure number of 0%, since *somebody* is always making forward + * progress. But again this wouldn't capture the amount of execution + * potential lost, which is 1 out of 4 CPUs, or 25%. + * + * To calculate wasted potential (pressure) with multiple processors, + * we have to base our calculation on the number of non-idle tasks in + * conjunction with the number of available CPUs, which is the number + * of potential execution threads. SOME becomes then the proportion of + * delayed tasks to possibe threads, and FULL is the share of possible + * threads that are unproductive due to delays: + * + * threads = min(nr_nonidle_tasks, nr_cpus) + * SOME = min(nr_delayed_tasks / threads, 1) + * FULL = (threads - min(nr_running_tasks, threads)) / threads + * + * For the 257 number crunchers on 256 CPUs, this yields: + * + * threads = min(257, 256) + * SOME = min(1 / 256, 1) = 0.4% + * FULL = (256 - min(257, 256)) / 256 = 0% + * + * For the 1 out of 4 memory-delayed tasks, this yields: + * + * threads = min(4, 4) + * SOME = min(1 / 4, 1) = 25% + * FULL = (4 - min(3, 4)) / 4 = 25% + * + * [ Substitute nr_cpus with 1, and you can see that it's a natural + * extension of the single-CPU model. ] + * + * Implementation + * + * To assess the precise time spent in each such state, we would have + * to freeze the system on task changes and start/stop the state + * clocks accordingly. Obviously that doesn't scale in practice. + * + * Because the scheduler aims to distribute the compute load evenly + * among the available CPUs, we can track task state locally to each + * CPU and, at much lower frequency, extrapolate the global state for + * the cumulative stall times and the running averages. + * + * For each runqueue, we track: + * + * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) + * + * and then periodically aggregate: + * + * tNONIDLE = sum(tNONIDLE[i]) + * + * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE + * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE + * + * %SOME = tSOME / period + * %FULL = tFULL / period + * + * This gives us an approximation of pressure that is practical + * cost-wise, yet way more sensitive and accurate than periodic + * sampling of the aggregate task states would be. + */ + +#include <linux/sched/loadavg.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/seqlock.h> +#include <linux/cgroup.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/psi.h> +#include "sched.h" + +static int psi_bug __read_mostly; + +bool psi_disabled __read_mostly; +core_param(psi_disabled, psi_disabled, bool, 0644); + +/* Running averages - we need to be higher-res than loadavg */ +#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ +#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ +#define EXP_60s 1981 /* 1/exp(2s/60s) */ +#define EXP_300s 2034 /* 1/exp(2s/300s) */ + +/* Sampling frequency in nanoseconds */ +static u64 psi_period __read_mostly; + +/* System-level pressure and stall tracking */ +static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); +static struct psi_group psi_system = { + .pcpu = &system_group_pcpu, +}; + +static void psi_update_work(struct work_struct *work); + +static void group_init(struct psi_group *group) +{ + int cpu; + + for_each_possible_cpu(cpu) + seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + group->next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->clock_work, psi_update_work); + mutex_init(&group->stat_lock); +} + +void __init psi_init(void) +{ + if (psi_disabled) + return; + + psi_period = jiffies_to_nsecs(PSI_FREQ); + group_init(&psi_system); +} + +static bool test_state(unsigned int *tasks, enum psi_states state) +{ + switch (state) { + case PSI_IO_SOME: + return tasks[NR_IOWAIT]; + case PSI_IO_FULL: + return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + case PSI_MEM_SOME: + return tasks[NR_MEMSTALL]; + case PSI_MEM_FULL: + return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + case PSI_CPU_SOME: + return tasks[NR_RUNNING] > 1; + case PSI_NONIDLE: + return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || + tasks[NR_RUNNING]; + default: + return false; + } +} + +static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +{ + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + u64 now, state_start; + unsigned int seq; + int s; + + /* Snapshot a coherent view of the CPU state */ + do { + seq = read_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); + memcpy(times, groupc->times, sizeof(groupc->times)); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_start = groupc->state_start; + } while (read_seqcount_retry(&groupc->seq, seq)); + + /* Calculate state time deltas against the previous snapshot */ + for (s = 0; s < NR_PSI_STATES; s++) { + u32 delta; + /* + * In addition to already concluded states, we also + * incorporate currently active states on the CPU, + * since states may last for many sampling periods. + * + * This way we keep our delta sampling buckets small + * (u32) and our reported pressure close to what's + * actually happening. + */ + if (test_state(tasks, s)) + times[s] += now - state_start; + + delta = times[s] - groupc->times_prev[s]; + groupc->times_prev[s] = times[s]; + + times[s] = delta; + } +} + +static void calc_avgs(unsigned long avg[3], int missed_periods, + u64 time, u64 period) +{ + unsigned long pct; + + /* Fill in zeroes for periods of no activity */ + if (missed_periods) { + avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); + avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); + avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); + } + + /* Sample the most recent active period */ + pct = div_u64(time * 100, period); + pct *= FIXED_1; + avg[0] = calc_load(avg[0], EXP_10s, pct); + avg[1] = calc_load(avg[1], EXP_60s, pct); + avg[2] = calc_load(avg[2], EXP_300s, pct); +} + +static bool update_stats(struct psi_group *group) +{ + u64 deltas[NR_PSI_STATES - 1] = { 0, }; + unsigned long missed_periods = 0; + unsigned long nonidle_total = 0; + u64 now, expires, period; + int cpu; + int s; + + mutex_lock(&group->stat_lock); + + /* + * Collect the per-cpu time buckets and average them into a + * single time sample that is normalized to wallclock time. + * + * For averaging, each CPU is weighted by its non-idle time in + * the sampling period. This eliminates artifacts from uneven + * loading, or even entirely idle CPUs. + */ + for_each_possible_cpu(cpu) { + u32 times[NR_PSI_STATES]; + u32 nonidle; + + get_recent_times(group, cpu, times); + + nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); + nonidle_total += nonidle; + + for (s = 0; s < PSI_NONIDLE; s++) + deltas[s] += (u64)times[s] * nonidle; + } + + /* + * Integrate the sample into the running statistics that are + * reported to userspace: the cumulative stall times and the + * decaying averages. + * + * Pressure percentages are sampled at PSI_FREQ. We might be + * called more often when the user polls more frequently than + * that; we might be called less often when there is no task + * activity, thus no data, and clock ticks are sporadic. The + * below handles both. + */ + + /* total= */ + for (s = 0; s < NR_PSI_STATES - 1; s++) + group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + + /* avgX= */ + now = sched_clock(); + expires = group->next_update; + if (now < expires) + goto out; + if (now - expires > psi_period) + missed_periods = div_u64(now - expires, psi_period); + + /* + * The periodic clock tick can get delayed for various + * reasons, especially on loaded systems. To avoid clock + * drift, we schedule the clock in fixed psi_period intervals. + * But the deltas we sample out of the per-cpu buckets above + * are based on the actual time elapsing between clock ticks. + */ + group->next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->last_update + (missed_periods * psi_period)); + group->last_update = now; + + for (s = 0; s < NR_PSI_STATES - 1; s++) { + u32 sample; + + sample = group->total[s] - group->total_prev[s]; + /* + * Due to the lockless sampling of the time buckets, + * recorded time deltas can slip into the next period, + * which under full pressure can result in samples in + * excess of the period length. + * + * We don't want to report non-sensical pressures in + * excess of 100%, nor do we want to drop such events + * on the floor. Instead we punt any overage into the + * future until pressure subsides. By doing this we + * don't underreport the occurring pressure curve, we + * just report it delayed by one period length. + * + * The error isn't cumulative. As soon as another + * delta slips from a period P to P+1, by definition + * it frees up its time T in P. + */ + if (sample > period) + sample = period; + group->total_prev[s] += sample; + calc_avgs(group->avg[s], missed_periods, sample, period); + } +out: + mutex_unlock(&group->stat_lock); + return nonidle_total; +} + +static void psi_update_work(struct work_struct *work) +{ + struct delayed_work *dwork; + struct psi_group *group; + bool nonidle; + + dwork = to_delayed_work(work); + group = container_of(dwork, struct psi_group, clock_work); + + /* + * If there is task activity, periodically fold the per-cpu + * times and feed samples into the running averages. If things + * are idle and there is no data to process, stop the clock. + * Once restarted, we'll catch up the running averages in one + * go - see calc_avgs() and missed_periods. + */ + + nonidle = update_stats(group); + + if (nonidle) { + unsigned long delay = 0; + u64 now; + + now = sched_clock(); + if (group->next_update > now) + delay = nsecs_to_jiffies(group->next_update - now) + 1; + schedule_delayed_work(dwork, delay); + } +} + +static void record_times(struct psi_group_cpu *groupc, int cpu, + bool memstall_tick) +{ + u32 delta; + u64 now; + + now = cpu_clock(cpu); + delta = now - groupc->state_start; + groupc->state_start = now; + + if (test_state(groupc->tasks, PSI_IO_SOME)) { + groupc->times[PSI_IO_SOME] += delta; + if (test_state(groupc->tasks, PSI_IO_FULL)) + groupc->times[PSI_IO_FULL] += delta; + } + + if (test_state(groupc->tasks, PSI_MEM_SOME)) { + groupc->times[PSI_MEM_SOME] += delta; + if (test_state(groupc->tasks, PSI_MEM_FULL)) + groupc->times[PSI_MEM_FULL] += delta; + else if (memstall_tick) { + u32 sample; + /* + * Since we care about lost potential, a + * memstall is FULL when there are no other + * working tasks, but also when the CPU is + * actively reclaiming and nothing productive + * could run even if it were runnable. + * + * When the timer tick sees a reclaiming CPU, + * regardless of runnable tasks, sample a FULL + * tick (or less if it hasn't been a full tick + * since the last state change). + */ + sample = min(delta, (u32)jiffies_to_nsecs(1)); + groupc->times[PSI_MEM_FULL] += sample; + } + } + + if (test_state(groupc->tasks, PSI_CPU_SOME)) + groupc->times[PSI_CPU_SOME] += delta; + + if (test_state(groupc->tasks, PSI_NONIDLE)) + groupc->times[PSI_NONIDLE] += delta; +} + +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) +{ + struct psi_group_cpu *groupc; + unsigned int t, m; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + /* + * First we assess the aggregate resource states this CPU's + * tasks have been in since the last change, and account any + * SOME and FULL time these may have resulted in. + * + * Then we update the task counts according to the state + * change requested through the @clear and @set bits. + */ + write_seqcount_begin(&groupc->seq); + + record_times(groupc, cpu, false); + + for (t = 0, m = clear; m; m &= ~(1 << t), t++) { + if (!(m & (1 << t))) + continue; + if (groupc->tasks[t] == 0 && !psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + cpu, t, groupc->tasks[0], + groupc->tasks[1], groupc->tasks[2], + clear, set); + psi_bug = 1; + } + groupc->tasks[t]--; + } + + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + groupc->tasks[t]++; + + write_seqcount_end(&groupc->seq); + + if (!delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); +} + +static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +{ +#ifdef CONFIG_CGROUPS + struct cgroup *cgroup = NULL; + + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else if (*iter == &psi_system) + return NULL; + else + cgroup = cgroup_parent(*iter); + + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } +#else + if (*iter) + return NULL; +#endif + *iter = &psi_system; + return &psi_system; +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + struct psi_group *group; + void *iter = NULL; + + if (!task->pid) + return; + + if (((task->psi_flags & set) || + (task->psi_flags & clear) != clear) && + !psi_bug) { + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, cpu, + task->psi_flags, clear, set); + psi_bug = 1; + } + + task->psi_flags &= ~clear; + task->psi_flags |= set; + + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set); +} + +void psi_memstall_tick(struct task_struct *task, int cpu) +{ + struct psi_group *group; + void *iter = NULL; + + while ((group = iterate_groups(task, &iter))) { + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); + } +} + +/** + * psi_memstall_enter - mark the beginning of a memory stall section + * @flags: flags to handle nested sections + * + * Marks the calling task as being stalled due to a lack of memory, + * such as waiting for a refault or performing reclaim. + */ +void psi_memstall_enter(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + *flags = current->flags & PF_MEMSTALL; + if (*flags) + return; + /* + * PF_MEMSTALL setting & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we can + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags |= PF_MEMSTALL; + psi_task_change(current, 0, TSK_MEMSTALL); + + rq_unlock_irq(rq, &rf); +} + +/** + * psi_memstall_leave - mark the end of an memory stall section + * @flags: flags to handle nested memdelay sections + * + * Marks the calling task as no longer stalled due to lack of memory. + */ +void psi_memstall_leave(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + if (*flags) + return; + /* + * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we could + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags &= ~PF_MEMSTALL; + psi_task_change(current, TSK_MEMSTALL, 0); + + rq_unlock_irq(rq, &rf); +} + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgroup) +{ + if (psi_disabled) + return 0; + + cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi.pcpu) + return -ENOMEM; + group_init(&cgroup->psi); + return 0; +} + +void psi_cgroup_free(struct cgroup *cgroup) +{ + if (psi_disabled) + return; + + cancel_delayed_work_sync(&cgroup->psi.clock_work); + free_percpu(cgroup->psi.pcpu); +} + +/** + * cgroup_move_task - move task to a different cgroup + * @task: the task + * @to: the target css_set + * + * Move task to a new cgroup and safely migrate its associated stall + * state between the different groups. + * + * This function acquires the task's rq lock to lock out concurrent + * changes to the task's scheduling state and - in case the task is + * running - concurrent changes to its stall state. + */ +void cgroup_move_task(struct task_struct *task, struct css_set *to) +{ + bool move_psi = !psi_disabled; + unsigned int task_flags = 0; + struct rq_flags rf; + struct rq *rq; + + if (move_psi) { + rq = task_rq_lock(task, &rf); + + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; + + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; + + if (task_flags) + psi_task_change(task, task_flags, 0); + } + + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + + if (move_psi) { + if (task_flags) + psi_task_change(task, 0, task_flags); + + task_rq_unlock(rq, task, &rf); + } +} +#endif /* CONFIG_CGROUPS */ + +int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) +{ + int full; + + if (psi_disabled) + return -EOPNOTSUPP; + + update_stats(group); + + for (full = 0; full < 2 - (res == PSI_CPU); full++) { + unsigned long avg[3]; + u64 total; + int w; + + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + full ? "full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + + return 0; +} + +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + +static const struct file_operations psi_io_fops = { + .open = psi_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_memory_fops = { + .open = psi_memory_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_cpu_fops = { + .open = psi_cpu_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init psi_proc_init(void) +{ + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + return 0; +} +module_init(psi_proc_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a2e8cae63c4..618577fc9aa8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -54,9 +54,9 @@ #include <linux/proc_fs.h> #include <linux/prefetch.h> #include <linux/profile.h> +#include <linux/psi.h> #include <linux/rcupdate_wait.h> #include <linux/security.h> -#include <linux/stackprotector.h> #include <linux/stop_machine.h> #include <linux/suspend.h> #include <linux/swait.h> @@ -320,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED #include <linux/cgroup.h> +#include <linux/psi.h> struct cfs_rq; struct rt_rq; @@ -346,6 +347,8 @@ struct cfs_bandwidth { int nr_periods; int nr_throttled; u64 throttled_time; + + bool distribute_running; #endif }; @@ -715,8 +718,12 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; - /* Indicate more than one runnable task for any CPU */ - bool overload; + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ + int overload; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -783,6 +790,7 @@ struct rq { #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; + unsigned int numa_migrate_on; #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; @@ -842,6 +850,8 @@ struct rq { unsigned char idle_balance; + unsigned long misfit_task_load; + /* For active balancing */ int active_balance; int push_cpu; @@ -855,8 +865,7 @@ struct rq { struct sched_avg avg_rt; struct sched_avg avg_dl; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) -#define HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif u64 idle_stamp; @@ -950,6 +959,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +extern void update_rq_clock(struct rq *rq); + static inline u64 __rq_clock_broken(struct rq *rq) { return READ_ONCE(rq->clock); @@ -1068,6 +1079,98 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif } +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, @@ -1185,6 +1288,7 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { atomic_t ref; @@ -1194,6 +1298,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ @@ -1393,7 +1498,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = 0; #undef SCHED_FEAT -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ @@ -1523,7 +1628,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); @@ -1693,8 +1798,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP - if (!rq->rd->overload) - rq->rd->overload = true; + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); #endif } @@ -1708,8 +1813,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -extern void update_rq_clock(struct rq *rq); - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1774,86 +1877,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) #endif #endif -struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(rq->lock); - -struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(p->pi_lock) - __acquires(rq->lock); - -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - __releases(rq->lock) - __releases(p->pi_lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -} - -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irqsave(&rq->lock, rf->flags); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irq(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); -} - -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -} - -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); -} - -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT @@ -2214,7 +2237,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq) } #endif -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { return rq->avg_irq.util_avg; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8aea199a39b4..4904c4677000 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (psi_disabled) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->flags & PF_MEMSTALL) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (psi_disabled) + return; + + if (!sleep) { + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (psi_disabled) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (psi_disabled) + return; + + if (unlikely(rq->curr->flags & PF_MEMSTALL)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 56a0fed30c0a..9d74371e4aad 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -7,8 +7,8 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ -cpumask_var_t sched_domains_tmpmask; -cpumask_var_t sched_domains_tmpmask2; +static cpumask_var_t sched_domains_tmpmask; +static cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG @@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) { @@ -692,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg_span = sched_group_span(sg); sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } static int @@ -851,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; return sg; } @@ -1061,7 +1064,6 @@ static struct cpumask ***sched_domains_numa_masks; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1073,13 +1075,12 @@ static struct cpumask ***sched_domains_numa_masks; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -1100,6 +1101,9 @@ sd_init(struct sched_domain_topology_level *tl, "wrong sd_flags in topology description\n")) sd_flags &= ~TOPOLOGY_SD_FLAGS; + /* Apply detected topology flags */ + sd_flags |= dflags; + *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, @@ -1122,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING + | 1*SD_PREFER_SIBLING | 0*SD_NUMA | sd_flags , @@ -1148,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_ASYM_CPUCAPACITY) { struct sched_domain *t = sd; + /* + * Don't attempt to spread across CPUs of different capacities. + */ + if (sd->child) + sd->child->flags &= ~SD_PREFER_SIBLING; + for_each_lower_domain(t) t->flags |= SD_BALANCE_WAKE; } if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 117; sd->cache_nice_tries = 1; sd->busy_idx = 2; @@ -1169,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->busy_idx = 3; sd->idle_idx = 2; + sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { sd->flags &= ~(SD_BALANCE_EXEC | @@ -1178,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { - sd->flags |= SD_PREFER_SIBLING; sd->cache_nice_tries = 1; sd->busy_idx = 2; sd->idle_idx = 1; @@ -1295,7 +1303,7 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (sched_domains_numa_levels <= 1) { + if (sched_domains_numa_levels <= 2) { sched_numa_topology_type = NUMA_DIRECT; return; } @@ -1380,9 +1388,6 @@ void sched_init_numa(void) break; } - if (!level) - return; - /* * 'level' contains the number of unique distances * @@ -1607,9 +1612,9 @@ static void __sdt_free(const struct cpumask *cpu_map) static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu); if (child) { sd->level = child->level + 1; @@ -1636,6 +1641,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve } /* + * Find the sched_domain_topology_level where all CPU capacities are visible + * for all CPUs. + */ +static struct sched_domain_topology_level +*asym_cpu_capacity_level(const struct cpumask *cpu_map) +{ + int i, j, asym_level = 0; + bool asym = false; + struct sched_domain_topology_level *tl, *asym_tl = NULL; + unsigned long cap; + + /* Is there any asymmetry? */ + cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + + for_each_cpu(i, cpu_map) { + if (arch_scale_cpu_capacity(NULL, i) != cap) { + asym = true; + break; + } + } + + if (!asym) + return NULL; + + /* + * Examine topology from all CPU's point of views to detect the lowest + * sched_domain_topology_level where a highest capacity CPU is visible + * to everyone. + */ + for_each_cpu(i, cpu_map) { + unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + int tl_id = 0; + + for_each_sd_topology(tl) { + if (tl_id < asym_level) + goto next_level; + + for_each_cpu_and(j, tl->mask(i), cpu_map) { + unsigned long capacity; + + capacity = arch_scale_cpu_capacity(NULL, j); + + if (capacity <= max_capacity) + continue; + + max_capacity = capacity; + asym_level = tl_id; + asym_tl = tl; + } +next_level: + tl_id++; + } + } + + return asym_tl; +} + + +/* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs */ @@ -1647,18 +1711,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; + struct sched_domain_topology_level *tl_asym; + bool has_asym = false; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; + tl_asym = asym_cpu_capacity_level(cpu_map); + /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); + int dflags = 0; + + if (tl == tl_asym) { + dflags |= SD_ASYM_CPUCAPACITY; + has_asym = true; + } + + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP) @@ -1707,6 +1783,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } rcu_read_unlock(); + if (has_asym) + static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); |