diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 2042 |
1 files changed, 1203 insertions, 839 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc9cfeaac8bd..3c8a379c357e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu) } /* - * The margin used when comparing utilization with CPU capacity: - * util * margin < capacity * 1024 + * The margin used when comparing utilization with CPU capacity. * * (default: ~20%) */ -static unsigned int capacity_margin = 1280; +#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) + #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight } } - /* hint to use a 32x32->64 mul */ - fact = (u64)(u32)fact * lw->inv_weight; + fact = mul_u32_u32(fact, lw->inv_weight); while (fact >> 32) { fact >>= 1; @@ -749,7 +748,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static void attach_entity_cfs_rq(struct sched_entity *se); /* @@ -803,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p) * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the @@ -1188,47 +1186,6 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) -{ - int mm_users = 0; - struct mm_struct *mm = p->mm; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) { - mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - mm->numa_scan_seq = 0; - } - } - p->node_stamp = 0; - p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - RCU_INIT_POINTER(p->numa_group, NULL); - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - /* New address space, reset the preferred nid */ - if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = NUMA_NO_NODE; - return; - } - - /* - * New thread, keep existing numa_preferred_nid which should be copied - * already by arch_dup_task_struct but stagger when scans start. - */ - if (mm) { - unsigned int delay; - - delay = min_t(unsigned int, task_scan_max(current), - current->numa_scan_period * mm_users * NSEC_PER_MSEC); - delay += 2 * TICK_NSEC; - p->node_stamp = delay; - } -} - static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); @@ -1516,7 +1473,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long cpu_runnable_load(struct rq *rq); +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); + +static unsigned long cpu_runnable_load(struct rq *rq) +{ + return cfs_rq_runnable_load_avg(&rq->cfs); +} /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -1644,7 +1606,7 @@ static void task_numa_compare(struct task_numa_env *env, return; rcu_read_lock(); - cur = task_rcu_dereference(&dst_rq->curr); + cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; @@ -2523,7 +2485,7 @@ static void reset_ptenuma_scan(struct task_struct *p) * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). */ -void task_numa_work(struct callback_head *work) +static void task_numa_work(struct callback_head *work) { unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; @@ -2536,7 +2498,7 @@ void task_numa_work(struct callback_head *work) SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); - work->next = work; /* protect against double add */ + work->next = work; /* * Who cares about NUMA placement when they're dying. * @@ -2665,6 +2627,50 @@ out: } } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + /* Protect against double add, see task_tick_numa and task_numa_work */ + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + init_task_work(&p->numa_work, task_numa_work); + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = NUMA_NO_NODE; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + /* * Drive the periodic memory faults.. */ @@ -2693,10 +2699,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; - if (!time_before(jiffies, curr->mm->numa_next_scan)) { - init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + if (!time_before(jiffies, curr->mm->numa_next_scan)) task_work_add(curr, work, true); - } } } @@ -3110,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3362,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { + delta_sum = runnable_load_sum - + se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } + + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3504,9 +3509,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed) - cfs_rq_util_change(cfs_rq, 0); - return decayed; } @@ -3514,12 +3516,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach - * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. */ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; @@ -3555,7 +3556,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq, flags); + cfs_rq_util_change(cfs_rq, 0); trace_pelt_cfs_tp(cfs_rq); } @@ -3613,11 +3614,15 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * * IOW we're enqueueing a task on a new CPU. */ - attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, 0); - } else if (decayed && (flags & UPDATE_TG)) - update_tg_load_avg(cfs_rq, 0); + } else if (decayed) { + cfs_rq_util_change(cfs_rq, 0); + + if (flags & UPDATE_TG) + update_tg_load_avg(cfs_rq, 0); + } } #ifndef CONFIG_64BIT @@ -3689,8 +3694,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq, struct rq_flags *rf); - static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -3708,6 +3711,20 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return task_util_est(p); +} +#endif + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -3766,10 +3783,21 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) return; /* + * Reset EWMA on utilization increases, the moving average is used only + * to smooth utilization decreases. + */ + ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + if (sched_feat(UTIL_EST_FASTUP)) { + if (ue.ewma < ue.enqueued) { + ue.ewma = ue.enqueued; + goto done; + } + } + + /* * Skip update of task's estimated utilization when its EWMA is * already ~1% close to its last activation value. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); last_ewma_diff = ue.enqueued - ue.ewma; if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) return; @@ -3802,12 +3830,13 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; +done: WRITE_ONCE(p->se.avg.util_est, ue); } static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return capacity * 1024 > task_util_est(p) * capacity_margin; + return fits_capacity(uclamp_task_util(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -3842,7 +3871,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} @@ -4355,23 +4384,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. * * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4379,22 +4401,12 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } -/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; - - return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; -} - /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4411,65 +4423,23 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. - */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; + if (cfs_rq->throttled) + return; /* * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled @@ -4522,7 +4492,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; @@ -4554,7 +4523,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; + long task_delta, idle_task_delta, dequeue = 1; bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4565,6 +4534,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -4574,6 +4544,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; if (qcfs_rq->load.weight) dequeue = 0; @@ -4613,7 +4584,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; - long task_delta; + long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4633,6 +4604,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) return; task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) enqueue = 0; @@ -4641,6 +4613,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; if (cfs_rq_throttled(cfs_rq)) break; @@ -4656,8 +4629,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -4673,13 +4645,15 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, if (!cfs_rq_throttled(cfs_rq)) goto next; + /* By the above check, this should never be true */ + SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + runtime = -cfs_rq->runtime_remaining + 1; if (runtime > remaining) runtime = remaining; remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -4704,7 +4678,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4732,8 +4706,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. This can potentially race with an unthrottled group @@ -4746,8 +4718,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u cfs_b->distribute_running = 1; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; @@ -4829,8 +4800,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -4863,7 +4833,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); unsigned long flags; - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); @@ -4881,7 +4850,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -4890,11 +4858,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock_irqsave(&cfs_b->lock, flags); - if (expires == cfs_b->runtime_expires) - lsub_positive(&cfs_b->runtime, runtime); + lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -4990,20 +4957,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); - new = (old * 147) / 128; /* ~115% */ - new = min(new, max_cfs_quota_period); - - cfs_b->period = ns_to_ktime(new); - - /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ - cfs_b->quota *= new; - cfs_b->quota = div64_u64(cfs_b->quota, old); - - pr_warn_ratelimited( - "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", - smp_processor_id(), - div_u64(new, NSEC_PER_USEC), - div_u64(cfs_b->quota, NSEC_PER_USEC)); + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. + */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } /* reset count so we don't come right back in here */ count = 0; @@ -5042,17 +5017,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - u64 overrun; - lockdep_assert_held(&cfs_b->lock); if (cfs_b->period_active) return; cfs_b->period_active = 1; - overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); - cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } @@ -5130,11 +5101,6 @@ static inline bool cfs_bandwidth_used(void) return false; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - return rq_clock_task(rq_of(cfs_rq)); -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -5230,7 +5196,7 @@ static inline unsigned long cpu_util(int cpu); static inline bool cpu_overutilized(int cpu) { - return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); + return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); } static inline void update_overutilized_status(struct rq *rq) @@ -5244,6 +5210,20 @@ static inline void update_overutilized_status(struct rq *rq) static inline void update_overutilized_status(struct rq *rq) { } #endif +/* Runqueue only has SCHED_IDLE tasks enqueued */ +static int sched_idle_rq(struct rq *rq) +{ + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + +#ifdef CONFIG_SMP +static int sched_idle_cpu(int cpu) +{ + return sched_idle_rq(cpu_rq(cpu)); +} +#endif + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5254,6 +5234,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); /* * The code below (indirectly) updates schedutil which looks at @@ -5286,6 +5267,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; flags = ENQUEUE_WAKEUP; } @@ -5293,6 +5275,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; if (cfs_rq_throttled(cfs_rq)) break; @@ -5354,6 +5337,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5368,6 +5353,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -5387,6 +5373,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; if (cfs_rq_throttled(cfs_rq)) break; @@ -5398,6 +5385,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5420,26 +5411,45 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -static unsigned long cpu_runnable_load(struct rq *rq) +static unsigned long cpu_load(struct rq *rq) { - return cfs_rq_runnable_load_avg(&rq->cfs); + return cfs_rq_load_avg(&rq->cfs); } -static unsigned long capacity_of(int cpu) +/* + * cpu_load_without - compute CPU load without any contributions from *p + * @cpu: the CPU which load is requested + * @p: the task which load should be discounted + * + * The load of a CPU is defined by the load of tasks currently enqueued on that + * CPU as well as tasks which are currently sleeping after an execution on that + * CPU. + * + * This method returns the load of the specified CPU by discounting the load of + * the specified task, whenever the task is currently contributing to the CPU + * load. + */ +static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) { - return cpu_rq(cpu)->cpu_capacity; -} + struct cfs_rq *cfs_rq; + unsigned int load; -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = cpu_runnable_load(rq); + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_load(rq); - if (nr_running) - return load_avg / nr_running; + cfs_rq = &rq->cfs; + load = READ_ONCE(cfs_rq->avg.load_avg); - return 0; + /* Discount task's util from CPU's util */ + lsub_positive(&load, task_h_load(p)); + + return load; +} + +static unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; } static void record_wakee(struct task_struct *p) @@ -5532,7 +5542,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); + this_eff_load = cpu_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5550,7 +5560,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); + prev_eff_load = cpu_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5588,149 +5598,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static unsigned long cpu_util_without(int cpu, struct task_struct *p); - -static unsigned long capacity_spare_without(int cpu, struct task_struct *p) -{ - return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - * - * Assumes p is allowed on at least one CPU in sd. - */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX; - unsigned long this_runnable_load = ULONG_MAX; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; - unsigned long most_spare = 0, this_spare = 0; - int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; - unsigned long imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - - do { - unsigned long load, avg_load, runnable_load; - unsigned long spare_cap, max_spare_cap; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_span(group), - p->cpus_ptr)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_span(group)); - - /* - * Tally up the load of all CPUs in the group and find - * the group containing the CPU with most spare capacity. - */ - avg_load = 0; - runnable_load = 0; - max_spare_cap = 0; - - for_each_cpu(i, sched_group_span(group)) { - load = cpu_runnable_load(cpu_rq(i)); - runnable_load += load; - - avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - - spare_cap = capacity_spare_without(i, p); - - if (spare_cap > max_spare_cap) - max_spare_cap = spare_cap; - } - - /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - - if (local_group) { - this_runnable_load = runnable_load; - this_avg_load = avg_load; - this_spare = max_spare_cap; - } else { - if (min_runnable_load > (runnable_load + imbalance)) { - /* - * The runnable load is significantly smaller - * so we can pick this new CPU: - */ - min_runnable_load = runnable_load; - min_avg_load = avg_load; - idlest = group; - } else if ((runnable_load < (min_runnable_load + imbalance)) && - (100*min_avg_load > imbalance_scale*avg_load)) { - /* - * The runnable loads are close so take the - * blocked load into account through avg_load: - */ - min_avg_load = avg_load; - idlest = group; - } - - if (most_spare < max_spare_cap) { - most_spare = max_spare_cap; - most_spare_sg = group; - } - } - } while (group = group->next, group != sd->groups); - - /* - * The cross-over point between using spare capacity or least load - * is too conservative for high utilization tasks on partially - * utilized systems if we require spare_capacity > task_util(p), - * so we allow for some task stuffing by using - * spare_capacity > task_util(p)/2. - * - * Spare capacity can't be used for fork because the utilization has - * not been set yet, we must first select a rq to compute the initial - * utilization. - */ - if (sd_flag & SD_BALANCE_FORK) - goto skip_spare; - - if (this_spare > task_util(p) / 2 && - imbalance_scale*this_spare > 100*most_spare) - return NULL; - - if (most_spare > task_util(p) / 2) - return most_spare_sg; - -skip_spare: - if (!idlest) - return NULL; - - /* - * When comparing groups across NUMA domains, it's possible for the - * local domain to be very lightly loaded relative to the remote - * domains but "imbalance" skews the comparison making remote CPUs - * look much more favourable. When considering cross-domain, add - * imbalance to the runnable load on the remote node and consider - * staying local. - */ - if ((sd->flags & SD_NUMA) && - min_runnable_load + imbalance >= this_runnable_load) - return NULL; - - if (min_runnable_load > (this_runnable_load + imbalance)) - return NULL; - - if ((this_runnable_load < (min_runnable_load + imbalance)) && - (100*this_avg_load < imbalance_scale*min_avg_load)) - return NULL; - - return idlest; -} + int this_cpu, int sd_flag); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5751,6 +5621,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (sched_idle_cpu(i)) + return i; + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5774,7 +5647,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = cpu_runnable_load(cpu_rq(i)); + load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; @@ -5794,7 +5667,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return prev_cpu; /* - * We need task's util for capacity_spare_without, sync it up to + * We need task's util for cpu_util_without, sync it up to * prev_cpu's last_update_time. */ if (!(sd_flag & SD_BALANCE_FORK)) @@ -5943,7 +5816,7 @@ static int select_idle_smt(struct task_struct *p, int target) for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; } @@ -5971,12 +5844,13 @@ static inline int select_idle_smt(struct task_struct *p, int target) */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu, nr = INT_MAX; int this = smp_processor_id(); + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6002,12 +5876,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = cpu_clock(this); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return -1; - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) - continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; } @@ -6027,21 +5901,36 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu; - if (available_idle_cpu(target)) + if (available_idle_cpu(target) || sched_idle_cpu(target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && + (available_idle_cpu(prev) || sched_idle_cpu(prev))) + return prev; + + /* + * Allow a per-cpu kthread to stack with the wakee if the + * kworker thread and the tasks previous CPUs are the same. + * The assumption is that the wakee queued work for the + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. + */ + if (is_per_cpu_kthread(current) && + prev == smp_processor_id() && + this_rq()->nr_running <= 1) { return prev; + } /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - available_idle_cpu(recent_used_cpu) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { /* * Replace recent_used_cpu with prev as it is a potential @@ -6277,69 +6166,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) } /* - * compute_energy(): Estimates the energy that would be consumed if @p was + * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of the * CPUs after the task migration, and uses the Energy Model + * landscape of @pd's CPUs after the task migration, and uses the Energy Model * to compute what would be the energy if we decided to actually migrate that * task. */ static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { - unsigned int max_util, util_cfs, cpu_util, cpu_cap; - unsigned long sum_util, energy = 0; - struct task_struct *tsk; + struct cpumask *pd_mask = perf_domain_span(pd); + unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); + unsigned long max_util = 0, sum_util = 0; int cpu; - for (; pd; pd = pd->next) { - struct cpumask *pd_mask = perf_domain_span(pd); + /* + * The capacity state of CPUs of the current rd can be driven by CPUs + * of another rd if they belong to the same pd. So, account for the + * utilization of these CPUs too by masking pd with cpu_online_mask + * instead of the rd span. + * + * If an entire pd is outside of the current rd, it will not appear in + * its pd list and will not be accounted by compute_energy(). + */ + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); + struct task_struct *tsk = cpu == dst_cpu ? p : NULL; /* - * The energy model mandates all the CPUs of a performance - * domain have the same capacity. + * Busy time computation: utilization clamping is not + * required since the ratio (sum_util / cpu_capacity) + * is already enough to scale the EM reported power + * consumption at the (eventually clamped) cpu_capacity. */ - cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); - max_util = sum_util = 0; + sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + ENERGY_UTIL, NULL); /* - * The capacity state of CPUs of the current rd can be driven by - * CPUs of another rd if they belong to the same performance - * domain. So, account for the utilization of these CPUs too - * by masking pd with cpu_online_mask instead of the rd span. - * - * If an entire performance domain is outside of the current rd, - * it will not appear in its pd list and will not be accounted - * by compute_energy(). + * Performance domain frequency: utilization clamping + * must be considered since it affects the selection + * of the performance domain frequency. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. */ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - util_cfs = cpu_util_next(cpu, p, dst_cpu); - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, - ENERGY_UTIL, NULL); - - /* - * Performance domain frequency: utilization clamping - * must be considered since it affects the selection - * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. - */ - tsk = cpu == dst_cpu ? p : NULL; - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); - } - - energy += em_pd_energy(pd->em_pd, max_util, sum_util); + cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } - return energy; + return em_pd_energy(pd->em_pd, max_util, sum_util); } /* @@ -6381,21 +6256,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path. */ - static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { - unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; + unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + unsigned long cpu_cap, util, base_energy = 0; int cpu, best_energy_cpu = prev_cpu; - struct perf_domain *head, *pd; - unsigned long cpu_cap, util; struct sched_domain *sd; + struct perf_domain *pd; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) goto fail; - head = pd; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6412,31 +6285,44 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) goto unlock; for (; pd; pd = pd->next) { - unsigned long cur_energy, spare_cap, max_spare_cap = 0; + unsigned long cur_delta, spare_cap, max_spare_cap = 0; + unsigned long base_energy_pd; int max_spare_cap_cpu = -1; + /* Compute the 'base' energy of the pd, without @p */ + base_energy_pd = compute_energy(p, -1, pd); + base_energy += base_energy_pd; + for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - /* Skip CPUs that will be overutilized. */ util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - if (cpu_cap * 1024 < util * capacity_margin) + spare_cap = cpu_cap - util; + + /* + * Skip CPUs that cannot satisfy the capacity request. + * IOW, placing the task there would make the CPU + * overutilized. Take uclamp into account to see how + * much capacity we can get out of the CPU; this is + * aligned with schedutil_cpu_util(). + */ + util = uclamp_rq_util_with(cpu_rq(cpu), util, p); + if (!fits_capacity(util, cpu_cap)) continue; /* Always use prev_cpu as a candidate. */ if (cpu == prev_cpu) { - prev_energy = compute_energy(p, prev_cpu, head); - best_energy = min(best_energy, prev_energy); - continue; + prev_delta = compute_energy(p, prev_cpu, pd); + prev_delta -= base_energy_pd; + best_delta = min(best_delta, prev_delta); } /* * Find the CPU with the maximum spare capacity in * the performance domain */ - spare_cap = cpu_cap - util; if (spare_cap > max_spare_cap) { max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; @@ -6444,10 +6330,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0) { - cur_energy = compute_energy(p, max_spare_cap_cpu, head); - if (cur_energy < best_energy) { - best_energy = cur_energy; + if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { + cur_delta = compute_energy(p, max_spare_cap_cpu, pd); + cur_delta -= base_energy_pd; + if (cur_delta < best_delta) { + best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; } } @@ -6459,10 +6346,10 @@ unlock: * Pick the best CPU if prev_cpu cannot be used, or if it saves at * least 6% of the energy used by prev_cpu. */ - if (prev_energy == ULONG_MAX) + if (prev_delta == ULONG_MAX) return best_energy_cpu; - if ((prev_energy - best_energy) > (prev_energy >> 4)) + if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) return best_energy_cpu; return prev_cpu; @@ -6616,6 +6503,15 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } + +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (rq->nr_running) + return 1; + + return newidle_balance(rq, rf) != 0; +} #endif /* CONFIG_SMP */ static unsigned long wakeup_gran(struct sched_entity *se) @@ -6783,7 +6679,7 @@ preempt: set_last_buddy(se); } -static struct task_struct * +struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -6792,11 +6688,11 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - if (!cfs_rq->nr_running) + if (!sched_fair_runnable(rq)) goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) + if (!prev || prev->sched_class != &fair_sched_class) goto simple; /* @@ -6873,8 +6769,8 @@ again: goto done; simple: #endif - - put_prev_task(rq, prev); + if (prev) + put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); @@ -6902,11 +6798,13 @@ done: __maybe_unused; return p; idle: - update_misfit_status(NULL, rq); - new_tasks = idle_balance(rq, rf); + if (!rf) + return NULL; + + new_tasks = newidle_balance(rq, rf); /* - * Because idle_balance() releases (and re-acquires) rq->lock, it is + * Because newidle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -6925,6 +6823,11 @@ idle: return NULL; } +static struct task_struct *__pick_next_task_fair(struct rq *rq) +{ + return pick_next_task_fair(rq, NULL, NULL); +} + /* * Account for a descheduled task: */ @@ -7114,11 +7017,49 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +/* + * 'group_type' describes the group of CPUs at the moment of load balancing. + * + * The enum is ordered by pulling priority, with the group with lowest priority + * first so the group_type can simply be compared when selecting the busiest + * group. See update_sd_pick_busiest(). + */ enum group_type { - group_other = 0, + /* The group has spare capacity that can be used to run more tasks. */ + group_has_spare = 0, + /* + * The group is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ + group_fully_busy, + /* + * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity + * and must be migrated to a more powerful CPU. + */ group_misfit_task, + /* + * SD_ASYM_PACKING only: One local CPU with higher capacity is available, + * and the task should be migrated to it instead of running on the + * current CPU. + */ + group_asym_packing, + /* + * The tasks' affinity constraints previously prevented the scheduler + * from balancing the load across the system. + */ group_imbalanced, - group_overloaded, + /* + * The CPU is overloaded and can't provide expected CPU cycles to all + * tasks. + */ + group_overloaded +}; + +enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, + migrate_misfit }; #define LBF_ALL_PINNED 0x01 @@ -7151,7 +7092,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; - enum group_type src_grp_type; + enum migration_type migration_type; struct list_head tasks; }; @@ -7374,7 +7315,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) static const unsigned int sched_nr_migrate_break = 32; /* - * detach_tasks() -- tries to detach up to imbalance runnable load from + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. @@ -7382,8 +7323,8 @@ static const unsigned int sched_nr_migrate_break = 32; static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; + unsigned long util, load; struct task_struct *p; - unsigned long load; int detached = 0; lockdep_assert_held(&env->src_rq->lock); @@ -7416,21 +7357,55 @@ static int detach_tasks(struct lb_env *env) if (!can_migrate_task(p, env)) goto next; - load = task_h_load(p); + switch (env->migration_type) { + case migrate_load: + load = task_h_load(p); - if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) - goto next; + if (sched_feat(LB_MIN) && + load < 16 && !env->sd->nr_balance_failed) + goto next; - if ((load / 2) > env->imbalance) - goto next; + /* + * Make sure that we don't migrate too much load. + * Nevertheless, let relax the constraint if + * scheduler fails to find a good waiting task to + * migrate. + */ + if (load/2 > env->imbalance && + env->sd->nr_balance_failed <= env->sd->cache_nice_tries) + goto next; + + env->imbalance -= load; + break; + + case migrate_util: + util = task_util_est(p); + + if (util > env->imbalance) + goto next; + + env->imbalance -= util; + break; + + case migrate_task: + env->imbalance--; + break; + + case migrate_misfit: + /* This is not a misfit task */ + if (task_fits_capacity(p, capacity_of(env->src_cpu))) + goto next; + + env->imbalance = 0; + break; + } detach_task(p, env); list_add(&p->se.group_node, &env->tasks); detached++; - env->imbalance -= load; -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize @@ -7442,7 +7417,7 @@ static int detach_tasks(struct lb_env *env) /* * We only want to steal up to the prescribed amount of - * runnable load. + * load/util/tasks. */ if (env->imbalance <= 0) break; @@ -7552,6 +7527,28 @@ static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif +static bool __update_blocked_others(struct rq *rq, bool *done) +{ + const struct sched_class *curr_class; + u64 now = rq_clock_pelt(rq); + bool decayed; + + /* + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, + * DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_irq_load_avg(rq, 0); + + if (others_have_blocked(rq)) + *done = false; + + return decayed; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7571,16 +7568,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) return true; } -static void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; - const struct sched_class *curr_class; - struct rq_flags rf; - bool done = true; - - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); + bool decayed = false; + int cpu = cpu_of(rq); /* * Iterates the task_group tree in a bottom up fashion, see @@ -7589,9 +7581,13 @@ static void update_blocked_averages(int cpu) for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; - if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq, 0); + if (cfs_rq == &rq->cfs) + decayed = true; + } + /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) @@ -7606,19 +7602,10 @@ static void update_blocked_averages(int cpu) /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) - done = false; + *done = false; } - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - /* Don't need periodic decay once load/util_avg are null */ - if (others_have_blocked(rq)) - done = false; - - update_blocked_load_status(rq, !done); - rq_unlock_irqrestore(rq, &rf); + return decayed; } /* @@ -7668,23 +7655,16 @@ static unsigned long task_h_load(struct task_struct *p) cfs_rq_load_avg(cfs_rq) + 1); } #else -static inline void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - const struct sched_class *curr_class; - struct rq_flags rf; + bool decayed; - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + if (cfs_rq_has_blocked(cfs_rq)) + *done = false; - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); - rq_unlock_irqrestore(rq, &rf); + return decayed; } static unsigned long task_h_load(struct task_struct *p) @@ -7693,6 +7673,24 @@ static unsigned long task_h_load(struct task_struct *p) } #endif +static void update_blocked_averages(int cpu) +{ + bool decayed = false, done = true; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + + decayed |= __update_blocked_others(rq, &done); + decayed |= __update_blocked_fair(rq, &done); + + update_blocked_load_status(rq, !done); + if (decayed) + cpufreq_update_util(rq, 0); + rq_unlock_irqrestore(rq, &rf); +} + /********** Helpers for find_busiest_group ************************/ /* @@ -7701,14 +7699,14 @@ static unsigned long task_h_load(struct task_struct *p) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ - unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned int sum_nr_running; /* Nr of tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_no_capacity; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -7723,10 +7721,10 @@ struct sg_lb_stats { struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ - unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* tasks should go to sibling first */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ struct sg_lb_stats local_stat; /* Statistics of the local group */ @@ -7737,19 +7735,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /* * Skimp on the clearing to avoid duplicate work. We can avoid clearing * local_stat because update_sg_lb_stats() does a full clear/assignment. - * We must however clear busiest_stat::avg_load because - * update_sd_pick_busiest() reads this before assignment. + * We must however set busiest_stat::group_type and + * busiest_stat::idle_cpus to the worst busiest group because + * update_sd_pick_busiest() reads these before assignment. */ *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, - .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { - .avg_load = 0UL, - .sum_nr_running = 0, - .group_type = group_other, + .idle_cpus = UINT_MAX, + .group_type = group_has_spare, }, }; } @@ -7820,29 +7817,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_span(sdg)) { - struct sched_group_capacity *sgc; - struct rq *rq = cpu_rq(cpu); - - /* - * build_sched_domains() -> init_sched_groups_capacity() - * gets here before we've attached the domains to the - * runqueues. - * - * Use capacity_of(), which is set irrespective of domains - * in update_cpu_capacity(). - * - * This avoids capacity from being 0 and - * causing divide-by-zero issues on boot. - */ - if (unlikely(!rq->sd)) { - capacity += capacity_of(cpu); - } else { - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; - } + unsigned long cpu_cap = capacity_of(cpu); - min_capacity = min(capacity, min_capacity); - max_capacity = max(capacity, max_capacity); + capacity += cpu_cap; + min_capacity = min(cpu_cap, min_capacity); + max_capacity = max(cpu_cap, max_capacity); } } else { /* @@ -7937,13 +7916,13 @@ static inline int sg_imbalanced(struct sched_group *group) * any benefit for the load balance. */ static inline bool -group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) +group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -7958,13 +7937,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) * false. */ static inline bool -group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -7977,8 +7956,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->min_capacity * capacity_margin < - ref->sgc->min_capacity * 1024; + return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); } /* @@ -7988,24 +7966,30 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) static inline bool group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->max_capacity * capacity_margin < - ref->sgc->max_capacity * 1024; + return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity); } static inline enum -group_type group_classify(struct sched_group *group, +group_type group_classify(unsigned int imbalance_pct, + struct sched_group *group, struct sg_lb_stats *sgs) { - if (sgs->group_no_capacity) + if (group_is_overloaded(imbalance_pct, sgs)) return group_overloaded; if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_asym_packing) + return group_asym_packing; + if (sgs->group_misfit_task_load) return group_misfit_task; - return group_other; + if (!group_has_capacity(imbalance_pct, sgs)) + return group_fully_busy; + + return group_has_spare; } static bool update_nohz_stats(struct rq *rq, bool force) @@ -8042,21 +8026,25 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sg_lb_stats *sgs, int *sg_status) { - int i, nr_running; + int i, nr_running, local_group; memset(sgs, 0, sizeof(*sgs)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - sgs->group_load += cpu_runnable_load(rq); + sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); - sgs->sum_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + if (nr_running > 1) *sg_status |= SG_OVERLOAD; @@ -8070,9 +8058,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* * No need to call idle_cpu() if nr_running is not 0 */ - if (!nr_running && idle_cpu(i)) + if (!nr_running && idle_cpu(i)) { sgs->idle_cpus++; + /* Idle cpu can't have misfit task */ + continue; + } + + if (local_group) + continue; + /* Check for a misfit task on the cpu */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; @@ -8080,17 +8075,24 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Check if dst CPU is idle and preferred to this group */ + if (env->sd->flags & SD_ASYM_PACKING && + env->idle != CPU_NOT_IDLE && + sgs->sum_h_nr_running && + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + sgs->group_asym_packing = 1; + } - if (sgs->sum_nr_running) - sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; + sgs->group_capacity = group->sgc->capacity; sgs->group_weight = group->group_weight; - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; } /** @@ -8113,6 +8115,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* Make sure that there is at least one task to pull */ + if (!sgs->sum_h_nr_running) + return false; + /* * Don't try to pull misfit tasks we can't help. * We can use max_capacity here as reduction in capacity on some @@ -8121,7 +8127,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, */ if (sgs->group_type == group_misfit_task && (!group_smaller_max_cpu_capacity(sg, sds->local) || - !group_has_capacity(env, &sds->local_stat))) + sds->local_stat.group_type != group_has_spare)) return false; if (sgs->group_type > busiest->group_type) @@ -8130,62 +8136,92 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; - if (sgs->avg_load <= busiest->avg_load) - return false; - - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) - goto asym_packing; - /* - * Candidate sg has no more than one task per CPU and - * has higher per-CPU capacity. Migrating tasks to less - * capable CPUs may harm throughput. Maximize throughput, - * power/energy consequences are not considered. + * The candidate and the current busiest group are the same type of + * group. Let check which one is the busiest according to the type. */ - if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_min_cpu_capacity(sds->local, sg)) - return false; - /* - * If we have more than one misfit sg go with the biggest misfit. - */ - if (sgs->group_type == group_misfit_task && - sgs->group_misfit_task_load < busiest->group_misfit_task_load) + switch (sgs->group_type) { + case group_overloaded: + /* Select the overloaded group with highest avg_load. */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_imbalanced: + /* + * Select the 1st imbalanced group as we don't have any way to + * choose one more than another. + */ return false; -asym_packing: - /* This is the busiest node in its class. */ - if (!(env->sd->flags & SD_ASYM_PACKING)) - return true; + case group_asym_packing: + /* Prefer to move from lowest priority CPU's work */ + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) + return false; + break; - /* No ASYM_PACKING if target CPU is already busy */ - if (env->idle == CPU_NOT_IDLE) - return true; - /* - * ASYM_PACKING needs to move all the work to the highest - * prority CPUs in the group, therefore mark all groups - * of lower priority than ourself as busy. - */ - if (sgs->sum_nr_running && - sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { - if (!sds->busiest) - return true; + case group_misfit_task: + /* + * If we have more than one misfit sg go with the biggest + * misfit. + */ + if (sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + break; - /* Prefer to move from lowest priority CPU's work */ - if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, - sg->asym_prefer_cpu)) - return true; + case group_fully_busy: + /* + * Select the fully busy group with highest avg_load. In + * theory, there is no need to pull task from such kind of + * group because tasks have all compute capacity that they need + * but we can still improve the overall throughput by reducing + * contention when accessing shared HW resources. + * + * XXX for now avg_load is not computed and always 0 so we + * select the 1st one. + */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_has_spare: + /* + * Select not overloaded group with lowest number of idle cpus + * and highest number of running tasks. We could also compare + * the spare capacity which is more stable but it can end up + * that the group has less spare capacity but finally more idle + * CPUs which means less opportunity to pull tasks. + */ + if (sgs->idle_cpus > busiest->idle_cpus) + return false; + else if ((sgs->idle_cpus == busiest->idle_cpus) && + (sgs->sum_nr_running <= busiest->sum_nr_running)) + return false; + + break; } - return false; + /* + * Candidate sg has no more than one task per CPU and has higher + * per-CPU capacity. Migrating tasks to less capable CPUs may harm + * throughput. Maximize throughput, power/energy consequences are not + * considered. + */ + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type <= group_fully_busy) && + (group_smaller_min_cpu_capacity(sds->local, sg))) + return false; + + return true; } #ifdef CONFIG_NUMA_BALANCING static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->nr_numa_running) + if (sgs->sum_h_nr_running > sgs->nr_numa_running) return regular; - if (sgs->sum_nr_running > sgs->nr_preferred_running) + if (sgs->sum_h_nr_running > sgs->nr_preferred_running) return remote; return all; } @@ -8210,18 +8246,314 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ + +struct sg_lb_stats; + +/* + * task_running_on_cpu - return 1 if @p is running on @cpu. + */ + +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return 0; + + if (task_on_rq_queued(p)) + return 1; + + return 0; +} + +/** + * idle_cpu_without - would a given CPU be idle without p ? + * @cpu: the processor on which idleness is tested. + * @p: task which should be ignored. + * + * Return: 1 if the CPU would be idle. 0 otherwise. + */ +static int idle_cpu_without(int cpu, struct task_struct *p) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle && rq->curr != p) + return 0; + + /* + * rq->nr_running can't be used but an updated version without the + * impact of p on cpu must be used instead. The updated nr_running + * be computed and tested before calling idle_cpu_without(). + */ + +#ifdef CONFIG_SMP + if (!llist_empty(&rq->wake_list)) + return 0; +#endif + + return 1; +} + +/* + * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. + * @sd: The sched_domain level to look for idlest group. + * @group: sched_group whose statistics are to be updated. + * @sgs: variable to hold the statistics for this group. + * @p: The task for which we look for the idlest group/CPU. + */ +static inline void update_sg_wakeup_stats(struct sched_domain *sd, + struct sched_group *group, + struct sg_lb_stats *sgs, + struct task_struct *p) +{ + int i, nr_running; + + memset(sgs, 0, sizeof(*sgs)); + + for_each_cpu(i, sched_group_span(group)) { + struct rq *rq = cpu_rq(i); + unsigned int local; + + sgs->group_load += cpu_load_without(rq, p); + sgs->group_util += cpu_util_without(i, p); + local = task_running_on_cpu(i, p); + sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + + nr_running = rq->nr_running - local; + sgs->sum_nr_running += nr_running; + + /* + * No need to call idle_cpu_without() if nr_running is not 0 + */ + if (!nr_running && idle_cpu_without(i, p)) + sgs->idle_cpus++; + + } + + /* Check if task fits in the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + !task_fits_capacity(p, group->sgc->max_capacity)) { + sgs->group_misfit_task_load = 1; + } + + sgs->group_capacity = group->sgc->capacity; + + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); + + /* + * Computing avg_load makes sense only when group is fully busy or + * overloaded + */ + if (sgs->group_type < group_fully_busy) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; +} + +static bool update_pick_idlest(struct sched_group *idlest, + struct sg_lb_stats *idlest_sgs, + struct sched_group *group, + struct sg_lb_stats *sgs) +{ + if (sgs->group_type < idlest_sgs->group_type) + return true; + + if (sgs->group_type > idlest_sgs->group_type) + return false; + + /* + * The candidate and the current idlest group are the same type of + * group. Let check which one is the idlest according to the type. + */ + + switch (sgs->group_type) { + case group_overloaded: + case group_fully_busy: + /* Select the group with lowest avg_load. */ + if (idlest_sgs->avg_load <= sgs->avg_load) + return false; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those types are not used in the slow wakeup path */ + return false; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (idlest->sgc->max_capacity >= group->sgc->max_capacity) + return false; + break; + + case group_has_spare: + /* Select group with most idle CPUs */ + if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + return false; + break; + } + + return true; +} + +/* + * find_idlest_group() finds and returns the least busy CPU group within the + * domain. + * + * Assumes p is allowed on at least one CPU in sd. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int sd_flag) +{ + struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; + struct sg_lb_stats local_sgs, tmp_sgs; + struct sg_lb_stats *sgs; + unsigned long imbalance; + struct sg_lb_stats idlest_sgs = { + .avg_load = UINT_MAX, + .group_type = group_overloaded, + }; + + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + int local_group; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_span(group), + p->cpus_ptr)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_span(group)); + + if (local_group) { + sgs = &local_sgs; + local = group; + } else { + sgs = &tmp_sgs; + } + + update_sg_wakeup_stats(sd, group, sgs, p); + + if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) { + idlest = group; + idlest_sgs = *sgs; + } + + } while (group = group->next, group != sd->groups); + + + /* There is no idlest group to push tasks to */ + if (!idlest) + return NULL; + + /* The local group has been skipped because of CPU affinity */ + if (!local) + return idlest; + + /* + * If the local group is idler than the selected idlest group + * don't try and push the task. + */ + if (local_sgs.group_type < idlest_sgs.group_type) + return NULL; + + /* + * If the local group is busier than the selected idlest group + * try and push the task. + */ + if (local_sgs.group_type > idlest_sgs.group_type) + return idlest; + + switch (local_sgs.group_type) { + case group_overloaded: + case group_fully_busy: + /* + * When comparing groups across NUMA domains, it's possible for + * the local domain to be very lightly loaded relative to the + * remote domains but "imbalance" skews the comparison making + * remote CPUs look much more favourable. When considering + * cross-domain, add imbalance to the load on the remote node + * and consider staying local. + */ + + if ((sd->flags & SD_NUMA) && + ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load)) + return NULL; + + /* + * If the local group is less loaded than the selected + * idlest group don't try and push any tasks. + */ + if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance)) + return NULL; + + if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load) + return NULL; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those type are not used in the slow wakeup path */ + return NULL; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (local->sgc->max_capacity >= idlest->sgc->max_capacity) + return NULL; + break; + + case group_has_spare: + if (sd->flags & SD_NUMA) { +#ifdef CONFIG_NUMA_BALANCING + int idlest_cpu; + /* + * If there is spare capacity at NUMA, try to select + * the preferred node + */ + if (cpu_to_node(this_cpu) == p->numa_preferred_nid) + return NULL; + + idlest_cpu = cpumask_first(sched_group_span(idlest)); + if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) + return idlest; +#endif + /* + * Otherwise, keep the task on this node to stay close + * its wakeup source and improve locality. If there is + * a real need of migration, periodic load balance will + * take care of it. + */ + if (local_sgs.idle_cpus) + return NULL; + } + + /* + * Select group with highest number of idle CPUs. We could also + * compare the utilization which is more stable but it can end + * up that the group has less spare capacity but finally more + * idle CPUs which means more opportunity to run task. + */ + if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus) + return NULL; + break; + } + + return idlest; +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. * @sds: variable to hold the statistics for this sched_domain. */ + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; int sg_status = 0; #ifdef CONFIG_NO_HZ_COMMON @@ -8248,22 +8580,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (local_group) goto next_group; - /* - * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity so that we'll try - * and move all the excess tasks away. We lower the capacity - * of a group only if the local group has the capacity to fit - * these excess tasks. The extra check prevents the case where - * you always pull from the heaviest group when it is already - * under-utilized (possible with a large weight task outweighs - * the tasks on the system). - */ - if (prefer_sibling && sds->local && - group_has_capacity(env, local) && - (sgs->sum_nr_running > local->sum_nr_running + 1)) { - sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); - } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -8272,13 +8588,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ - sds->total_running += sgs->sum_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); + /* Tag domain that child domain prefers tasks go to siblings first */ + sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + #ifdef CONFIG_NO_HZ_COMMON if ((env->flags & LBF_NOHZ_AGAIN) && cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { @@ -8309,203 +8627,177 @@ next_group: } /** - * check_asym_packing - Check to see if the group is packed into the - * sched domain. - * - * This is primarily intended to used at the sibling level. Some - * cores like POWER7 prefer to use lower numbered SMT threads. In the - * case of POWER7, it can move to lower SMT modes only when higher - * threads are idle. When in lower SMT modes, the threads will - * perform better since they share less core resources. Hence when we - * have idle threads, we want them to be the higher ones. - * - * This packing function is run on idle threads. It checks to see if - * the busiest CPU in this domain (core in the P7 case) has a higher - * CPU number than the packing function is being run on. Here we are - * assuming lower CPU number will be equivalent to lower a SMT thread - * number. - * - * Return: 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in env->imbalance. - * - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain which is to be packed - */ -static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) -{ - int busiest_cpu; - - if (!(env->sd->flags & SD_ASYM_PACKING)) - return 0; - - if (env->idle == CPU_NOT_IDLE) - return 0; - - if (!sds->busiest) - return 0; - - busiest_cpu = sds->busiest->asym_prefer_cpu; - if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) - return 0; - - env->imbalance = sds->busiest_stat.group_load; - - return 1; -} - -/** - * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * calculate_imbalance - Calculate the amount of imbalance present within the + * groups of a given sched_domain during load balance. + * @env: load balance environment + * @sds: statistics of the sched_domain whose imbalance is to be calculated. */ -static inline -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) +static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, capa_now = 0, capa_move = 0; - unsigned int imbn = 2; - unsigned long scaled_busy_load_per_task; struct sg_lb_stats *local, *busiest; local = &sds->local_stat; busiest = &sds->busiest_stat; - if (!local->sum_nr_running) - local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); - else if (busiest->load_per_task > local->load_per_task) - imbn = 1; + if (busiest->group_type == group_misfit_task) { + /* Set imbalance to allow misfit tasks to be balanced. */ + env->migration_type = migrate_misfit; + env->imbalance = 1; + return; + } - scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - busiest->group_capacity; + if (busiest->group_type == group_asym_packing) { + /* + * In case of asym capacity, we will try to migrate all load to + * the preferred CPU. + */ + env->migration_type = migrate_task; + env->imbalance = busiest->sum_h_nr_running; + return; + } - if (busiest->avg_load + scaled_busy_load_per_task >= - local->avg_load + (scaled_busy_load_per_task * imbn)) { - env->imbalance = busiest->load_per_task; + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages + * to ensure CPU-load equilibrium, try to move any task to fix + * the imbalance. The next load balance will take care of + * balancing back the system. + */ + env->migration_type = migrate_task; + env->imbalance = 1; return; } /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU capacity used by - * moving them. + * Try to use spare capacity of local group without overloading it or + * emptying busiest. */ + if (local->group_type == group_has_spare) { + if (busiest->group_type > group_fully_busy) { + /* + * If busiest is overloaded, try to fill spare + * capacity. This might end up creating spare capacity + * in busiest or busiest still being overloaded but + * there is no simple way to directly compute the + * amount of load to migrate in order to balance the + * system. + */ + env->migration_type = migrate_util; + env->imbalance = max(local->group_capacity, local->group_util) - + local->group_util; - capa_now += busiest->group_capacity * - min(busiest->load_per_task, busiest->avg_load); - capa_now += local->group_capacity * - min(local->load_per_task, local->avg_load); - capa_now /= SCHED_CAPACITY_SCALE; + /* + * In some cases, the group's utilization is max or even + * higher than capacity because of migrations but the + * local CPU is (newly) idle. There is at least one + * waiting task in this overloaded busiest group. Let's + * try to pull it. + */ + if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + env->migration_type = migrate_task; + env->imbalance = 1; + } - /* Amount of load we'd subtract */ - if (busiest->avg_load > scaled_busy_load_per_task) { - capa_move += busiest->group_capacity * - min(busiest->load_per_task, - busiest->avg_load - scaled_busy_load_per_task); - } + return; + } - /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_CAPACITY_SCALE) { - tmp = (busiest->avg_load * busiest->group_capacity) / - local->group_capacity; - } else { - tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - local->group_capacity; - } - capa_move += local->group_capacity * - min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_CAPACITY_SCALE; + if (busiest->group_weight == 1 || sds->prefer_sibling) { + unsigned int nr_diff = busiest->sum_nr_running; + /* + * When prefer sibling, evenly spread running tasks on + * groups. + */ + env->migration_type = migrate_task; + lsub_positive(&nr_diff, local->sum_nr_running); + env->imbalance = nr_diff >> 1; + } else { - /* Move if we gain throughput */ - if (capa_move > capa_now) - env->imbalance = busiest->load_per_task; -} + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - + busiest->idle_cpus) >> 1); + } -/** - * calculate_imbalance - Calculate the amount of imbalance present within the - * groups of a given sched_domain during load balance. - * @env: load balance environment - * @sds: statistics of the sched_domain whose imbalance is to be calculated. - */ -static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) -{ - unsigned long max_pull, load_above_capacity = ~0UL; - struct sg_lb_stats *local, *busiest; + /* Consider allowing a small imbalance between NUMA groups */ + if (env->sd->flags & SD_NUMA) { + unsigned int imbalance_min; - local = &sds->local_stat; - busiest = &sds->busiest_stat; + /* + * Compute an allowed imbalance based on a simple + * pair of communicating tasks that should remain + * local and ignore them. + * + * NOTE: Generally this would have been based on + * the domain size and this was evaluated. However, + * the benefit is similar across a range of workloads + * and machines but scaling by the domain size adds + * the risk that lower domains have to be rebalanced. + */ + imbalance_min = 2; + if (busiest->sum_nr_running <= imbalance_min) + env->imbalance = 0; + } - if (busiest->group_type == group_imbalanced) { - /* - * In the group_imb case we cannot rely on group-wide averages - * to ensure CPU-load equilibrium, look at wider averages. XXX - */ - busiest->load_per_task = - min(busiest->load_per_task, sds->avg_load); + return; } /* - * Avg load of busiest sg can be less and avg load of local sg can - * be greater than avg load across all sgs of sd because avg load - * factors in sg capacity and sgs with smaller group_type are - * skipped when updating the busiest sg: + * Local is fully busy but has to take more load to relieve the + * busiest group */ - if (busiest->group_type != group_misfit_task && - (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load)) { - env->imbalance = 0; - return fix_small_imbalance(env, sds); - } + if (local->group_type < group_overloaded) { + /* + * Local will become overloaded so the avg_load metrics are + * finally needed. + */ - /* - * If there aren't any idle CPUs, avoid creating some. - */ - if (busiest->group_type == group_overloaded && - local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; - if (load_above_capacity > busiest->group_capacity) { - load_above_capacity -= busiest->group_capacity; - load_above_capacity *= scale_load_down(NICE_0_LOAD); - load_above_capacity /= busiest->group_capacity; - } else - load_above_capacity = ~0UL; + local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / + local->group_capacity; + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* - * We're trying to get all the CPUs to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded CPU below the average load. At the same time, - * we also don't want to reduce the group load below the group - * capacity. Thus we look for the minimum possible imbalance. + * Both group are or will become overloaded and we're trying to get all + * the CPUs to the average_load, so we don't want to push ourselves + * above the average load, nor do we wish to reduce the max loaded CPU + * below the average load. At the same time, we also don't want to + * reduce the group load below the group capacity. Thus we look for + * the minimum possible imbalance. */ - max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); - - /* How much load to actually move to equalise the imbalance */ + env->migration_type = migrate_load; env->imbalance = min( - max_pull * busiest->group_capacity, + (busiest->avg_load - sds->avg_load) * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; - - /* Boost imbalance to allow misfit task to be balanced. */ - if (busiest->group_type == group_misfit_task) { - env->imbalance = max_t(long, env->imbalance, - busiest->group_misfit_task_load); - } - - /* - * if *imbalance is less than the average load per runnable task - * there is no guarantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (env->imbalance < busiest->load_per_task) - return fix_small_imbalance(env, sds); } /******* find_busiest_group() helpers end here *********************/ +/* + * Decision matrix according to the local and busiest group type: + * + * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded + * has_spare nr_idle balanced N/A N/A balanced balanced + * fully_busy nr_idle nr_idle N/A N/A balanced balanced + * misfit_task force N/A N/A N/A force force + * asym_packing force force N/A N/A force force + * imbalanced force force N/A N/A force force + * overloaded force force N/A N/A force avg_load + * + * N/A : Not Applicable because already filtered while updating + * statistics. + * balanced : The system is balanced for these 2 groups. + * force : Calculate the imbalance as load migration is probably needed. + * avg_load : Only if imbalance is significant enough. + * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite + * different in groups. + */ + /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. @@ -8525,7 +8817,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) init_sd_lb_stats(&sds); /* - * Compute the various statistics relavent for load balancing at + * Compute the various statistics relevant for load balancing at * this level. */ update_sd_lb_stats(env, &sds); @@ -8540,17 +8832,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; - /* ASYM feature bypasses nice load balance check */ - if (check_asym_packing(env, &sds)) - return sds.busiest; - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || busiest->sum_nr_running == 0) + if (!sds.busiest) goto out_balanced; - /* XXX broken for overlapping NUMA groups */ - sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) - / sds.total_capacity; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + + /* ASYM feature bypasses nice load balance check */ + if (busiest->group_type == group_asym_packing) + goto force_balance; /* * If the busiest group is imbalanced the below checks don't @@ -8561,55 +8853,80 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* - * When dst_cpu is idle, prevent SMP nice and/or asymmetric group - * capacities from resulting in underutilization due to avg_load. - */ - if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && - busiest->group_no_capacity) - goto force_balance; - - /* Misfit tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) - goto force_balance; - - /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ - if (local->avg_load >= busiest->avg_load) + if (local->group_type > busiest->group_type) goto out_balanced; /* - * Don't pull any tasks if this group is already above the domain - * average load. + * When groups are overloaded, use the avg_load to ensure fairness + * between tasks. */ - if (local->avg_load >= sds.avg_load) - goto out_balanced; + if (local->group_type == group_overloaded) { + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) + goto out_balanced; + + /* XXX broken for overlapping NUMA groups */ + sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / + sds.total_capacity; - if (env->idle == CPU_IDLE) { /* - * This CPU is idle. If the busiest group is not overloaded - * and there is no imbalance between this and busiest group - * wrt idle CPUs, it is balanced. The imbalance becomes - * significant if the diff is greater than 1 otherwise we - * might end up to just move the imbalance on another group + * Don't pull any tasks if this group is already above the + * domain average load. */ - if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + if (local->avg_load >= sds.avg_load) goto out_balanced; - } else { + /* - * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use - * imbalance_pct to be conservative. + * If the busiest group is more loaded, use imbalance_pct to be + * conservative. */ if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; } + /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_nr_running > local->sum_nr_running + 1) + goto force_balance; + + if (busiest->group_type != group_overloaded) { + if (env->idle == CPU_NOT_IDLE) + /* + * If the busiest group is not overloaded (and as a + * result the local one too) but this CPU is already + * busy, let another idle CPU try to pull task. + */ + goto out_balanced; + + if (busiest->group_weight > 1 && + local->idle_cpus <= (busiest->idle_cpus + 1)) + /* + * If the busiest group is not overloaded + * and there is no imbalance between this and busiest + * group wrt idle CPUs, it is balanced. The imbalance + * becomes significant if the diff is greater than 1 + * otherwise we might end up to just move the imbalance + * on another group. Of course this applies only if + * there is more than 1 CPU per group. + */ + goto out_balanced; + + if (busiest->sum_h_nr_running == 1) + /* + * busiest doesn't have any tasks waiting to run + */ + goto out_balanced; + } + force_balance: /* Looks like there is an imbalance. Compute it */ - env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -8625,11 +8942,13 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_capacity = 1; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, load; + unsigned long capacity, load, util; + unsigned int nr_running; enum fbq_type rt; rq = cpu_rq(i); @@ -8657,20 +8976,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - /* - * For ASYM_CPUCAPACITY domains with misfit tasks we simply - * seek the "biggest" misfit task. - */ - if (env->src_grp_type == group_misfit_task) { - if (rq->misfit_task_load > busiest_load) { - busiest_load = rq->misfit_task_load; - busiest = rq; - } - - continue; - } - capacity = capacity_of(i); + nr_running = rq->cfs.h_nr_running; /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -8680,35 +8987,69 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && - rq->nr_running == 1) + nr_running == 1) continue; - load = cpu_runnable_load(rq); + switch (env->migration_type) { + case migrate_load: + /* + * When comparing with load imbalance, use cpu_load() + * which is not scaled with the CPU capacity. + */ + load = cpu_load(rq); - /* - * When comparing with imbalance, use cpu_runnable_load() - * which is not scaled with the CPU capacity. - */ + if (nr_running == 1 && load > env->imbalance && + !check_cpu_capacity(rq, env->sd)) + break; - if (rq->nr_running == 1 && load > env->imbalance && - !check_cpu_capacity(rq, env->sd)) - continue; + /* + * For the load comparisons with the other CPUs, + * consider the cpu_load() scaled with the CPU + * capacity, so that the load can be moved away + * from the CPU that is potentially running at a + * lower capacity. + * + * Thus we're looking for max(load_i / capacity_i), + * crosswise multiplication to rid ourselves of the + * division works out to: + * load_i * capacity_j > load_j * capacity_i; + * where j is our previous maximum. + */ + if (load * busiest_capacity > busiest_load * capacity) { + busiest_load = load; + busiest_capacity = capacity; + busiest = rq; + } + break; + + case migrate_util: + util = cpu_util(cpu_of(rq)); + + if (busiest_util < util) { + busiest_util = util; + busiest = rq; + } + break; + + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; + busiest = rq; + } + break; + + case migrate_misfit: + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we + * simply seek the "biggest" misfit task. + */ + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + break; - /* - * For the load comparisons with the other CPU's, consider - * the cpu_runnable_load() scaled with the CPU capacity, so - * that the load can be moved away from the CPU that is - * potentially running at a lower capacity. - * - * Thus we're looking for max(load_i / capacity_i), crosswise - * multiplication to rid ourselves of the division works out - * to: load_i * capacity_j > load_j * capacity_i; where j is - * our previous maximum. - */ - if (load * busiest_capacity > busiest_load * capacity) { - busiest_load = load; - busiest_capacity = capacity; - busiest = rq; } } @@ -8754,7 +9095,7 @@ voluntary_active_balance(struct lb_env *env) return 1; } - if (env->src_grp_type == group_misfit_task) + if (env->migration_type == migrate_misfit) return 1; return 0; @@ -9047,9 +9388,10 @@ more_balance: out_balanced: /* * We reach balance although we may have faced some affinity - * constraints. Clear the imbalance flag if it was set. + * constraints. Clear the imbalance flag only if other tasks got + * a chance to move and fix the imbalance. */ - if (sd_parent) { + if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if (*group_imbalance) @@ -9070,10 +9412,10 @@ out_one_pinned: ld_moved = 0; /* - * idle_balance() disregards balance intervals, so we could repeatedly - * reach this code, which would lead to balance_interval skyrocketting - * in a short amount of time. Skip the balance_interval increase logic - * to avoid that. + * newidle_balance() disregards balance intervals, so we could + * repeatedly reach this code, which would lead to balance_interval + * skyrocketting in a short amount of time. Skip the balance_interval + * increase logic to avoid that. */ if (env.idle == CPU_NEWLY_IDLE) goto out; @@ -9227,6 +9569,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; + int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -9263,7 +9606,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { @@ -9279,9 +9622,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); } if (need_serialize) spin_unlock(&balancing); @@ -9782,8 +10126,13 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. + * + * Returns: + * < 0 - we released the lock and there are !fair tasks present + * 0 - failed, no new tasks + * > 0 - success, new (fair) tasks present */ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -9791,6 +10140,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -10025,6 +10375,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; + if (rq->cfs.nr_running == 1) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -10115,7 +10468,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } @@ -10175,9 +10528,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. */ -static void set_curr_task_fair(struct rq *rq) +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { - struct sched_entity *se = &rq->curr->se; + struct sched_entity *se = &p->se; + +#ifdef CONFIG_SMP + if (task_on_rq_queued(p)) { + /* + * Move the next running task to the front of the list, so our + * cfs_tasks list becomes MRU one. + */ + list_move(&se->group_node, &rq->cfs_tasks); + } +#endif for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10295,18 +10658,18 @@ err: void online_fair_sched_group(struct task_group *tg) { struct sched_entity *se; + struct rq_flags rf; struct rq *rq; int i; for_each_possible_cpu(i) { rq = cpu_rq(i); se = tg->se[i]; - - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } } @@ -10447,10 +10810,12 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, - .pick_next_task = pick_next_task_fair, + .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP + .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -10461,7 +10826,6 @@ const struct sched_class fair_sched_class = { .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, |