diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 628 |
1 files changed, 345 insertions, 283 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 17de1956ddad..bfa3c86d0d68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long power_of(int cpu); +static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ @@ -1026,11 +1026,11 @@ struct numa_stats { unsigned long load; /* Total compute capacity of CPUs on a node */ - unsigned long power; + unsigned long compute_capacity; /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long capacity; - int has_capacity; + unsigned long task_capacity; + int has_free_capacity; }; /* @@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); - ns->power += power_of(cpu); + ns->compute_capacity += capacity_of(cpu); cpus++; } @@ -1056,15 +1056,15 @@ static void update_numa_stats(struct numa_stats *ns, int nid) * the @ns structure is NULL'ed and task_numa_compare() will * not find this node attractive. * - * We'll either bail at !has_capacity, or we'll detect a huge imbalance - * and bail there. + * We'll either bail at !has_free_capacity, or we'll detect a huge + * imbalance and bail there. */ if (!cpus) return; - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; - ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->capacity); + ns->task_capacity = + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { @@ -1095,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } -static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, - long src_load, long dst_load, +static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { long imb, old_imb; + long orig_src_load, orig_dst_load; + long src_capacity, dst_capacity; + + /* + * The load is corrected for the CPU capacity available on each node. + * + * src_load dst_load + * ------------ vs --------- + * src_capacity dst_capacity + */ + src_capacity = env->src_stats.compute_capacity; + dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ if (dst_load < src_load) swap(dst_load, src_load); /* Is the difference below the threshold? */ - imb = dst_load * 100 - src_load * env->imbalance_pct; + imb = dst_load * src_capacity * 100 - + src_load * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; @@ -1114,13 +1126,17 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, * The imbalance is above the allowed threshold. * Compare it with the old imbalance. */ + orig_src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + if (orig_dst_load < orig_src_load) swap(orig_dst_load, orig_src_load); - old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + old_imb = orig_dst_load * src_capacity * 100 - + orig_src_load * dst_capacity * env->imbalance_pct; /* Would this change make things worse? */ - return (old_imb > imb); + return (imb > old_imb); } /* @@ -1135,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long orig_src_load, src_load; - long orig_dst_load, dst_load; + long src_load, dst_load; long load; - long imp = (groupimp > 0) ? groupimp : taskimp; + long imp = env->p->numa_group ? groupimp : taskimp; + long moveimp = imp; rcu_read_lock(); cur = ACCESS_ONCE(dst_rq->curr); @@ -1176,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env, * itself (not part of a group), use the task weight * instead. */ - if (env->p->numa_group) - imp = groupimp; - else - imp = taskimp; - if (cur->numa_group) imp += group_weight(cur, env->src_nid) - group_weight(cur, env->dst_nid); @@ -1190,33 +1201,47 @@ static void task_numa_compare(struct task_numa_env *env, } } - if (imp < env->best_imp) + if (imp <= env->best_imp && moveimp <= env->best_imp) goto unlock; if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_capacity && - !env->dst_stats.has_capacity) + if (env->src_stats.has_free_capacity && + !env->dst_stats.has_free_capacity) goto unlock; goto balance; } /* Balance doesn't matter much if we're running a task per cpu */ - if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + if (imp > env->best_imp && src_rq->nr_running == 1 && + dst_rq->nr_running == 1) goto assign; /* * In the overloaded case, try and keep the load balanced. */ balance: - orig_dst_load = env->dst_stats.load; - orig_src_load = env->src_stats.load; - - /* XXX missing power terms */ load = task_h_load(env->p); - dst_load = orig_dst_load + load; - src_load = orig_src_load - load; + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + + if (moveimp > imp && moveimp > env->best_imp) { + /* + * If the improvement from just moving env->p direction is + * better than swapping tasks around, check if a move is + * possible. Store a slightly smaller score than moveimp, + * so an actually idle CPU will win. + */ + if (!load_too_imbalanced(src_load, dst_load, env)) { + imp = moveimp - 1; + cur = NULL; + goto assign; + } + } + + if (imp <= env->best_imp) + goto unlock; if (cur) { load = task_h_load(cur); @@ -1224,8 +1249,7 @@ balance: src_load += load; } - if (load_too_imbalanced(orig_src_load, orig_dst_load, - src_load, dst_load, env)) + if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; assign: @@ -1301,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p) groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has capacity, try to use it. */ - if (env.dst_stats.has_capacity) - task_numa_find_cpu(&env, taskimp, groupimp); + /* Try to find a spot on the preferred nid. */ + task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. */ if (env.best_cpu == -1) { @@ -1323,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p) } } - /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) - return -EAGAIN; - /* * If the task is part of a workload that spans multiple NUMA nodes, * and is migrating into one of the workload's active nodes, remember @@ -1335,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) - sched_setnuma(p, env.dst_nid); + if (p->numa_group) { + if (env.best_cpu == -1) + nid = env.src_nid; + else + nid = env.dst_nid; + + if (node_isset(nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); + } + + /* No better CPU than the current one was found. */ + if (env.best_cpu == -1) + return -EAGAIN; /* * Reset the scan period if the task is being rescheduled on an @@ -1414,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease + * period will be for the next scan window. If local/(local+remote) ratio is + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) + * the scan period will decrease. Aim for 70% local accesses. */ #define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 +#define NUMA_PERIOD_THRESHOLD 7 /* * Increase the scan period (slow down scanning) if the majority of @@ -1594,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); - /* - * If the preferred task and group nids are different, - * iterate over the nodes again to find the best place. - */ - if (max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; - } - } - } - spin_unlock_irq(group_lock); + max_nid = max_group_nid; } - /* Preferred node as the node with the most faults */ - if (max_faults && max_nid != p->numa_preferred_nid) { - /* Update the preferred nid and migrate task if possible */ - sched_setnuma(p, max_nid); - numa_migrate_preferred(p); + if (max_faults) { + /* Set the new preferred node */ + if (max_nid != p->numa_preferred_nid) + sched_setnuma(p, max_nid); + + if (task_node(p) != p->numa_preferred_nid) + numa_migrate_preferred(p); } } @@ -2898,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -2922,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static void @@ -3062,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); return; } /* @@ -3225,10 +3242,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. + * whether the global deadline has advanced. It is valid to compare + * cfs_b->runtime_expires without any locks since we only care about + * exact equality, so a partial write will still work. */ - if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { @@ -3251,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static __always_inline @@ -3357,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + /* + * Add to the _head_ of the list, so that an already-started + * distribute_cfs_runtime will not see us + */ + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); @@ -3407,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_task(rq->curr); + resched_curr(rq); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining, u64 expires) { struct cfs_rq *cfs_rq; - u64 runtime = remaining; + u64 runtime; + u64 starting_runtime = remaining; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -3445,7 +3469,7 @@ next: } rcu_read_unlock(); - return remaining; + return starting_runtime - remaining; } /* @@ -3457,21 +3481,21 @@ next: static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { u64 runtime, runtime_expires; - int idle = 1, throttled; + int throttled; - raw_spin_lock(&cfs_b->lock); /* no need to continue the timer with no bandwidth constraint */ if (cfs_b->quota == RUNTIME_INF) - goto out_unlock; + goto out_deactivate; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - /* idle depends on !throttled (for the case of a large deficit) */ - idle = cfs_b->idle && !throttled; cfs_b->nr_periods += overrun; - /* if we're going inactive then everything else can be deferred */ - if (idle) - goto out_unlock; + /* + * idle depends on !throttled (for the case of a large deficit), and if + * we're going inactive then everything else can be deferred + */ + if (cfs_b->idle && !throttled) + goto out_deactivate; /* * if we have relooped after returning idle once, we need to update our @@ -3485,28 +3509,23 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; - goto out_unlock; + return 0; } /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - /* - * There are throttled entities so we must first use the new bandwidth - * to unthrottle them before making it generally available. This - * ensures that all existing debts will be paid before a new cfs_rq is - * allowed to run. - */ - runtime = cfs_b->runtime; runtime_expires = cfs_b->runtime_expires; - cfs_b->runtime = 0; /* - * This check is repeated as we are holding onto the new bandwidth - * while we unthrottle. This can potentially race with an unthrottled - * group trying to acquire new bandwidth from the global pool. + * This check is repeated as we are holding onto the new bandwidth while + * we unthrottle. This can potentially race with an unthrottled group + * trying to acquire new bandwidth from the global pool. This can result + * in us over-using our runtime if it is all used during this loop, but + * only by limited amounts in that extreme case. */ - while (throttled && runtime > 0) { + while (throttled && cfs_b->runtime > 0) { + runtime = cfs_b->runtime; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, @@ -3514,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) raw_spin_lock(&cfs_b->lock); throttled = !list_empty(&cfs_b->throttled_cfs_rq); + + cfs_b->runtime -= min(runtime, cfs_b->runtime); } - /* return (any) remaining runtime */ - cfs_b->runtime = runtime; /* * While we are ensured activity in the period following an * unthrottle, this also covers the case in which the new bandwidth is @@ -3525,12 +3544,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * timer to remain active while there are any throttled entities.) */ cfs_b->idle = 0; -out_unlock: - if (idle) - cfs_b->timer_active = 0; - raw_spin_unlock(&cfs_b->lock); - return idle; + return 0; + +out_deactivate: + cfs_b->timer_active = 0; + return 1; } /* a cfs_rq won't donate quota below this amount */ @@ -3628,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; } - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - cfs_b->runtime = 0; - } + expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -3642,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) - cfs_b->runtime = runtime; + cfs_b->runtime -= min(runtime, cfs_b->runtime); raw_spin_unlock(&cfs_b->lock); } @@ -3707,6 +3725,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) int overrun; int idle = 0; + raw_spin_lock(&cfs_b->lock); for (;;) { now = hrtimer_cb_get_time(timer); overrun = hrtimer_forward(timer, now, cfs_b->period); @@ -3716,6 +3735,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) idle = do_sched_cfs_period_timer(cfs_b, overrun); } + raw_spin_unlock(&cfs_b->lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -3770,13 +3790,24 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } -static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) +static void __maybe_unused update_runtime_enabled(struct rq *rq) { struct cfs_rq *cfs_rq; for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + + raw_spin_lock(&cfs_b->lock); + cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; + raw_spin_unlock(&cfs_b->lock); + } +} +static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { if (!cfs_rq->runtime_enabled) continue; @@ -3784,7 +3815,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * clock_task is not advancing so we just need to make sure * there's some valid quota amount */ - cfs_rq->runtime_remaining = cfs_b->quota; + cfs_rq->runtime_remaining = 1; + /* + * Offline rq is schedulable till cpu is completely disabled + * in take_cpu_down(), so we prevent new cfs throttling here. + */ + cfs_rq->runtime_enabled = 0; + if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } @@ -3828,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -3851,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_task(p); + resched_curr(rq); return; } @@ -4041,9 +4079,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long power_of(int cpu) +static unsigned long capacity_of(int cpu) { - return cpu_rq(cpu)->cpu_power; + return cpu_rq(cpu)->cpu_capacity; } static unsigned long cpu_avg_load_per_task(int cpu) @@ -4065,7 +4103,7 @@ static void record_wakee(struct task_struct *p) * about the boundary, really active task won't care * about the loss. */ - if (jiffies > current->wakee_flip_decay_ts + HZ) { + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } @@ -4286,12 +4324,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) s64 this_eff_load, prev_eff_load; this_eff_load = 100; - this_eff_load *= power_of(prev_cpu); + this_eff_load *= capacity_of(prev_cpu); this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= power_of(this_cpu); + prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); balanced = this_eff_load <= prev_eff_load; @@ -4367,8 +4405,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; } - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + /* Adjust by relative CPU capacity of the group */ + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -4720,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_task(curr); + resched_curr(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -4948,14 +4986,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * - * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * C_i is the compute capacity of cpu i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * * To achieve this balance we define a measure of imbalance which follows * directly from (1): * - * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) * * We them move tasks around to minimize the imbalance. In the continuous * function space it is obvious this converges, in the discrete case we get @@ -5091,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) /* * Is this task likely cache-hot: */ -static int -task_hot(struct task_struct *p, u64 now) +static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; @@ -5105,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -5115,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now) if (sysctl_sched_migration_cost == 0) return 0; - delta = now - p->se.exec_start; + delta = rq_clock_task(env->src_rq) - p->se.exec_start; return delta < (s64)sysctl_sched_migration_cost; } @@ -5269,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); + tsk_cache_hot = task_hot(p, env); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); @@ -5530,13 +5567,13 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; - unsigned long group_power; + unsigned long group_capacity; unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity; + unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ - int group_has_capacity; /* Is there extra capacity in the group? */ + int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5551,7 +5588,7 @@ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -5570,7 +5607,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .busiest = NULL, .local = NULL, .total_load = 0UL, - .total_pwr = 0UL, + .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, }, @@ -5605,17 +5642,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { - return SCHED_POWER_SCALE; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { - return default_scale_freq_power(sd, cpu); + return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -5625,12 +5662,12 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) return smt_gain; } -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) { - return default_scale_smt_power(sd, cpu); + return default_scale_smt_capacity(sd, cpu); } -static unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; @@ -5650,71 +5687,71 @@ static unsigned long scale_rt_power(int cpu) total = sched_avg_period() + delta; if (unlikely(total < avg)) { - /* Ensures that power won't end up being negative */ + /* Ensures that capacity won't end up being negative */ available = 0; } else { available = total - avg; } - if (unlikely((s64)total < SCHED_POWER_SCALE)) - total = SCHED_POWER_SCALE; + if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) + total = SCHED_CAPACITY_SCALE; - total >>= SCHED_POWER_SHIFT; + total >>= SCHED_CAPACITY_SHIFT; return div_u64(available, total); } -static void update_cpu_power(struct sched_domain *sd, int cpu) +static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) - power *= arch_scale_smt_power(sd, cpu); + if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_smt_capacity(sd, cpu); else - power *= default_scale_smt_power(sd, cpu); + capacity *= default_scale_smt_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; } - sdg->sgp->power_orig = power; + sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_freq_capacity(sd, cpu); else - power *= default_scale_freq_power(sd, cpu); + capacity *= default_scale_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; - power *= scale_rt_power(cpu); - power >>= SCHED_POWER_SHIFT; + capacity *= scale_rt_capacity(cpu); + capacity >>= SCHED_CAPACITY_SHIFT; - if (!power) - power = 1; + if (!capacity) + capacity = 1; - cpu_rq(cpu)->cpu_power = power; - sdg->sgp->power = power; + cpu_rq(cpu)->cpu_capacity = capacity; + sdg->sgc->capacity = capacity; } -void update_group_power(struct sched_domain *sd, int cpu) +void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power, power_orig; + unsigned long capacity, capacity_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval); - sdg->sgp->next_update = jiffies + interval; + sdg->sgc->next_update = jiffies + interval; if (!child) { - update_cpu_power(sd, cpu); + update_cpu_capacity(sd, cpu); return; } - power_orig = power = 0; + capacity_orig = capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5723,31 +5760,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); /* - * build_sched_domains() -> init_sched_groups_power() + * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the * runqueues. * - * Use power_of(), which is set irrespective of domains - * in update_cpu_power(). + * Use capacity_of(), which is set irrespective of domains + * in update_cpu_capacity(). * - * This avoids power/power_orig from being 0 and + * This avoids capacity/capacity_orig from being 0 and * causing divide-by-zero issues on boot. * - * Runtime updates will correct power_orig. + * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - power_orig += power_of(cpu); - power += power_of(cpu); + capacity_orig += capacity_of(cpu); + capacity += capacity_of(cpu); continue; } - sgp = rq->sd->groups->sgp; - power_orig += sgp->power_orig; - power += sgp->power; + sgc = rq->sd->groups->sgc; + capacity_orig += sgc->capacity_orig; + capacity += sgc->capacity; } } else { /* @@ -5757,14 +5794,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power_orig += group->sgp->power_orig; - power += group->sgp->power; + capacity_orig += group->sgc->capacity_orig; + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = power_orig; - sdg->sgp->power = power; + sdg->sgc->capacity_orig = capacity_orig; + sdg->sgc->capacity = capacity; } /* @@ -5778,15 +5815,15 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_POWER_SCALE + * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ - if (!(sd->flags & SD_SHARE_CPUPOWER)) + if (!(sd->flags & SD_SHARE_CPUCAPACITY)) return 0; /* - * If ~90% of the cpu_power is still there, we're good. + * If ~90% of the cpu_capacity is still there, we're good. */ - if (group->sgp->power * 32 > group->sgp->power_orig * 29) + if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) return 1; return 0; @@ -5823,34 +5860,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) static inline int sg_imbalanced(struct sched_group *group) { - return group->sgp->imbalance; + return group->sgc->imbalance; } /* - * Compute the group capacity. + * Compute the group capacity factor. * - * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by + * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores - * and limit power unit capacity with that. + * and limit unit capacity with that. */ -static inline int sg_capacity(struct lb_env *env, struct sched_group *group) +static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { - unsigned int capacity, smt, cpus; - unsigned int power, power_orig; + unsigned int capacity_factor, smt, cpus; + unsigned int capacity, capacity_orig; - power = group->sgp->power; - power_orig = group->sgp->power_orig; + capacity = group->sgc->capacity; + capacity_orig = group->sgc->capacity_orig; cpus = group->group_weight; - /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); - capacity = cpus / smt; /* cores */ + /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); + capacity_factor = cpus / smt; /* cores */ - capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity_factor = min_t(unsigned, + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); - return capacity; + return capacity_factor; } /** @@ -5860,10 +5898,12 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group) * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. + * @overload: Indicate more than one runnable task for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, struct sg_lb_stats *sgs) + int local_group, struct sg_lb_stats *sgs, + bool *overload) { unsigned long load; int i; @@ -5881,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->sum_nr_running += rq->nr_running; + + if (rq->nr_running > 1) + *overload = true; + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -5890,9 +5934,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU power of the group */ - sgs->group_power = group->sgp->power; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -5900,10 +5944,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_imb = sg_imbalanced(group); - sgs->group_capacity = sg_capacity(env, group); + sgs->group_capacity_factor = sg_capacity_factor(env, group); - if (sgs->group_capacity > sgs->sum_nr_running) - sgs->group_has_capacity = 1; + if (sgs->group_capacity_factor > sgs->sum_nr_running) + sgs->group_has_free_capacity = 1; } /** @@ -5927,7 +5971,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; - if (sgs->sum_nr_running > sgs->group_capacity) + if (sgs->sum_nr_running > sgs->group_capacity_factor) return true; if (sgs->group_imb) @@ -5991,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; + bool overload = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6007,28 +6052,29 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs = &sds->local_stat; if (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, sg->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); + time_after_eq(jiffies, sg->sgc->next_update)) + update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, sgs, + &overload); if (local_group) goto next_group; /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity to one so that we'll try + * first, lower the sg capacity factor to one so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity. The + * these excess tasks, i.e. nr_running < group_capacity_factor. The * extra check prevents the case where you always pull from the * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_capacity) - sgs->group_capacity = min(sgs->group_capacity, 1U); + sds->local_stat.group_has_free_capacity) + sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -6038,13 +6084,20 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; + sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + + if (!env->sd->parent) { + /* update overload indicator if we are at root domain */ + if (env->dst_rq->rd->overload != overload) + env->dst_rq->rd->overload = overload; + } + } /** @@ -6085,8 +6138,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) return 0; env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_power, - SCHED_POWER_SCALE); + sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, + SCHED_CAPACITY_SCALE); return 1; } @@ -6101,7 +6154,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned long tmp, capa_now = 0, capa_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; struct sg_lb_stats *local, *busiest; @@ -6115,8 +6168,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) imbn = 1; scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; + (busiest->load_per_task * SCHED_CAPACITY_SCALE) / + busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= local->avg_load + (scaled_busy_load_per_task * imbn)) { @@ -6126,38 +6179,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) /* * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by + * however we may be able to increase total CPU capacity used by * moving them. */ - pwr_now += busiest->group_power * + capa_now += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load); - pwr_now += local->group_power * + capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - pwr_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_CAPACITY_SCALE; /* Amount of load we'd subtract */ if (busiest->avg_load > scaled_busy_load_per_task) { - pwr_move += busiest->group_power * + capa_move += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_power < - busiest->load_per_task * SCHED_POWER_SCALE) { - tmp = (busiest->avg_load * busiest->group_power) / - local->group_power; + if (busiest->avg_load * busiest->group_capacity < + busiest->load_per_task * SCHED_CAPACITY_SCALE) { + tmp = (busiest->avg_load * busiest->group_capacity) / + local->group_capacity; } else { - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - local->group_power; + tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / + local->group_capacity; } - pwr_move += local->group_power * + capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - pwr_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ - if (pwr_move > pwr_now) + if (capa_move > capa_now) env->imbalance = busiest->load_per_task; } @@ -6187,7 +6240,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) + * its cpu_capacity, while calculating max_load..) */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -6202,10 +6255,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * have to drop below capacity to reach cpu-load equilibrium. */ load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity); + (busiest->sum_nr_running - busiest->group_capacity_factor); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= busiest->group_power; + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); + load_above_capacity /= busiest->group_capacity; } /* @@ -6220,9 +6273,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* How much load to actually move to equalise the imbalance */ env->imbalance = min( - max_pull * busiest->group_power, - (sds->avg_load - local->avg_load) * local->group_power - ) / SCHED_POWER_SCALE; + max_pull * busiest->group_capacity, + (sds->avg_load - local->avg_load) * local->group_capacity + ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -6276,7 +6329,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) + / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -6287,8 +6341,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && - !busiest->group_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && + !busiest->group_has_free_capacity) goto force_balance; /* @@ -6342,11 +6396,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_power = 1; + unsigned long busiest_load = 0, busiest_capacity = 1; int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity, wl; + unsigned long capacity, capacity_factor, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6374,34 +6428,34 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - power = power_of(i); - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity = capacity_of(i); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu power. + * which is not scaled with the cpu capacity. */ - if (capacity && rq->nr_running == 1 && wl > env->imbalance) + if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu power, so that - * the load can be moved away from the cpu that is potentially - * running at a lower capacity. + * the weighted_cpuload() scaled with the cpu capacity, so + * that the load can be moved away from the cpu that is + * potentially running at a lower capacity. * - * Thus we're looking for max(wl_i / power_i), crosswise + * Thus we're looking for max(wl_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out - * to: wl_i * power_j > wl_j * power_i; where j is our - * previous maximum. + * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * our previous maximum. */ - if (wl * busiest_power > busiest_load * power) { + if (wl * busiest_capacity > busiest_load * capacity) { busiest_load = wl; - busiest_power = power; + busiest_capacity = capacity; busiest = rq; } } @@ -6609,7 +6663,7 @@ more_balance: * We failed to reach balance because of affinity. */ if (sd_parent) { - int *group_imbalance = &sd_parent->groups->sgp->imbalance; + int *group_imbalance = &sd_parent->groups->sgc->imbalance; if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { *group_imbalance = 1; @@ -6762,7 +6816,8 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) { + if (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) @@ -6996,7 +7051,7 @@ static inline void set_cpu_sd_state_busy(void) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + atomic_inc(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7013,7 +7068,7 @@ void set_cpu_sd_state_idle(void) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + atomic_dec(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7192,12 +7247,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rq = cpu_rq(balance_cpu); - raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); - update_idle_cpu_load(rq); - raw_spin_unlock_irq(&rq->lock); - - rebalance_domains(rq, CPU_IDLE); + /* + * If time for next balance is due, + * do the balance. + */ + if (time_after_eq(jiffies, rq->next_balance)) { + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); + rebalance_domains(rq, CPU_IDLE); + } if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; @@ -7212,7 +7272,7 @@ end: * of an idle cpu is the system. * - This rq has more than one task. * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. + * busy cpu's exceeding the group's capacity. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ @@ -7220,7 +7280,7 @@ static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; if (unlikely(rq->idle_balance)) @@ -7250,8 +7310,8 @@ static inline int nohz_kick_needed(struct rq *rq) sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { - sgp = sd->groups->sgp; - nr_busy = atomic_read(&sgp->nr_busy_cpus); + sgc = sd->groups->sgc; + nr_busy = atomic_read(&sgc->nr_busy_cpus); if (nr_busy > 1) goto need_kick_unlock; @@ -7315,6 +7375,8 @@ void trigger_load_balance(struct rq *rq) static void rq_online_fair(struct rq *rq) { update_sysctl(); + + update_runtime_enabled(rq); } static void rq_offline_fair(struct rq *rq) @@ -7388,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); + resched_curr(rq); } se->vruntime -= cfs_rq->min_vruntime; @@ -7413,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_task(rq->curr); + resched_curr(rq); } else check_preempt_curr(rq, p, 0); } @@ -7476,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * if we can still preempt the current task. */ if (rq->curr == p) - resched_task(rq->curr); + resched_curr(rq); else check_preempt_curr(rq, p, 0); } |