summaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c573
1 files changed, 456 insertions, 117 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..dbd4490afec1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
-void wait_task_inactive(task_t * p)
+void wait_task_inactive(task_t *p)
{
unsigned long flags;
runqueue_t *rq;
@@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
int local_group;
int i;
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+ goto nextgroup;
+
local_group = cpu_isset(this_cpu, group->cpumask);
- /* XXX: put a cpus allowed check */
/* Tally up the load of all CPUs in the group */
avg_load = 0;
@@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
min_load = avg_load;
idlest = group;
}
+nextgroup:
group = group->next;
} while (group != sd->groups);
@@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
/*
* find_idlest_queue - find the idlest runqueue among the cpus in group.
*/
-static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
+ cpumask_t tmp;
unsigned long load, min_load = ULONG_MAX;
int idlest = -1;
int i;
- for_each_cpu_mask(i, group->cpumask) {
+ /* Traverse only the allowed CPUs */
+ cpus_and(tmp, group->cpumask, p->cpus_allowed);
+
+ for_each_cpu_mask(i, tmp) {
load = source_load(i, 0);
if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag)
if (!group)
goto nextlevel;
- new_cpu = find_idlest_cpu(group, cpu);
+ new_cpu = find_idlest_cpu(group, t, cpu);
if (new_cpu == -1 || new_cpu == cpu)
goto nextlevel;
@@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p)
*
* returns failure only if the task is already active.
*/
-static int try_to_wake_up(task_t * p, unsigned int state, int sync)
+static int try_to_wake_up(task_t *p, unsigned int state, int sync)
{
int cpu, this_cpu, success = 0;
unsigned long flags;
@@ -1252,6 +1261,16 @@ out_activate:
}
/*
+ * Tasks that have marked their sleep as noninteractive get
+ * woken up without updating their sleep average. (i.e. their
+ * sleep is handled in a priority-neutral manner, no priority
+ * boost and no penalty.)
+ */
+ if (old_state & TASK_NONINTERACTIVE)
+ __activate_task(p, rq);
+ else
+ activate_task(p, rq, cpu == this_cpu);
+ /*
* Sync wakeups (i.e. those types of wakeups where the waker
* has indicated that it will leave the CPU in short order)
* don't trigger a preemption, if the woken up task will run on
@@ -1259,7 +1278,6 @@ out_activate:
* the waker guarantees that the freshly woken up task is going
* to be considered on this CPU.)
*/
- activate_task(p, rq, cpu == this_cpu);
if (!sync || cpu != this_cpu) {
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
@@ -1274,7 +1292,7 @@ out:
return success;
}
-int fastcall wake_up_process(task_t * p)
+int fastcall wake_up_process(task_t *p)
{
return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
+void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
{
unsigned long flags;
int this_cpu, cpu;
@@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
* artificially, because any timeslice recovered here
* was given away by the parent in the first place.)
*/
-void fastcall sched_exit(task_t * p)
+void fastcall sched_exit(task_t *p)
{
unsigned long flags;
runqueue_t *rq;
@@ -1478,6 +1496,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
/**
* finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
* @prev: the thread we just switched away from.
*
* finish_task_switch must be called after the context switch, paired
@@ -1510,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_task_flags = prev->flags;
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
if (mm)
@@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
*/
static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
- struct sched_domain *sd, enum idle_type idle, int *all_pinned)
+ struct sched_domain *sd, enum idle_type idle,
+ int *all_pinned)
{
/*
* We do not migrate tasks that are:
@@ -1882,10 +1906,11 @@ out:
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle)
+ unsigned long *imbalance, enum idle_type idle, int *sd_idle)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ unsigned long max_pull;
int load_idx;
max_load = this_load = total_load = total_pwr = 0;
@@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
avg_load = 0;
for_each_cpu_mask(i, group->cpumask) {
+ if (*sd_idle && !idle_cpu(i))
+ *sd_idle = 0;
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load)
+ if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
+
+ /* Don't want to pull so many tasks that a group would go idle */
+ max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+
/* How much load to actually move to equalise the imbalance */
- *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+ *imbalance = min(max_pull * busiest->cpu_power,
(avg_load - this_load) * this->cpu_power)
/ SCHED_LOAD_SCALE;
@@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
unsigned long imbalance;
int nr_moved, all_pinned = 0;
int active_balance = 0;
+ int sd_idle = 0;
+
+ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+ sd_idle = 1;
- spin_lock(&this_rq->lock);
schedstat_inc(sd, lb_cnt[idle]);
- group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
@@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
* still unbalanced. nr_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- double_lock_balance(this_rq, busiest);
+ double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle,
- &all_pinned);
- spin_unlock(&busiest->lock);
+ imbalance, sd, idle, &all_pinned);
+ double_rq_unlock(this_rq, busiest);
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned))
goto out_balanced;
}
- spin_unlock(&this_rq->lock);
-
if (!nr_moved) {
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
@@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
spin_lock(&busiest->lock);
+
+ /* don't kick the migration_thread, if the curr
+ * task on busiest cpu can't be moved to this_cpu
+ */
+ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ spin_unlock(&busiest->lock);
+ all_pinned = 1;
+ goto out_one_pinned;
+ }
+
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
@@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
sd->balance_interval *= 2;
}
+ if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
return nr_moved;
out_balanced:
- spin_unlock(&this_rq->lock);
-
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
+
+out_one_pinned:
/* tune up the balancing interval */
if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
return 0;
}
@@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
runqueue_t *busiest = NULL;
unsigned long imbalance;
int nr_moved = 0;
+ int sd_idle = 0;
+
+ if (sd->flags & SD_SHARE_CPUPOWER)
+ sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
@@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
BUG_ON(busiest == this_rq);
- /* Attempt to move tasks */
- double_lock_balance(this_rq, busiest);
-
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
+
+ nr_moved = 0;
+ if (busiest->nr_running > 1) {
+ /* Attempt to move tasks */
+ double_lock_balance(this_rq, busiest);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, NEWLY_IDLE, NULL);
- if (!nr_moved)
+ spin_unlock(&busiest->lock);
+ }
+
+ if (!nr_moved) {
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
- else
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
+ } else
sd->nr_balance_failed = 0;
- spin_unlock(&busiest->lock);
return nr_moved;
out_balanced:
schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
sd->nr_balance_failed = 0;
return 0;
}
@@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
if (j - sd->last_balance >= interval) {
if (load_balance(this_cpu, this_rq, sd, idle)) {
- /* We've pulled tasks over so no longer idle */
+ /*
+ * We've pulled tasks over so either we're no
+ * longer idle, or one of our SMT siblings is
+ * not idle.
+ */
idle = NOT_IDLE;
}
sd->last_balance += interval;
@@ -2575,6 +2637,13 @@ out:
}
#ifdef CONFIG_SCHED_SMT
+static inline void wakeup_busy_runqueue(runqueue_t *rq)
+{
+ /* If an SMT runqueue is sleeping due to priority reasons wake it up */
+ if (rq->curr == rq->idle && rq->nr_running)
+ resched_task(rq->idle);
+}
+
static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
{
struct sched_domain *tmp, *sd = NULL;
@@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
for_each_cpu_mask(i, sibling_map) {
runqueue_t *smt_rq = cpu_rq(i);
- /*
- * If an SMT sibling task is sleeping due to priority
- * reasons wake it up now.
- */
- if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
- resched_task(smt_rq->idle);
+ wakeup_busy_runqueue(smt_rq);
}
for_each_cpu_mask(i, sibling_map)
@@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
*/
}
+/*
+ * number of 'lost' timeslices this task wont be able to fully
+ * utilize, if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+ return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
{
struct sched_domain *tmp, *sd = NULL;
@@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
runqueue_t *smt_rq = cpu_rq(i);
task_t *smt_curr = smt_rq->curr;
+ /* Kernel threads do not participate in dependent sleeping */
+ if (!p->mm || !smt_curr->mm || rt_task(p))
+ goto check_smt_task;
+
/*
* If a user task with lower static priority than the
* running task on the SMT sibling is trying to schedule,
@@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
* task from using an unfair proportion of the
* physical cpu's resources. -ck
*/
- if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(p) || rt_task(smt_curr)) &&
- p->mm && smt_curr->mm && !rt_task(p))
- ret = 1;
+ if (rt_task(smt_curr)) {
+ /*
+ * With real time tasks we run non-rt tasks only
+ * per_cpu_gain% of the time.
+ */
+ if ((jiffies % DEF_TIMESLICE) >
+ (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+ ret = 1;
+ } else
+ if (smt_curr->static_prio < p->static_prio &&
+ !TASK_PREEMPTS_CURR(p, smt_rq) &&
+ smt_slice(smt_curr, sd) > task_timeslice(p))
+ ret = 1;
+
+check_smt_task:
+ if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
+ rt_task(smt_curr))
+ continue;
+ if (!p->mm) {
+ wakeup_busy_runqueue(smt_rq);
+ continue;
+ }
/*
- * Reschedule a lower priority task on the SMT sibling,
- * or wake it up if it has been put to sleep for priority
- * reasons.
+ * Reschedule a lower priority task on the SMT sibling for
+ * it to be put to sleep, or wake it up if it has been put to
+ * sleep for priority reasons to see if it should run now.
*/
- if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(smt_curr) || rt_task(p)) &&
- smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
- (smt_curr == smt_rq->idle && smt_rq->nr_running))
- resched_task(smt_curr);
+ if (rt_task(p)) {
+ if ((jiffies % DEF_TIMESLICE) >
+ (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+ resched_task(smt_curr);
+ } else {
+ if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+ smt_slice(p, sd) > task_timeslice(smt_curr))
+ resched_task(smt_curr);
+ else
+ wakeup_busy_runqueue(smt_rq);
+ }
}
out_unlock:
for_each_cpu_mask(i, sibling_map)
@@ -2887,6 +2989,7 @@ switch_tasks:
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
prefetch(next);
+ prefetch_stack(next);
clear_tsk_need_resched(prev);
rcu_qsctr_inc(task_cpu(prev));
@@ -3014,7 +3117,8 @@ need_resched:
#endif /* CONFIG_PREEMPT */
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+ void *key)
{
task_t *p = curr->private;
return try_to_wake_up(p, mode, sync);
@@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
* @key: is directly passed to the wakeup function
*/
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
+ int nr_exclusive, void *key)
{
unsigned long flags;
@@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
*
* On UP it can prevent extra preemption.
*/
-void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void fastcall
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
unsigned long flags;
int sync = 1;
@@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
EXPORT_SYMBOL(interruptible_sleep_on);
-long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
@@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
* @policy: new policy.
* @param: structure containing the new RT priority.
*/
-int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
+int sched_setscheduler(struct task_struct *p, int policy,
+ struct sched_param *param)
{
int retval;
int oldprio, oldpolicy = -1;
@@ -3518,7 +3625,7 @@ recheck:
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
*/
if (param->sched_priority < 0 ||
- (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
@@ -3581,7 +3688,8 @@ recheck:
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
-static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
int retval;
struct sched_param lparam;
@@ -3848,7 +3956,7 @@ asmlinkage long sys_sched_yield(void)
if (rt_task(current))
target = rq->active;
- if (current->array->nr_active == 1) {
+ if (array->nr_active == 1) {
schedstat_inc(rq, yld_act_empty);
if (!rq->expired->nr_active)
schedstat_inc(rq, yld_both_empty);
@@ -3912,7 +4020,7 @@ EXPORT_SYMBOL(cond_resched);
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
-int cond_resched_lock(spinlock_t * lock)
+int cond_resched_lock(spinlock_t *lock)
{
int ret = 0;
@@ -4095,7 +4203,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
return list_entry(p->sibling.next,struct task_struct,sibling);
}
-static void show_task(task_t * p)
+static void show_task(task_t *p)
{
task_t *relative;
unsigned state;
@@ -4121,7 +4229,7 @@ static void show_task(task_t * p)
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
{
- unsigned long * n = (unsigned long *) (p->thread_info+1);
+ unsigned long *n = (unsigned long *) (p->thread_info+1);
while (!*n)
n++;
free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4330,7 +4438,7 @@ out:
* thread migration by bumping thread off CPU then 'pushing' onto
* another runqueue.
*/
-static int migration_thread(void * data)
+static int migration_thread(void *data)
{
runqueue_t *rq;
int cpu = (long)data;
@@ -4779,7 +4887,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
runqueue_t *rq = cpu_rq(cpu);
struct sched_domain *tmp;
@@ -4802,7 +4910,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
}
/* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4938,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-void init_sched_build_groups(struct sched_group groups[],
- cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+ int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4972,85 @@ void init_sched_build_groups(struct sched_group groups[],
last->next = first;
}
+#define SD_NODES_PER_DOMAIN 16
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+ int i, n, val, min_val, best_node = 0;
+
+ min_val = INT_MAX;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Start at @node */
+ n = (node + i) % MAX_NUMNODES;
+
+ if (!nr_cpus_node(n))
+ continue;
+
+ /* Skip already used nodes */
+ if (test_bit(n, used_nodes))
+ continue;
+
+ /* Simple min distance search */
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ set_bit(best_node, used_nodes);
+ return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+ int i;
+ cpumask_t span, nodemask;
+ DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+ cpus_clear(span);
+ bitmap_zero(used_nodes, MAX_NUMNODES);
+
+ nodemask = node_to_cpumask(node);
+ cpus_or(span, span, nodemask);
+ set_bit(node, used_nodes);
+
+ for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+ int next_node = find_next_best_node(node, used_nodes);
+ nodemask = node_to_cpumask(next_node);
+ cpus_or(span, span, nodemask);
+ }
+
+ return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +5072,20 @@ static int cpu_to_phys_group(int cpu)
}
#ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
- return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
/*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
*/
-static void check_sibling_maps(void)
-{
- int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
- for_each_online_cpu(i) {
- for_each_cpu_mask(j, cpu_sibling_map[i]) {
- if (cpu_to_node(i) != cpu_to_node(j)) {
- printk(KERN_INFO "warning: CPU %d siblings map "
- "to different node - isolating "
- "them.\n", i);
- cpu_sibling_map[i] = cpumask_of_cpu(i);
- break;
- }
- }
- }
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+ return cpu_to_node(cpu);
}
#endif
@@ -4928,9 +5093,24 @@ static void check_sibling_maps(void)
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
+#ifdef CONFIG_NUMA
+ struct sched_group **sched_group_nodes = NULL;
+ struct sched_group *sched_group_allnodes = NULL;
+
+ /*
+ * Allocate the per-node list of sched groups
+ */
+ sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+ GFP_ATOMIC);
+ if (!sched_group_nodes) {
+ printk(KERN_WARNING "Can not alloc sched group node list\n");
+ return;
+ }
+ sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
/*
* Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5123,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
+ if (cpus_weight(*cpu_map)
+ > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+ if (!sched_group_allnodes) {
+ sched_group_allnodes
+ = kmalloc(sizeof(struct sched_group)
+ * MAX_NUMNODES,
+ GFP_KERNEL);
+ if (!sched_group_allnodes) {
+ printk(KERN_WARNING
+ "Can not alloc allnodes sched group\n");
+ break;
+ }
+ sched_group_allnodes_bycpu[i]
+ = sched_group_allnodes;
+ }
+ sd = &per_cpu(allnodes_domains, i);
+ *sd = SD_ALLNODES_INIT;
+ sd->span = *cpu_map;
+ group = cpu_to_allnodes_group(i);
+ sd->groups = &sched_group_allnodes[group];
+ p = sd;
+ } else
+ p = NULL;
+
sd = &per_cpu(node_domains, i);
- group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- sd->span = *cpu_map;
- sd->groups = &sched_group_nodes[group];
+ sd->span = sched_domain_node_span(cpu_to_node(i));
+ sd->parent = p;
+ cpus_and(sd->span, sd->span, *cpu_map);
#endif
p = sd;
@@ -4972,7 +5176,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5201,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
#ifdef CONFIG_NUMA
/* Set up node groups */
- init_sched_build_groups(sched_group_nodes, *cpu_map,
- &cpu_to_node_group);
+ if (sched_group_allnodes)
+ init_sched_build_groups(sched_group_allnodes, *cpu_map,
+ &cpu_to_allnodes_group);
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Set up node groups */
+ struct sched_group *sg, *prev;
+ cpumask_t nodemask = node_to_cpumask(i);
+ cpumask_t domainspan;
+ cpumask_t covered = CPU_MASK_NONE;
+ int j;
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask)) {
+ sched_group_nodes[i] = NULL;
+ continue;
+ }
+
+ domainspan = sched_domain_node_span(i);
+ cpus_and(domainspan, domainspan, *cpu_map);
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ sched_group_nodes[i] = sg;
+ for_each_cpu_mask(j, nodemask) {
+ struct sched_domain *sd;
+ sd = &per_cpu(node_domains, j);
+ sd->groups = sg;
+ if (sd->groups == NULL) {
+ /* Turn off balancing if we have no groups */
+ sd->flags = 0;
+ }
+ }
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", i);
+ continue;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = nodemask;
+ cpus_or(covered, covered, nodemask);
+ prev = sg;
+
+ for (j = 0; j < MAX_NUMNODES; j++) {
+ cpumask_t tmp, notcovered;
+ int n = (i + j) % MAX_NUMNODES;
+
+ cpus_complement(notcovered, covered);
+ cpus_and(tmp, notcovered, *cpu_map);
+ cpus_and(tmp, tmp, domainspan);
+ if (cpus_empty(tmp))
+ break;
+
+ nodemask = node_to_cpumask(n);
+ cpus_and(tmp, tmp, nodemask);
+ if (cpus_empty(tmp))
+ continue;
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", j);
+ break;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = tmp;
+ cpus_or(covered, covered, tmp);
+ prev->next = sg;
+ prev = sg;
+ }
+ prev->next = sched_group_nodes[i];
+ }
#endif
/* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5290,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
sd->groups->cpu_power = power;
#ifdef CONFIG_NUMA
- if (i == first_cpu(sd->groups->cpumask)) {
- /* Only add "power" once for each physical package. */
- sd = &per_cpu(node_domains, i);
- sd->groups->cpu_power += power;
+ sd = &per_cpu(allnodes_domains, i);
+ if (sd->groups) {
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+ sd->groups->cpu_power = power;
}
#endif
}
+#ifdef CONFIG_NUMA
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ struct sched_group *sg = sched_group_nodes[i];
+ int j;
+
+ if (sg == NULL)
+ continue;
+next_sg:
+ for_each_cpu_mask(j, sg->cpumask) {
+ struct sched_domain *sd;
+ int power;
+
+ sd = &per_cpu(phys_domains, j);
+ if (j != first_cpu(sd->groups->cpumask)) {
+ /*
+ * Only add "power" once for each
+ * physical package.
+ */
+ continue;
+ }
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+
+ sg->cpu_power += power;
+ }
+ sg = sg->next;
+ if (sg != sched_group_nodes[i])
+ goto next_sg;
+ }
+#endif
+
/* Attach the domains */
for_each_cpu_mask(i, *cpu_map) {
struct sched_domain *sd;
@@ -5039,13 +5344,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
*/
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
{
cpumask_t cpu_default_map;
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- check_sibling_maps();
-#endif
/*
* Setup mask for cpus without special case scheduling requirements.
* For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5360,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
- /* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+ int i;
+ int cpu;
+
+ for_each_cpu_mask(cpu, *cpu_map) {
+ struct sched_group *sched_group_allnodes
+ = sched_group_allnodes_bycpu[cpu];
+ struct sched_group **sched_group_nodes
+ = sched_group_nodes_bycpu[cpu];
+
+ if (sched_group_allnodes) {
+ kfree(sched_group_allnodes);
+ sched_group_allnodes_bycpu[cpu] = NULL;
+ }
+
+ if (!sched_group_nodes)
+ continue;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
+next_sg:
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ }
+ kfree(sched_group_nodes);
+ sched_group_nodes_bycpu[cpu] = NULL;
+ }
+#endif
+}
/*
* Detach sched domains from a group of cpus specified in cpu_map
OpenPOWER on IntegriCloud