From 10b777246c6953100099af1870d35c8b24d49b12 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Nov 2007 22:39:37 +0100 Subject: sched: fix vslice vslice was missing a factor NICE_0_LOAD, as weight is in weight*NICE_0_LOAD units. the effect of this bug was larger initial slices and thus latency-noisier forks. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 01859f662ab7..62b057603f07 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -259,6 +259,7 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) { u64 vslice = __sched_period(nr_running); + vslice *= NICE_0_LOAD; do_div(vslice, rq_weight); return vslice; -- cgit v1.2.1 From 2cb8600e6be4281e381d39e44de4359e46333e23 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Nov 2007 22:39:37 +0100 Subject: sched: documentation: place_entity() comments Add a few comments to place_entity(). No code changed. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 62b057603f07..8763bee6b661 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -473,19 +473,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) vruntime += sched_vslice(cfs_rq)/2; + /* + * The 'current' period is already promised to the current tasks, + * however the extra weight of the new task will slow them down a + * little, place the new task so that it fits in the slot that + * stays open at the end. + */ if (initial && sched_feat(START_DEBIT)) vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { + /* sleeps upto a single latency don't count. */ if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && task_of(se)->policy != SCHED_BATCH) vruntime -= sysctl_sched_latency; - vruntime = max_t(s64, vruntime, se->vruntime); + /* ensure we never gain time by being placed backwards. */ + vruntime = max_vruntime(se->vruntime, vruntime); } se->vruntime = vruntime; - } static void -- cgit v1.2.1 From b2be5e96dc0b5a179cf4cb98e65cfb605752ca26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Nov 2007 22:39:37 +0100 Subject: sched: reintroduce the sched_min_granularity tunable we lost the sched_min_granularity tunable to a clever optimization that uses the sched_latency/min_granularity ratio - but the ratio is quite unintuitive to users and can also crash the kernel if the ratio is set to 0. So reintroduce the min_granularity tunable, while keeping the ratio maintained internally. no functionality changed. [ mingo@elte.hu: some fixlets. 
] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 +++++- kernel/sched_debug.c | 2 +- kernel/sched_fair.c | 35 ++++++++++++++++++++++++++++------- kernel/sysctl.c | 11 +++++++---- 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 155d7438f7ad..5457b6234e11 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1460,12 +1460,16 @@ extern void sched_idle_next(void); #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_nr_latency; +extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; + +int sched_nr_latency_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, + loff_t *ppos); #endif extern unsigned int sysctl_sched_compat_yield; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 415e5c385542..ca198a797bfa 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -211,7 +211,7 @@ static int sched_debug_show(struct seq_file *m, void *v) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_latency); - PN(sysctl_sched_nr_latency); + PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); PN(sysctl_sched_batch_wakeup_granularity); PN(sysctl_sched_child_runs_first); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8763bee6b661..c495dcf7031b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,16 +35,21 @@ const_debug unsigned int sysctl_sched_latency = 20000000ULL; /* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. + * Minimal preemption granularity for CPU-bound tasks: + * (default: 1 msec, units: nanoseconds) */ -const_debug unsigned int sysctl_sched_child_runs_first = 1; +const_debug unsigned int sysctl_sched_min_granularity = 1000000ULL; /* - * Minimal preemption granularity for CPU-bound tasks: - * (default: 2 msec, units: nanoseconds) + * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -const_debug unsigned int sysctl_sched_nr_latency = 20; +const_debug unsigned int sched_nr_latency = 20; + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +const_debug unsigned int sysctl_sched_child_runs_first = 1; /* * sys_sched_yield() compat mode @@ -212,6 +217,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +#ifdef CONFIG_SCHED_DEBUG +int sched_nr_latency_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, + sysctl_sched_min_granularity); + + return 0; +} +#endif /* * The idea is to set a period in which each task runs once. 
@@ -224,7 +245,7 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) static u64 __sched_period(unsigned long nr_running) { u64 period = sysctl_sched_latency; - unsigned long nr_latency = sysctl_sched_nr_latency; + unsigned long nr_latency = sched_nr_latency; if (unlikely(nr_running > nr_latency)) { period *= nr_running; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3b4efbe26445..6e3b63c06856 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -235,11 +235,14 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_nr_latency", - .data = &sysctl_sched_nr_latency, + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_nr_latency_handler, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, }, { .ctl_name = CTL_UNNUMBERED, @@ -247,7 +250,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &sched_nr_latency_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, -- cgit v1.2.1 From 9a41785cc43d88397f787a651ed7286a33f8462f Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 9 Nov 2007 22:39:37 +0100 Subject: sched: fix delay accounting regression Fix the delay accounting regression introduced by commit 75d4ef16a6aa84f708188bada182315f80aab6fa. rq no longer has sched_info data associated with it. task_struct sched_info structure is used by delay accounting to provide back statistics to user space. also remove direct use of sched_clock() (which is not a valid thing to do anymore) and use rq->clock instead. Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched_stats.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index ef1a7df80ea2..630178e53bb6 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -127,7 +127,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_set(var, val) do { } while (0) #endif -#ifdef CONFIG_SCHEDSTATS +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given * the cpu. 
We should note that with the exception of interactive @@ -155,7 +155,7 @@ static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long long now = sched_clock(), delta = 0; + unsigned long long now = task_rq(t)->clock, delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -186,7 +186,7 @@ static inline void sched_info_queued(struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = sched_clock(); + t->sched_info.last_queued = task_rq(t)->clock; } /* @@ -195,7 +195,8 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - unsigned long long delta = sched_clock() - t->sched_info.last_arrival; + unsigned long long delta = task_rq(t)->clock - + t->sched_info.last_arrival; t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); @@ -231,5 +232,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ -- cgit v1.2.1 From fa13a5a1f25f671d084d8884be96fc48d9b68275 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: sched: restore deterministic CPU accounting on powerpc Since powerpc started using CONFIG_GENERIC_CLOCKEVENTS, the deterministic CPU accounting (CONFIG_VIRT_CPU_ACCOUNTING) has been broken on powerpc, because we end up counting user time twice: once in timer_interrupt() and once in update_process_times(). This fixes the problem by pulling the code in update_process_times that updates utime and stime into a separate function called account_process_tick. If CONFIG_VIRT_CPU_ACCOUNTING is not defined, there is a version of account_process_tick in kernel/timer.c that simply accounts a whole tick to either utime or stime as before. If CONFIG_VIRT_CPU_ACCOUNTING is defined, then arch code gets to implement account_process_tick. This also lets us simplify the s390 code a bit; it means that the s390 timer interrupt can now call update_process_times even when CONFIG_VIRT_CPU_ACCOUNTING is turned on, and can just implement a suitable account_process_tick(). account_process_tick() now takes the task_struct * as an argument. Tested both with and without CONFIG_VIRT_CPU_ACCOUNTING. 
Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/time.c | 25 +------------------------ arch/s390/kernel/time.c | 4 ---- arch/s390/kernel/vtime.c | 8 +------- include/linux/sched.h | 1 + kernel/timer.c | 21 ++++++++++++++------- 6 files changed, 18 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index b9d88374f14f..41e13f4cc6e3 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -350,7 +350,7 @@ struct task_struct *__switch_to(struct task_struct *prev, local_irq_save(flags); account_system_vtime(current); - account_process_vtime(current); + account_process_tick(current, 0); calculate_steal_time(); last = _switch(old_thread, new_thread); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 9eb3284deac4..a70dfb76d0a8 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -259,7 +259,7 @@ void account_system_vtime(struct task_struct *tsk) * user and system time records. * Must be called with interrupts disabled. */ -void account_process_vtime(struct task_struct *tsk) +void account_process_tick(struct task_struct *tsk, int user_tick) { cputime_t utime, utimescaled; @@ -274,18 +274,6 @@ void account_process_vtime(struct task_struct *tsk) account_user_time_scaled(tsk, utimescaled); } -static void account_process_time(struct pt_regs *regs) -{ - int cpu = smp_processor_id(); - - account_process_vtime(current); - run_local_timers(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_mode(regs)); - scheduler_tick(); - run_posix_cpu_timers(current); -} - /* * Stuff for accounting stolen time. */ @@ -375,7 +363,6 @@ static void snapshot_purr(void) #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ #define calc_cputime_factors() -#define account_process_time(regs) update_process_times(user_mode(regs)) #define calculate_steal_time() do { } while (0) #endif @@ -599,16 +586,6 @@ void timer_interrupt(struct pt_regs * regs) get_lppaca()->int_dword.fields.decr_int = 0; #endif - /* - * We cannot disable the decrementer, so in the period - * between this cpu's being marked offline in cpu_online_map - * and calling stop-self, it is taking timer interrupts. - * Avoid calling into the scheduler rebalancing code if this - * is the case. - */ - if (!cpu_is_offline(cpu)) - account_process_time(regs); - if (evt->event_handler) evt->event_handler(evt); else diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index a963fe81359e..22b800ce2126 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -145,12 +145,8 @@ void account_ticks(u64 time) do_timer(ticks); #endif -#ifdef CONFIG_VIRT_CPU_ACCOUNTING - account_tick_vtime(current); -#else while (ticks--) update_process_times(user_mode(get_irq_regs())); -#endif s390_do_profile(); } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 84ff78de6bac..c5f05b3fb2c3 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -32,7 +32,7 @@ static DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer); * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. 
*/ -void account_tick_vtime(struct task_struct *tsk) +void account_process_tick(struct task_struct *tsk, int user_tick) { cputime_t cputime; __u64 timer, clock; @@ -64,12 +64,6 @@ void account_tick_vtime(struct task_struct *tsk) S390_lowcore.steal_clock -= cputime << 12; account_steal_time(tsk, cputime); } - - run_local_timers(); - if (rcu_pending(smp_processor_id())) - rcu_check_callbacks(smp_processor_id(), rcu_user_flag); - scheduler_tick(); - run_posix_cpu_timers(tsk); } /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 5457b6234e11..951759e30c09 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -254,6 +254,7 @@ long io_schedule_timeout(long timeout); extern void cpu_init (void); extern void trap_init(void); +extern void account_process_tick(struct task_struct *task, int user); extern void update_process_times(int user); extern void scheduler_tick(void); diff --git a/kernel/timer.c b/kernel/timer.c index 00e44e2afd67..a05817c021d6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -817,6 +817,19 @@ unsigned long next_timer_interrupt(void) #endif +#ifndef CONFIG_VIRT_CPU_ACCOUNTING +void account_process_tick(struct task_struct *p, int user_tick) +{ + if (user_tick) { + account_user_time(p, jiffies_to_cputime(1)); + account_user_time_scaled(p, jiffies_to_cputime(1)); + } else { + account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); + account_system_time_scaled(p, jiffies_to_cputime(1)); + } +} +#endif + /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. @@ -827,13 +840,7 @@ void update_process_times(int user_tick) int cpu = smp_processor_id(); /* Note: this timer irq context must be accounted for as well. */ - if (user_tick) { - account_user_time(p, jiffies_to_cputime(1)); - account_user_time_scaled(p, jiffies_to_cputime(1)); - } else { - account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); - account_system_time_scaled(p, jiffies_to_cputime(1)); - } + account_process_tick(p, user_tick); run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); -- cgit v1.2.1 From 19978ca610946ed57c071bad63f8f6642ca1298b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: sched: reintroduce SMP tunings again Yanmin Zhang reported an aim7 regression and bisected it down to: | commit 38ad464d410dadceda1563f36bdb0be7fe4c8938 | Author: Ingo Molnar | Date: Mon Oct 15 17:00:02 2007 +0200 | | sched: uniform tunings | | use the same defaults on both UP and SMP. fix this by reintroducing similar SMP tunings again. This resolves the regression. (also update the comments to match the ilog2(nr_cpus) tuning effect) Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++++++++++++++++++ kernel/sched_fair.c | 18 +++++++++--------- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3f6bd1112900..69cae271c63b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4992,6 +4992,32 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. 
+ * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static inline void sched_init_granularity(void) +{ + unsigned int factor = 1 + ilog2(num_online_cpus()); + const unsigned long limit = 200000000; + + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; + + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_wakeup_granularity *= factor; + sysctl_sched_batch_wakeup_granularity *= factor; +} + #ifdef CONFIG_SMP /* * This is how migration works: @@ -6688,10 +6714,12 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); + sched_init_granularity(); } #else void __init sched_init_smp(void) { + sched_init_granularity(); } #endif /* CONFIG_SMP */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c495dcf7031b..7264814ba62a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -22,7 +22,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 20ms, units: nanoseconds) + * (default: 20ms * ilog(ncpus), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -32,18 +32,18 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -const_debug unsigned int sysctl_sched_latency = 20000000ULL; +unsigned int sysctl_sched_latency = 20000000ULL; /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 1 msec, units: nanoseconds) + * (default: 1 msec * ilog(ncpus), units: nanoseconds) */ -const_debug unsigned int sysctl_sched_min_granularity = 1000000ULL; +unsigned int sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -const_debug unsigned int sched_nr_latency = 20; +unsigned int sched_nr_latency = 20; /* * After fork, child runs first. (default) If set to 0 then @@ -61,23 +61,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_BATCH wake-up granularity. - * (default: 10 msec, units: nanoseconds) + * (default: 10 msec * ilog(ncpus), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec, units: nanoseconds) + * (default: 10 msec * ilog(ncpus), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 10000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -- cgit v1.2.1 From d6322faf296ab5bbb597f8b0abcb50153754cd08 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: sched: cleanup, use NSEC_PER_MSEC and NSEC_PER_SEC 1) hardcoded 1000000000 value is used five times in places where NSEC_PER_SEC might be more readable. 
2) A conversion from nsec to msec uses the hardcoded 1000000 value, which is a candidate for NSEC_PER_MSEC. no code changed: text data bss dec hex filename 44359 3326 36 47721 ba69 sched.o.before 44359 3326 36 47721 ba69 sched.o.after Signed-off-by: Eric Dumazet Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- kernel/sysctl.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 69cae271c63b..387258cdd0b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,7 +75,7 @@ */ unsigned long long __attribute__((weak)) sched_clock(void) { - return (unsigned long long)jiffies * (1000000000 / HZ); + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); } /* @@ -99,8 +99,8 @@ unsigned long long __attribute__((weak)) sched_clock(void) /* * Some helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT @@ -7256,7 +7256,7 @@ static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft) spin_unlock_irqrestore(&cpu_rq(i)->lock, flags); } /* Convert from ns to ms */ - do_div(res, 1000000); + do_div(res, NSEC_PER_MSEC); return res; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6e3b63c06856..adddf682d4cb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -226,9 +226,9 @@ static struct ctl_table root_table[] = { #ifdef CONFIG_SCHED_DEBUG static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ -static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */ +static unsigned long max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ -static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ +static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { -- cgit v1.2.1 From 4e2947f12516d13446d6ffa1d9e4fbd33b1636fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: x86: make ipi_handler() always defined prepare for up_smp_call_function() to ensure that the 'func' pointer is unused. (which is related to a KVM build fix) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 9abbdf7562c5..3b20613325dc 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -139,13 +139,12 @@ struct set_mtrr_data { mtrr_type smp_type; }; -#ifdef CONFIG_SMP - static void ipi_handler(void *info) /* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. [RETURNS] Nothing. */ { +#ifdef CONFIG_SMP struct set_mtrr_data *data = info; unsigned long flags; @@ -168,9 +167,8 @@ static void ipi_handler(void *info) atomic_dec(&data->count); local_irq_restore(flags); -} - #endif +} static inline int types_compatible(mtrr_type type1, mtrr_type type2) { return type1 == MTRR_TYPE_UNCACHABLE || -- cgit v1.2.1 From 0492007ed9b53f6a2a2f983910d0fe7c97b09822 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: x86: make nmi_cpu_busy() always defined nmi_cpu_busy() must be available on !SMP too. 
this is in preparation to a smp_call_function_mask() fix. Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index f803ed0ed1c4..600fd404e440 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -51,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); static int endflag __initdata = 0; -#ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all * CPUs during the test make them busy. */ static __init void nmi_cpu_busy(void *data) { +#ifdef CONFIG_SMP local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, @@ -67,8 +67,8 @@ static __init void nmi_cpu_busy(void *data) care if they get somewhat less cycles. */ while (endflag == 0) mb(); -} #endif +} static int __init check_nmi_watchdog(void) { -- cgit v1.2.1 From a5fbb6d1064be885d2a6b82f625186753cf74848 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: KVM: fix !SMP build error fix a !SMP build error: drivers/kvm/kvm_main.c: In function 'kvm_flush_remote_tlbs': drivers/kvm/kvm_main.c:220: error: implicit declaration of function 'smp_call_function_mask' (and also avoid unused function warning related to up_smp_call_function() not making use of the 'func' parameter.) Signed-off-by: Ingo Molnar --- include/linux/smp.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index 259a13c3bd98..c25e66bcecf3 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -84,11 +84,12 @@ void smp_prepare_boot_cpu(void); * These macros fold the SMP functionality into a single CPU system */ #define raw_smp_processor_id() 0 -static inline int up_smp_call_function(void) +static inline int up_smp_call_function(void (*func)(void *), void *info) { return 0; } -#define smp_call_function(func,info,retry,wait) (up_smp_call_function()) +#define smp_call_function(func, info, retry, wait) \ + (up_smp_call_function(func, info)) #define on_each_cpu(func,info,retry,wait) \ ({ \ local_irq_disable(); \ @@ -107,6 +108,8 @@ static inline void smp_send_reschedule(int cpu) { } local_irq_enable(); \ 0; \ }) +#define smp_call_function_mask(mask, func, info, wait) \ + (up_smp_call_function(func, info)) #endif /* !SMP */ -- cgit v1.2.1 From 52d3da1ad4f442cec877fbeb83902707b56da0cf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: turn off PREEMPT_RESTRICT PREEMPT_RESTRICT was a method aimed at reducing the amount of wakeup related preemption. It has a disadvantage though, it can prevent legitimate wakeups if a task is 'unlucky' to be hit too early by a tick that clears peer_preempt. Now that the wakeup preemption has been cleaned up we dont seem to have excessive preemptions anymore, so this feature can be turned off. 
(and removed in the next patch) Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 387258cdd0b9..4b23dfb4c80f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -469,7 +469,7 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_TREE_AVG * 0 | SCHED_FEAT_APPROX_AVG * 0 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_PREEMPT_RESTRICT * 1; + SCHED_FEAT_PREEMPT_RESTRICT * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) -- cgit v1.2.1 From 3e3e13f399ac8060a20d14d210a28dc02dda372e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: remove PREEMPT_RESTRICT remove PREEMPT_RESTRICT. (this is a separate commit so that any regression related to the removal itself is bisectable) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 4 +--- kernel/sched_fair.c | 10 ++-------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 951759e30c09..93fd30d6dac4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -863,7 +863,6 @@ struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; unsigned int on_rq; - int peer_preempt; u64 exec_start; u64 sum_exec_runtime; diff --git a/kernel/sched.c b/kernel/sched.c index 4b23dfb4c80f..2a107e4ad5ed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -460,7 +460,6 @@ enum { SCHED_FEAT_TREE_AVG = 4, SCHED_FEAT_APPROX_AVG = 8, SCHED_FEAT_WAKEUP_PREEMPT = 16, - SCHED_FEAT_PREEMPT_RESTRICT = 32, }; const_debug unsigned int sysctl_sched_features = @@ -468,8 +467,7 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_TREE_AVG * 0 | SCHED_FEAT_APPROX_AVG * 0 | - SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_PREEMPT_RESTRICT * 0; + SCHED_FEAT_WAKEUP_PREEMPT * 1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7264814ba62a..fbcb426029d0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -546,7 +546,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { - se->peer_preempt = 0; #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -574,10 +573,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime || - (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) + if (delta_exec > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); - curr->peer_preempt = 0; } static void @@ -867,9 +864,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) gran = calc_delta_fair(gran, &se->load); if (delta > gran) { - int now = !sched_feat(PREEMPT_RESTRICT); - - if (now || p->prio < curr->prio || !se->peer_preempt++) + if (p->prio < curr->prio) resched_task(curr); } } @@ -1083,7 +1078,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) swap(curr->vruntime, se->vruntime); } - se->peer_preempt = 0; enqueue_task_fair(rq, p, 0); resched_task(rq->curr); } -- cgit v1.2.1 From 8bc6767acb3236e0345e99cf198168e60e7ae456 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: wakeup preemption fix wakeup preemption fix: do not make it dependent on 
p->prio. Preemption purely depends on ->vruntime. This improves preemption in mixed-nice-level workloads. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fbcb426029d0..a3badf52bba2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -863,10 +863,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se->load.weight != NICE_0_LOAD)) gran = calc_delta_fair(gran, &se->load); - if (delta > gran) { - if (p->prio < curr->prio) - resched_task(curr); - } + if (delta > gran) + resched_task(curr); } } -- cgit v1.2.1 From 77d9cc44b543fa831169e54c495ad06ef3a0c726 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: clean up the wakeup preempt check clean up the wakeup preemption check. No code changed: text data bss dec hex filename 44227 3326 36 47589 b9e5 sched.o.before 44227 3326 36 47589 b9e5 sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a3badf52bba2..d558716a9add 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -852,20 +852,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(p->policy == SCHED_BATCH)) return; - if (sched_feat(WAKEUP_PREEMPT)) { - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); - } - - delta = se->vruntime - pse->vruntime; - gran = sysctl_sched_wakeup_granularity; - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + if (!sched_feat(WAKEUP_PREEMPT)) + return; - if (delta > gran) - resched_task(curr); + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); } + + delta = se->vruntime - pse->vruntime; + gran = sysctl_sched_wakeup_granularity; + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); + + if (delta > gran) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.2.1 From 502d26b524d8980f3ed80d9aec398e85671a8160 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: clean up the wakeup preempt check, #2 clean up the preemption check to not use unnecessary 64-bit variables. 
This improves code size: text data bss dec hex filename 44227 3326 36 47589 b9e5 sched.o.before 44201 3326 36 47563 b9cb sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d558716a9add..6c361472cc74 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -837,7 +837,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - s64 delta, gran; + unsigned long gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -860,12 +860,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) pse = parent_entity(pse); } - delta = se->vruntime - pse->vruntime; gran = sysctl_sched_wakeup_granularity; if (unlikely(se->load.weight != NICE_0_LOAD)) gran = calc_delta_fair(gran, &se->load); - if (delta > gran) + if (pse->vruntime + gran < se->vruntime) resched_task(curr); } -- cgit v1.2.1 From 3c90e6e99b08f01d5684a3a07cceae6a543e4fa8 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: fix copy_namespace() <-> sched_fork() dependency in do_fork Sukadev Bhattiprolu reported a kernel crash with control groups. There are couple of problems discovered by Suka's test: - The test requires the cgroup filesystem to be mounted with atleast the cpu and ns options (i.e both namespace and cpu controllers are active in the same hierarchy). # mkdir /dev/cpuctl # mount -t cgroup -ocpu,ns none cpuctl (or simply) # mount -t cgroup none cpuctl -> Will activate all controllers in same hierarchy. - The test invokes clone() with CLONE_NEWNS set. This causes a a new child to be created, also a new group (do_fork->copy_namespaces->ns_cgroup_clone-> cgroup_clone) and the child is attached to the new group (cgroup_clone-> attach_task->sched_move_task). At this point in time, the child's scheduler related fields are uninitialized (including its on_rq field, which it has inherited from parent). As a result sched_move_task thinks its on runqueue, when it isn't. As a solution to this problem, I moved sched_fork() call, which initializes scheduler related fields on a new task, before copy_namespaces(). I am not sure though whether moving up will cause other side-effects. Do you see any issue? - The second problem exposed by this test is that task_new_fair() assumes that parent and child will be part of the same group (which needn't be as this test shows). As a result, cfs_rq->curr can be NULL for the child. The solution is to test for curr pointer being NULL in task_new_fair(). With the patch below, I could run ns_exec() fine w/o a crash. Reported-by: Sukadev Bhattiprolu Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- kernel/fork.c | 6 +++--- kernel/sched_fair.c | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 28a740151988..8ca1a14cdc8c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1123,6 +1123,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->blocked_on = NULL; /* not blocked yet */ #endif + /* Perform scheduler related setup. Assign this task to a CPU. 
*/ + sched_fork(p, clone_flags); + if ((retval = security_task_alloc(p))) goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) @@ -1212,9 +1215,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->ptrace_children); INIT_LIST_HEAD(&p->ptrace_list); - /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p, clone_flags); - /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible * on the tasklist. */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c361472cc74..d3c03070872d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1067,8 +1067,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) update_curr(cfs_rq); place_entity(cfs_rq, se, 1); + /* 'curr' will be NULL if the child belongs to a different group */ if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr->vruntime < se->vruntime) { + curr && curr->vruntime < se->vruntime) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. -- cgit v1.2.1 From b82d9fdd848abfbe7263a4ecd9bbb55e575100a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: avoid large irq-latencies in smp-balancing SMP balancing is done with IRQs disabled and can iterate the full rq. When rqs are large this can cause large irq-latencies. Limit the nr of iterations on each run. This fixes a scheduling latency regression reported by the -rt folks. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Tested-by: Gregory Haskins Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 15 ++++++++++----- kernel/sysctl.c | 8 ++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 93fd30d6dac4..2cc789fef711 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1466,6 +1466,7 @@ extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; +extern unsigned int sysctl_sched_nr_migrate; int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index 2a107e4ad5ed..e195a4229418 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -471,6 +471,12 @@ const_debug unsigned int sysctl_sched_features = #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. 
+ */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): @@ -2235,7 +2241,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct rq_iterator *iterator) { - int pulled = 0, pinned = 0, skip_for_load; + int loops = 0, pulled = 0, pinned = 0, skip_for_load; struct task_struct *p; long rem_load_move = max_load_move; @@ -2249,10 +2255,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, */ p = iterator->start(iterator->arg); next: - if (!p) + if (!p || loops++ > sysctl_sched_nr_migrate) goto out; /* - * To help distribute high priority tasks accross CPUs we don't + * To help distribute high priority tasks across CPUs we don't * skip a task if it will be the highest priority task (i.e. smallest * prio value) on its new queue regardless of its load weight */ @@ -2269,8 +2275,7 @@ next: rem_load_move -= p->se.load.weight; /* - * We only want to steal up to the prescribed number of tasks - * and the prescribed amount of weighted load. + * We only want to steal up to the prescribed amount of weighted load. */ if (rem_load_move > 0) { if (p->prio < *this_best_prio) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index adddf682d4cb..3a1744fed2b6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -301,6 +301,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_nr_migrate", + .data = &sysctl_sched_nr_migrate, + .maxlen = sizeof(unsigned int), + .mode = 644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.1 From e6fe6649b4ec11aa3075e394b4d8743eebe1f64c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: proper prototype for kernel/sched.c:migration_init() This patch adds a proper prototype for migration_init() in include/linux/sched.h Since there's no point in always returning 0 to a caller that doesn't check the return value it also changes the function to return void. 
Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++++++ init/main.c | 4 +--- kernel/sched.c | 4 +--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2cc789fef711..ee800e7a70de 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1988,6 +1988,14 @@ static inline void inc_syscw(struct task_struct *tsk) } #endif +#ifdef CONFIG_SMP +void migration_init(void); +#else +static inline void migration_init(void) +{ +} +#endif + #endif /* __KERNEL__ */ #endif diff --git a/init/main.c b/init/main.c index f605a969ea61..80b04b6c5157 100644 --- a/init/main.c +++ b/init/main.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -747,11 +748,8 @@ __setup("nosoftlockup", nosoftlockup_setup); static void __init do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); -#ifdef CONFIG_SMP - extern int migration_init(void); migration_init(); -#endif spawn_ksoftirqd(); if (!nosoftlockup) spawn_softlockup_task(); diff --git a/kernel/sched.c b/kernel/sched.c index e195a4229418..b18f231a4875 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5650,7 +5650,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -int __init migration_init(void) +void __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -5660,8 +5660,6 @@ int __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - - return 0; } #endif -- cgit v1.2.1
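
The commits above fix the vslice weighting (10b77724), reintroduce sched_min_granularity while keeping sched_nr_latency as the latency/min_granularity ratio (b2be5e96), and rescale the tunables by 1 + ilog2(ncpus) on SMP (19978ca6). The small user-space sketch below strings that arithmetic together for a hypothetical 4-CPU box running 30 nice-0 tasks; it is an illustration only, not code taken from any of the patches. ilog2_u() and div_round_up() are plain-C stand-ins for the kernel's ilog2()/DIV_ROUND_UP() macros, NICE_0_LOAD is taken as 1024 (SCHED_LOAD_SCALE), and the division by nr_latency in __sched_period() is assumed from the surrounding logic, since the hunk quoted above cuts off before it.

/*
 * Standalone sketch of the latency/granularity/vslice arithmetic
 * established by the series above. Illustrative values only.
 */
#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024UL  /* SCHED_LOAD_SCALE (1 << 10) on kernels of this era */

static unsigned int ilog2_u(unsigned int n)      /* stand-in for ilog2() */
{
	unsigned int r = 0;
	while (n >>= 1)
		r++;
	return r;
}

static unsigned int div_round_up(unsigned int a, unsigned int b)  /* stand-in for DIV_ROUND_UP() */
{
	return (a + b - 1) / b;
}

int main(void)
{
	unsigned int ncpus = 4, nr_running = 30;
	unsigned long rq_weight = nr_running * NICE_0_LOAD;   /* all tasks at nice 0 */

	/* defaults from kernel/sched_fair.c, in nanoseconds */
	uint64_t latency = 20000000ULL;
	uint64_t min_granularity = 1000000ULL;
	const uint64_t limit = 200000000ULL;

	/* sched_init_granularity(): scale by 1 + ilog2(ncpus), clamp at 200ms */
	unsigned int factor = 1 + ilog2_u(ncpus);
	latency = (latency * factor > limit) ? limit : latency * factor;
	min_granularity = (min_granularity * factor > limit) ? limit
							     : min_granularity * factor;

	/* sched_nr_latency_handler(): keep the ratio maintained internally */
	unsigned int nr_latency = div_round_up((unsigned int)latency,
					       (unsigned int)min_granularity);

	/* __sched_period(): stretch the period once more than nr_latency tasks run */
	uint64_t period = latency;
	if (nr_running > nr_latency) {
		period *= nr_running;
		period /= nr_latency;  /* assumed; the do_div() is outside the quoted hunk */
	}

	/* __sched_vslice() after the NICE_0_LOAD fix */
	uint64_t vslice = period * NICE_0_LOAD / rq_weight;

	printf("factor=%u nr_latency=%u period=%llu ns vslice=%llu ns\n",
	       factor, nr_latency,
	       (unsigned long long)period, (unsigned long long)vslice);
	return 0;
}

With these defaults the sketch prints a factor of 3, an effective latency of 60 ms, nr_latency of 20, a stretched period of 90 ms, and a per-task vslice of 3 ms, i.e. the scaled minimum granularity, which matches the intent of keeping sched_nr_latency equal to latency/min_granularity and of no longer inflating initial slices by the missing NICE_0_LOAD factor.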