From 3c93717cfa51316e4dbb471e7c0f9d243359d5f8 Mon Sep 17 00:00:00 2001 From: "Alex,Shi" Date: Thu, 17 Jun 2010 14:08:13 +0800 Subject: sched: Fix over-scheduling bug Commit e70971591 ("sched: Optimize unused cgroup configuration") introduced an imbalanced scheduling bug. If we do not use CGROUP, function update_h_load won't update h_load. When the system has a large number of tasks far more than logical CPU number, the incorrect cfs_rq[cpu]->h_load value will cause load_balance() to pull too many tasks to the local CPU from the busiest CPU. So the busiest CPU keeps going in a round robin. That will hurt performance. The issue was found originally by a scientific calculation workload that developed by Yanmin. With that commit, the workload performance drops about 40%. CPU before after 00 : 2 : 7 01 : 1 : 7 02 : 11 : 6 03 : 12 : 7 04 : 6 : 6 05 : 11 : 7 06 : 10 : 6 07 : 12 : 7 08 : 11 : 6 09 : 12 : 6 10 : 1 : 6 11 : 1 : 6 12 : 6 : 6 13 : 2 : 6 14 : 2 : 6 15 : 1 : 6 Reviewed-by: Yanmin zhang Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra LKML-Reference: <1276754893.9452.5442.camel@debian> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2aaceebd484c..6c9e7c8735bf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1657,9 +1657,6 @@ static void update_shares(struct sched_domain *sd) static void update_h_load(long cpu) { - if (root_task_group_empty()) - return; - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -- cgit v1.2.3 From 8695159967957015f8dfb49315d6f88e111d90e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Jun 2010 11:44:53 +0200 Subject: sched: silence PROVE_RCU in sched_fork() Because cgroup_fork() is ran before sched_fork() [ from copy_process() ] and the child's pid is not yet visible the child is pinned to its cgroup. Therefore we can silence this warning. A nicer solution would be moving cgroup_fork() to right after dup_task_struct() and exclude PF_STARTING from task_subsys_state(). Signed-off-by: Peter Zijlstra Reviewed-by: Li Zefan Signed-off-by: Paul E. McKenney --- kernel/sched.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f8b8996228dd..a2d215d132f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2494,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. + */ + rcu_read_lock(); set_task_cpu(p, cpu); + rcu_read_unlock(); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) -- cgit v1.2.3 From 0d98bb2656e9bd2dfda2d089db1fe1dbdab41504 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 24 May 2010 12:11:43 -0700 Subject: sched: Prevent compiler from optimising the sched_avg_update() loop GCC 4.4.1 on ARM has been observed to replace the while loop in sched_avg_update with a call to uldivmod, resulting in the following build failure at link-time: kernel/built-in.o: In function `sched_avg_update': kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod' kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod' make: *** [.tmp_vmlinux1] Error 1 This patch introduces a fake data hazard to the loop body to prevent the compiler optimising the loop away. Signed-off-by: Will Deacon Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra Cc: Catalin Marinas Cc: Russell King Cc: Linus Torvalds Cc: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 6c9e7c8735bf..a24d6d5d83f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1254,6 +1254,12 @@ static void sched_avg_update(struct rq *rq) s64 period = sched_avg_period(); while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); rq->age_stamp += period; rq->rt_avg /= 2; } -- cgit v1.2.3 From 8c215bd3890c347dfb6a2db4779755f8b9c298a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Jul 2010 09:07:17 +0200 Subject: sched: Cure nr_iowait_cpu() users Commit 0224cf4c5e (sched: Intoduce get_cpu_iowait_time_us()) broke things by not making sure preemption was indeed disabled by the callers of nr_iowait_cpu() which took the iowait value of the current cpu. This resulted in a heap of preempt warnings. Cure this by making nr_iowait_cpu() take a cpu number and fix up the callers to pass in the right number. Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven Cc: Sergey Senozhatsky Cc: Rafael J. Wysocki Cc: Maxim Levitsky Cc: Len Brown Cc: Pavel Machek Cc: Jiri Slaby Cc: linux-pm@lists.linux-foundation.org LKML-Reference: <1277968037.1868.120.camel@laptop> Signed-off-by: Ingo Molnar --- drivers/cpuidle/governors/menu.c | 4 ++-- include/linux/sched.h | 2 +- kernel/sched.c | 4 ++-- kernel/time/tick-sched.c | 16 ++++++++-------- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 52ff8aa63f84..1b128702d300 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -143,7 +143,7 @@ static inline int which_bucket(unsigned int duration) * This allows us to calculate * E(duration)|iowait */ - if (nr_iowait_cpu()) + if (nr_iowait_cpu(smp_processor_id())) bucket = BUCKETS/2; if (duration < 10) @@ -175,7 +175,7 @@ static inline int performance_multiplier(void) mult += 2 * get_loadavg(); /* for IO wait tasks (per cpu!) we add 5x each */ - mult += 10 * nr_iowait_cpu(); + mult += 10 * nr_iowait_cpu(smp_processor_id()); return mult; } diff --git a/include/linux/sched.h b/include/linux/sched.h index f118809c953f..747fcaedddb7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -139,7 +139,7 @@ extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); -extern unsigned long nr_iowait_cpu(void); +extern unsigned long nr_iowait_cpu(int cpu); extern unsigned long this_cpu_load(void); diff --git a/kernel/sched.c b/kernel/sched.c index a24d6d5d83f6..f87abe3b0176 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2864,9 +2864,9 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_iowait_cpu(void) +unsigned long nr_iowait_cpu(int cpu) { - struct rq *this = this_rq(); + struct rq *this = cpu_rq(cpu); return atomic_read(&this->nr_iowait); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc1c034..1a6f828e57a0 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now) * Updates the per cpu time idle statistics counters */ static void -update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - if (nr_iowait_cpu() > 0) + if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); ts->idle_entrytime = now; } @@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { ktime_t now; now = ktime_get(); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_entrytime = now; ts->idle_active = 1; @@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->idle_sleeptime); } @@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->iowait_sleeptime); } @@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ ts->inidle = 1; - now = tick_nohz_start_idle(ts); + now = tick_nohz_start_idle(cpu, ts); /* * If this cpu is offline and it is the one which updates -- cgit v1.2.3