From 1799e35d5baab6e06168b46cc78b968e728ea3d1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 19 Sep 2007 23:34:46 +0200
Subject: sched: add /proc/sys/kernel/sched_compat_yield

add /proc/sys/kernel/sched_compat_yield to make sys_sched_yield()
more aggressive, by moving the yielding task to the last position
in the rbtree.

with sched_compat_yield=0:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
 2539 mingo     20   0  1576  252  204 R   50  0.0   0:02.03 loop_yield
 2541 mingo     20   0  1576  244  196 R   50  0.0   0:02.05 loop

with sched_compat_yield=1:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
 2584 mingo     20   0  1576  248  196 R   99  0.0   0:52.45 loop
 2582 mingo     20   0  1576  256  204 R    0  0.0   0:00.00 loop_yield

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
---
 kernel/sched.c      |  5 +----
 kernel/sched_fair.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sysctl.c     |  8 +++++++
 3 files changed, 66 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index deeb1f8e0c30..63e0971c8fbb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4550,10 +4550,7 @@ asmlinkage long sys_sched_yield(void)
 	struct rq *rq = this_rq_lock();
 
 	schedstat_inc(rq, yld_cnt);
-	if (unlikely(rq->nr_running == 1))
-		schedstat_inc(rq, yld_act_empty);
-	else
-		current->sched_class->yield_task(rq, current);
+	current->sched_class->yield_task(rq, current);
 
 	/*
 	 * Since we are going to call schedule() anyway, there's
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 892616bf2c77..c9fbe8e73a45 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -42,6 +42,14 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL;
  */
 unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;
 
+/*
+ * sys_sched_yield() compat mode
+ *
+ * This option switches the aggressive yield implementation of the
+ * old scheduler back on.
+ */
+unsigned int __read_mostly sysctl_sched_compat_yield;
+
 /*
  * SCHED_BATCH wake-up granularity.
  * (default: 25 msec, units: nanoseconds)
@@ -897,19 +905,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 }
 
 /*
- * sched_yield() support is very simple - we dequeue and enqueue
+ * sched_yield() support is very simple - we dequeue and enqueue.
+ *
+ * If compat_yield is turned on then we requeue to the end of the tree.
  */
 static void yield_task_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+	struct sched_entity *rightmost, *se = &p->se;
+	struct rb_node *parent;
 
-	__update_rq_clock(rq);
 	/*
-	 * Dequeue and enqueue the task to update its
-	 * position within the tree:
+	 * Are we the only task in the tree?
+	 */
+	if (unlikely(cfs_rq->nr_running == 1))
+		return;
+
+	if (likely(!sysctl_sched_compat_yield)) {
+		__update_rq_clock(rq);
+		/*
+		 * Dequeue and enqueue the task to update its
+		 * position within the tree:
+		 */
+		dequeue_entity(cfs_rq, &p->se, 0);
+		enqueue_entity(cfs_rq, &p->se, 0);
+
+		return;
+	}
+	/*
+	 * Find the rightmost entry in the rbtree:
 	 */
-	dequeue_entity(cfs_rq, &p->se, 0);
-	enqueue_entity(cfs_rq, &p->se, 0);
+	do {
+		parent = *link;
+		link = &parent->rb_right;
+	} while (*link);
+
+	rightmost = rb_entry(parent, struct sched_entity, run_node);
+	/*
+	 * Already in the rightmost position?
+	 */
+	if (unlikely(rightmost == se))
+		return;
+
+	/*
+	 * Minimally necessary key value to be last in the tree:
+	 */
+	se->fair_key = rightmost->fair_key + 1;
+
+	if (cfs_rq->rb_leftmost == &se->run_node)
+		cfs_rq->rb_leftmost = rb_next(&se->run_node);
+	/*
+	 * Relink the task to the rightmost position:
+	 */
+	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+	rb_link_node(&se->run_node, parent, link);
+	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6ace893c17c9..53a456ebf6d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -303,6 +303,14 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_compat_yield",
+		.data		= &sysctl_sched_compat_yield,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
#ifdef CONFIG_PROVE_LOCKING
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-- cgit v1.2.1
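The loop/loop_yield pair from the top output is easy to recreate. A minimal
userspace sketch (not part of the patch; the file name and argv convention
are illustrative): run two instances pinned to one CPU, e.g. with taskset,
one of them with "yield" as its argument, and toggle the new sysctl via
echo 1 > /proc/sys/kernel/sched_compat_yield between runs.

/* loop.c - busy loop; with "yield" as argv[1] it calls sched_yield()
 * on every iteration, mirroring the loop_yield process above. */
#include <sched.h>
#include <string.h>

int main(int argc, char **argv)
{
	int yield = (argc > 1 && strcmp(argv[1], "yield") == 0);

	for (;;) {
		if (yield)
			sched_yield();	/* behaviour under test */
	}
	return 0;	/* never reached */
}

With sched_compat_yield=1 the yielding task is requeued behind the
rightmost entity, so it only runs once its competitor has nothing left
to consume, which matches the 99%/0% split in the top output above.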
From 9c95e7319ba98585ebb6d304eca2d56f401ed70c Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto
Date: Wed, 19 Sep 2007 23:34:46 +0200
Subject: sched: fix invalid sched_class use

When using rt_mutex, a NULL pointer dereference occurs in
enqueue_task_rt. Here is the scenario:

1) There are two threads: thread A is fair_sched_class and
   thread B is rt_sched_class.
2) Thread A is boosted up to rt_sched_class, because it holds an
   rt_mutex lock that thread B is waiting on.
3) If thread A creates a new thread C at this point, thread C
   inherits rt_sched_class.
4) In wake_up_new_task() the priority of thread C is outside the
   RT priority range, because the normal priority of thread A is
   not an RT priority.

This corrupts data by overflowing the rt_prio_array. The new thread C
should be fair_sched_class: a new thread must have a valid scheduler
class before it is queued. This patch sets the suitable scheduler
class explicitly.

Signed-off-by: Hiroshi Shimamoto
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
---
 kernel/sched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 63e0971c8fbb..6107a0cd6325 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1682,6 +1682,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 
 	p->prio = effective_prio(p);
 
+	if (rt_prio(p->prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
+
 	if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
 			(clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
 			!current->se.on_rq) {
-- cgit v1.2.1
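A hedged userspace sketch of the four-step scenario above, for reference
only: a SCHED_OTHER thread takes a PTHREAD_PRIO_INHERIT mutex (an rt_mutex
in the kernel), a SCHED_FIFO thread blocks on it so the owner gets
PI-boosted, and the owner then creates a new thread while still boosted.
The priority value and sleep-based timing are illustrative, it needs root
(or CAP_SYS_NICE) for SCHED_FIFO, and whether the pre-fix kernel actually
crashes is timing-dependent.

#include <pthread.h>
#include <sched.h>
#include <unistd.h>

static pthread_mutex_t pi_lock;

static void *thread_c(void *arg)	/* the newly created thread */
{
	return NULL;
}

static void *thread_b(void *arg)	/* SCHED_FIFO waiter */
{
	pthread_mutex_lock(&pi_lock);	/* blocks; PI-boosts the owner */
	pthread_mutex_unlock(&pi_lock);
	return NULL;
}

int main(void)				/* plays the role of thread A */
{
	pthread_mutexattr_t ma;
	pthread_attr_t ba;
	struct sched_param sp = { .sched_priority = 10 };
	pthread_t b, c;

	pthread_mutexattr_init(&ma);
	pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&pi_lock, &ma);

	pthread_mutex_lock(&pi_lock);		/* A owns the rt_mutex */

	pthread_attr_init(&ba);
	pthread_attr_setinheritsched(&ba, PTHREAD_EXPLICIT_SCHED);
	pthread_attr_setschedpolicy(&ba, SCHED_FIFO);
	pthread_attr_setschedparam(&ba, &sp);
	pthread_create(&b, &ba, thread_b, NULL);/* B blocks, A boosted */

	sleep(1);				/* let B block on the lock */
	pthread_create(&c, NULL, thread_c, NULL);/* C created while boosted */

	pthread_mutex_unlock(&pi_lock);
	pthread_join(b, NULL);
	pthread_join(c, NULL);
	return 0;
}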
From b8fceee17a310f189188599a8fa5e9beaff57eb0 Mon Sep 17 00:00:00 2001
From: Davide Libenzi
Date: Thu, 20 Sep 2007 12:40:16 -0700
Subject: signalfd simplification

This simplifies the signalfd code by no longer keeping the signalfd
attached to the sighand for its whole lifetime. This way, the signalfd
remains attached to the sighand only during poll(2) (and select and
epoll) and read(2). This also allows removing all the custom
"tsk == current" checks in kernel/signal.c, since dequeue_signal() will
only be called by "current". I think this is also what Ben was
suggesting some time ago.

The external effect of this is that a thread can extract only its own
private signals and the group ones. I think this is acceptable
behaviour, in that those are the signals the thread would be able to
fetch without signalfd.

Signed-off-by: Davide Libenzi
Signed-off-by: Linus Torvalds
---
 kernel/exit.c   | 9 ---------
 kernel/fork.c   | 2 +-
 kernel/signal.c | 8 +++-----
 3 files changed, 4 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 06b24b3aa370..993369ee94d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,7 +24,6 @@
 #include
 #include
 #include
-#include <linux/signalfd.h>
 #include
 #include
 #include
@@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
 	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
 
-	/*
-	 * Notify that this sighand has been detached. This must
-	 * be called with the tsk->sighand lock held. Also, this
-	 * access tsk->sighand internally, so it must be called
-	 * before tsk->sighand is reset.
-	 */
-	signalfd_detach_locked(tsk);
-
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7332e236d367..33f12f48684a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep,
 	struct sighand_struct *sighand = data;
 
 	spin_lock_init(&sighand->siglock);
-	INIT_LIST_HEAD(&sighand->signalfd_list);
+	init_waitqueue_head(&sighand->signalfd_wqh);
 }
 
 void __init proc_caches_init(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 3169bed0b4d0..9fb91a32edda 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -378,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	/* We only dequeue private signals from ourselves, we don't let
 	 * signalfd steal them
 	 */
-	if (likely(tsk == current))
-		signr = __dequeue_signal(&tsk->pending, mask, info);
+	signr = __dequeue_signal(&tsk->pending, mask, info);
 	if (!signr) {
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
 					 mask, info);
@@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 			}
 		}
 	}
-	if (likely(tsk == current))
-		recalc_sigpending();
+	recalc_sigpending();
 	if (signr && unlikely(sig_kernel_stop(signr))) {
 		/*
 		 * Set a marker that we have dequeued a stop signal. Our
@@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
 			tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
 	}
-	if (signr && likely(tsk == current) &&
+	if (signr &&
 	    ((info->si_code & __SI_MASK) == __SI_TIMER) &&
 	    info->si_sys_private){
 		/*
-- cgit v1.2.1
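The resulting semantics are visible from userspace. A minimal sketch
(assuming a libc that wraps signalfd(2); error handling omitted): the
reading thread dequeues exactly what it could have fetched with
sigwaitinfo(), its own private signals plus the group-wide ones.

#include <sys/signalfd.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	sigset_t mask;
	struct signalfd_siginfo ssi;
	int sfd;

	sigemptyset(&mask);
	sigaddset(&mask, SIGUSR1);
	sigprocmask(SIG_BLOCK, &mask, NULL);	/* must block what we read */

	sfd = signalfd(-1, &mask, 0);
	kill(getpid(), SIGUSR1);		/* a group-wide signal */

	if (read(sfd, &ssi, sizeof(ssi)) == sizeof(ssi))
		printf("got signal %d\n", ssi.ssi_signo);
	close(sfd);
	return 0;
}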
From b7e113dc9d52c4a37d2da6fafe77959f3a28eccf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 22 Sep 2007 22:29:06 +0000
Subject: clockevents: remove the suspend/resume workaround^Wthinko

In a desperate attempt to fix the suspend/resume problem on Andrew's
VAIO I added a workaround which enforced the broadcast of the oneshot
timer on resume. This actually resolved the problem on the VAIO, but
it was just a stupid workaround that did not tackle the root cause:
the assignment of lower idle C-states in the ACPI processor_idle code.
The cpuidle patches, which utilize the dynamic tick feature and enter
deeper C-states sooner, exposed the problem again.

The correct solution is the previous patch, which prevents lower
C-states across suspend/resume.

Remove the enforcement code, including the conditional broadcast timer
arming, which helped to paper over the real problem for quite a while.
The oneshot broadcast flag for the CPU which runs the resume code can
never be set at the time this code is executed. It only gets set when
the CPU enters a lower idle C-state.

Signed-off-by: Thomas Gleixner
Tested-by: Andrew Morton
Cc: Len Brown
Cc: Venkatesh Pallipadi
Cc: Rafael J. Wysocki
Signed-off-by: Linus Torvalds
---
 kernel/time/tick-broadcast.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index aab881c86a1a..0962e0577660 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -382,23 +382,8 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
 
 int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
-	int cpu = smp_processor_id();
-
-	/*
-	 * If the CPU is marked for broadcast, enforce oneshot
-	 * broadcast mode. The jinxed VAIO does not resume otherwise.
-	 * No idea why it ends up in a lower C State during resume
-	 * without notifying the clock events layer.
-	 */
-	if (cpu_isset(cpu, tick_broadcast_mask))
-		cpu_set(cpu, tick_broadcast_oneshot_mask);
-
 	clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
-
-	if(!cpus_empty(tick_broadcast_oneshot_mask))
-		tick_broadcast_set_event(ktime_get(), 1);
-
-	return cpu_isset(cpu, tick_broadcast_oneshot_mask);
+	return 0;
 }
 
 /*
-- cgit v1.2.1
From 459685c75b82a0431da102365d507fdb72858b84 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 26 Sep 2007 01:54:12 +0100
Subject: hibernation doesn't even build on frv - tons of helpers are missing

Signed-off-by: Al Viro
Acked-By: David Howells
Signed-off-by: Linus Torvalds
---
 kernel/power/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index c8580a1e6873..14b0e10dc95c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -110,7 +110,7 @@ config SUSPEND
 
 config HIBERNATION_UP_POSSIBLE
 	bool
-	depends on X86 || PPC64_SWSUSP || FRV || PPC32
+	depends on X86 || PPC64_SWSUSP || PPC32
 	depends on !SMP
 	default y
-- cgit v1.2.1
From 4047727e5ae33f9b8d2b7766d1994ea6e5ec2991 Mon Sep 17 00:00:00 2001
From: Mark Lord
Date: Mon, 1 Oct 2007 01:20:10 -0700
Subject: Fix SMP poweroff hangs

We need to disable all CPUs other than the boot CPU (usually 0) before
attempting to power off modern SMP machines. This fixes the
hang-on-poweroff issue on my MythTV SMP box, and also on Thomas
Gleixner's new toybox.

Signed-off-by: Mark Lord
Acked-by: Thomas Gleixner
Cc: "Rafael J. Wysocki"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sys.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 1b33b05d346b..8ae2e636eb1b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <linux/cpu.h>
 #include
 #include
@@ -878,6 +879,7 @@ void kernel_power_off(void)
 	kernel_shutdown_prepare(SYSTEM_POWER_OFF);
 	if (pm_power_off_prepare)
 		pm_power_off_prepare();
+	disable_nonboot_cpus();
 	sysdev_shutdown();
 	printk(KERN_EMERG "Power down.\n");
 	machine_power_off();
-- cgit v1.2.1
From 9f96cb1e8bca179a92afa40dfc3c49990f1cfc71 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Mon, 1 Oct 2007 01:20:13 -0700
Subject: robust futex thread exit race

Calling handle_futex_death in exit_robust_list for the different robust
mutexes of a thread basically frees the mutex. Another thread might
grab the lock immediately, which updates the next pointer of the mutex.
fetch_robust_entry over the next pointer might therefore branch into
the robust mutex list of a different thread.

This can cause two problems: 1) some mutexes held by the dead thread
are never freed, and 2) some mutexes held by a different thread are
freed.

The next pointer therefore needs to be read before calling
handle_futex_death.

Signed-off-by: Martin Schwidefsky
Acked-by: Ingo Molnar
Acked-by: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/futex.c        | 26 ++++++++++++++++----------
 kernel/futex_compat.c | 28 ++++++++++++++++++----------
 2 files changed, 34 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index e8935b195e88..fcc94e7b4086 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1943,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
 void exit_robust_list(struct task_struct *curr)
 {
 	struct robust_list_head __user *head = curr->robust_list;
-	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+	struct robust_list __user *entry, *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
 	unsigned long futex_offset;
+	int rc;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
@@ -1965,11 +1966,13 @@ void exit_robust_list(struct task_struct *curr)
 	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
 		return;
 
-	if (pending)
-		handle_futex_death((void __user *)pending + futex_offset,
-				   curr, pip);
-
+	next_entry = NULL;	/* avoid warning with gcc */
 	while (entry != &head->list) {
+		/*
+		 * Fetch the next entry in the list before calling
+		 * handle_futex_death:
+		 */
+		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * don't process it twice:
@@ -1978,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr)
 			if (handle_futex_death((void __user *)entry + futex_offset,
 						curr, pi))
 				return;
-		/*
-		 * Fetch the next entry in the list:
-		 */
-		if (fetch_robust_entry(&entry, &entry->next, &pi))
+		if (rc)
 			return;
+		entry = next_entry;
+		pi = next_pi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
@@ -1991,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr)
 
 		cond_resched();
 	}
+
+	if (pending)
+		handle_futex_death((void __user *)pending + futex_offset,
+				   curr, pip);
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 7e52eb051f22..2c2e2954b713 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 void compat_exit_robust_list(struct task_struct *curr)
 {
 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
-	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	compat_uptr_t uentry, upending;
+	struct robust_list __user *entry, *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+	compat_uptr_t uentry, next_uentry, upending;
 	compat_long_t futex_offset;
+	int rc;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
@@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr)
 	if (fetch_robust_entry(&upending, &pending,
 			       &head->list_op_pending, &pip))
 		return;
-	if (pending)
-		handle_futex_death((void __user *)pending + futex_offset, curr, pip);
 
+	next_entry = NULL;	/* avoid warning with gcc */
 	while (entry != (struct robust_list __user *) &head->list) {
+		/*
+		 * Fetch the next entry in the list before calling
+		 * handle_futex_death:
+		 */
+		rc = fetch_robust_entry(&next_uentry, &next_entry,
+			(compat_uptr_t __user *)&entry->next, &next_pi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * don't process it twice:
@@ -74,12 +80,11 @@ void compat_exit_robust_list(struct task_struct *curr)
 				curr, pi))
 			return;
-		/*
-		 * Fetch the next entry in the list:
-		 */
-		if (fetch_robust_entry(&uentry, &entry,
-			(compat_uptr_t __user *)&entry->next, &pi))
+		if (rc)
 			return;
+		uentry = next_uentry;
+		entry = next_entry;
+		pi = next_pi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
@@ -88,6 +93,9 @@ void compat_exit_robust_list(struct task_struct *curr)
 
 		cond_resched();
 	}
+	if (pending)
+		handle_futex_death((void __user *)pending + futex_offset,
+				   curr, pip);
 }
 
 asmlinkage long
-- cgit v1.2.1
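Abstracted away from futexes, the fix is the classic safe-traversal
pattern below. A self-contained sketch: the release() stub stands in for
handle_futex_death(); once it has been called, the node may be grabbed by
another thread and its next pointer rewritten, so the next pointer must
be loaded before the visit, never after it.

#include <stddef.h>

struct node {
	struct node *next;
};

/* Stand-in for handle_futex_death(): after this call another thread
 * may take the lock and rewrite node->next under us. */
static void release(struct node *node)
{
	(void)node;
}

static void walk_and_release(struct node *head)
{
	struct node *entry = head;
	struct node *next_entry;

	while (entry) {
		next_entry = entry->next;	/* fetch next first ...  */
		release(entry);			/* ... then hand it over */
		entry = next_entry;		/* ... then advance      */
	}
}

int main(void)
{
	struct node c = { NULL }, b = { &c }, a = { &b };

	walk_and_release(&a);
	return 0;
}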
From 30084fbd1caa4b2e1a336fcdef60b68129d1d8f8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 2 Oct 2007 14:13:08 +0200
Subject: sched: fix profile=sleep

Fix sleep profiling - we lost this chunk in the CFS merge.

Found-by: Mel Gorman
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c9fbe8e73a45..67c67a87146e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -639,6 +639,16 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 		se->block_start = 0;
 		se->sum_sleep_runtime += delta;
+
+		/*
+		 * Blocking time is in units of nanosecs, so shift by 20 to
+		 * get a milliseconds-range estimation of the amount of
+		 * time that the task spent sleeping:
+		 */
+		if (unlikely(prof_on == SLEEP_PROFILING)) {
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
+				     delta >> 20);
+		}
 	}
 #endif
 }
-- cgit v1.2.1
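One detail worth spelling out: the shift by 20 divides the nanosecond
blocking time by 2^20 = 1048576 rather than by the exact 10^6, trading
roughly 5% accuracy for avoiding a 64-bit division in a hot path. A quick
sketch of the error (the 250 ms sample value is illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long long delta_ns = 250000000ULL;	/* 250 ms sleep */

	printf("exact : %llu ms\n", delta_ns / 1000000ULL);	/* 250 */
	printf("approx: %llu ms\n", delta_ns >> 20);		/* 238 */
	return 0;
}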