From fabf318e5e4bda0aca2b0d617b191884fda62703 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Jan 2010 21:04:57 +0100 Subject: sched: Fix fork vs hotplug vs cpuset namespaces There are a number of issues: 1) TASK_WAKING vs cgroup_clone (cpusets) copy_process(): sched_fork() child->state = TASK_WAKING; /* waiting for wake_up_new_task() */ if (current->nsproxy != p->nsproxy) ns_cgroup_clone() cgroup_clone() mutex_lock(inode->i_mutex) mutex_lock(cgroup_mutex) cgroup_attach_task() ss->can_attach() ss->attach() [ -> cpuset_attach() ] cpuset_attach_task() set_cpus_allowed_ptr(); while (child->state == TASK_WAKING) cpu_relax(); will deadlock the system. 2) cgroup_clone (cpusets) vs copy_process So even if the above would work we still have: copy_process(): if (current->nsproxy != p->nsproxy) ns_cgroup_clone() cgroup_clone() mutex_lock(inode->i_mutex) mutex_lock(cgroup_mutex) cgroup_attach_task() ss->can_attach() ss->attach() [ -> cpuset_attach() ] cpuset_attach_task() set_cpus_allowed_ptr(); ... p->cpus_allowed = current->cpus_allowed over-writing the modified cpus_allowed. 3) fork() vs hotplug if we unplug the child's cpu after the sanity check when the child gets attached to the task_list but before wake_up_new_task() shit will meet with fan. Solve all these issues by moving fork cpu selection into wake_up_new_task(). Reported-by: Serge E. Hallyn Tested-by: Serge E. Hallyn Signed-off-by: Peter Zijlstra LKML-Reference: <1264106190.4283.1314.camel@laptop> Signed-off-by: Thomas Gleixner --- kernel/fork.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 5b2959b3ffc2..f88bd984df35 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); - /* - * The task hasn't been attached yet, so its cpus_allowed mask will - * not be changed, nor will its assigned CPU. - * - * The cpus_allowed mask of the parent may have changed after it was - * copied first time - so re-copy it here, then check the child's CPU - * to ensure it is on a valid CPU (and if not, just force it back to - * parent's CPU). This avoids alot of nasty races. - */ - p->cpus_allowed = current->cpus_allowed; - p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || - !cpu_online(task_cpu(p)))) - set_task_cpu(p, smp_processor_id()); - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; -- cgit v1.2.1 From d11c563dd20ff35da5652c3e1c989d9e10e1d6d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Feb 2010 17:04:50 -0800 Subject: sched: Use lockdep-based checking on rcu_dereference() Update the rcu_dereference() usages to take advantage of the new lockdep-based checking. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1266887105-1528-6-git-send-email-paulmck@linux.vnet.ibm.com> [ -v2: fix allmodconfig missing symbol export build failure on x86 ] Signed-off-by: Ingo Molnar --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index f88bd984df35..17bbf093356d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,6 +86,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +EXPORT_SYMBOL_GPL(tasklist_lock); int nr_processes(void) { -- cgit v1.2.1 From db1466b3e1bd1727375cdbfcbea4bcce2f860f61 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 07:46:56 -0800 Subject: rcu: Use wrapper function instead of exporting tasklist_lock Lockdep-RCU commit d11c563d exported tasklist_lock, which is not a good thing. This patch instead exports a function that uses lockdep to check whether tasklist_lock is held. Suggested-by: Christoph Hellwig Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: Christoph Hellwig LKML-Reference: <1267631219-8713-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/fork.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356d..8691c540a470 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,7 +86,14 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -EXPORT_SYMBOL_GPL(tasklist_lock); + +#ifdef CONFIG_PROVE_RCU +int lockdep_tasklist_lock_is_held(void) +{ + return lockdep_is_held(&tasklist_lock); +} +EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); +#endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { -- cgit v1.2.1 From d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:39 -0800 Subject: mm: clean up mm_counter Presently, per-mm statistics counter is defined by macro in sched.h This patch modifies it to - defined in mm.h as inlinf functions - use array instead of macro's name creation. This patch is for reducing patch size in future patch to modify implementation of per-mm counter. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356d..7616bcf107b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,8 +455,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; - set_mm_counter(mm, file_rss, 0); - set_mm_counter(mm, anon_rss, 0); + memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; -- cgit v1.2.1 From 5beb49305251e5669852ed541e8e2f2f7696c53e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 5 Mar 2010 13:42:07 -0800 Subject: mm: change anon_vma linking to fix multi-process server scalability issue The old anon_vma code can lead to scalability issues with heavily forking workloads. Specifically, each anon_vma will be shared between the parent process and all its child processes. In a workload with 1000 child processes and a VMA with 1000 anonymous pages per process that get COWed, this leads to a system with a million anonymous pages in the same anon_vma, each of which is mapped in just one of the 1000 processes. However, the current rmap code needs to walk them all, leading to O(N) scanning complexity for each page. This can result in systems where one CPU is walking the page tables of 1000 processes in page_referenced_one, while all other CPUs are stuck on the anon_vma lock. This leads to catastrophic failure for a benchmark like AIM7, where the total number of processes can reach in the tens of thousands. Real workloads are still a factor 10 less process intensive than AIM7, but they are catching up. This patch changes the way anon_vmas and VMAs are linked, which allows us to associate multiple anon_vmas with a VMA. At fork time, each child process gets its own anon_vmas, in which its COWed pages will be instantiated. The parents' anon_vma is also linked to the VMA, because non-COWed pages could be present in any of the children. This reduces rmap scanning complexity to O(1) for the pages of the 1000 child processes, with O(N) complexity for at most 1/N pages in the system. This reduces the average scanning cost in heavily forking workloads from O(N) to 2. The only real complexity in this patch stems from the fact that linking a VMA to anon_vmas now involves memory allocations. This means vma_adjust can fail, if it needs to attach a VMA to anon_vma structures. This in turn means error handling needs to be added to the calling functions. A second source of complexity is that, because there can be multiple anon_vmas, the anon_vma linking in vma_adjust can no longer be done under "the" anon_vma lock. To prevent the rmap code from walking up an incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h to make sure it is impossible to compile a kernel that needs both symbolic values for the same bitflag. Some test results: Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test box with 16GB RAM and not quite enough IO), the system ends up running >99% in system time, with every CPU on the same anon_vma lock in the pageout code. With these changes, AIM7 hits the cross-over point around 29.7k users. This happens with ~99% IO wait time, there never seems to be any spike in system time. The anon_vma lock contention appears to be resolved. [akpm@linux-foundation.org: cleanups] Signed-off-by: Rik van Riel Cc: KOSAKI Motohiro Cc: Larry Woodman Cc: Lee Schermerhorn Cc: Minchan Kim Cc: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 7616bcf107b9..bab7b254ad39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -329,15 +329,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); + if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; tmp->vm_next = NULL; - anon_vma_link(tmp); file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; @@ -392,6 +394,8 @@ out: flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; +fail_nomem_anon_vma_fork: + mpol_put(pol); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: -- cgit v1.2.1 From 78d7d407b62a021e6d2e8dc24c0b90e390ab58a1 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 5 Mar 2010 13:42:54 -0800 Subject: kernel core: use helpers for rlimits Make sure compiler won't do weird things with limits. E.g. fetching them twice may return 2 different values after writable limits are implemented. I.e. either use rlimit helpers added in commit 3e10e716abf3 ("resource: add helpers for fetching rlimits") or ACCESS_ONCE if not applicable. Signed-off-by: Jiri Slaby Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index bab7b254ad39..b0ec34abc0bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -828,6 +828,8 @@ void __cleanup_sighand(struct sighand_struct *sighand) */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { + unsigned long cpu_limit; + /* Thread group counters. */ thread_group_cputime_init(sig); @@ -842,9 +844,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) sig->cputime_expires.virt_exp = cputime_zero; sig->cputime_expires.sched_exp = 0; - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + if (cpu_limit != RLIM_INFINITY) { + sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); sig->cputimer.running = 1; } @@ -1037,7 +1039,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= - p->signal->rlim[RLIMIT_NPROC].rlim_cur) { + task_rlimit(p, RLIMIT_NPROC)) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER) goto bad_fork_free; -- cgit v1.2.1 From a56704ef6b0c5796c9ff38cc78aa232dfb9644d7 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 10 Mar 2010 15:23:01 -0800 Subject: copy_signal() cleanup: use zalloc and remove initializations Use kmem_cache_zalloc() on signal creation and remove unneeded initialization lines in copy_signal(). Signed-off-by: Veaceslav Falico Acked-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index b0ec34abc0bb..ce2666f84d85 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -863,7 +863,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_THREAD) return 0; - sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; @@ -871,46 +871,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->count, 1); atomic_set(&sig->live, 1); init_waitqueue_head(&sig->wait_chldexit); - sig->flags = 0; if (clone_flags & CLONE_NEWPID) sig->flags |= SIGNAL_UNKILLABLE; - sig->group_exit_code = 0; - sig->group_exit_task = NULL; - sig->group_stop_count = 0; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->leader = 0; /* session leadership doesn't inherit */ - sig->tty_old_pgrp = NULL; - sig->tty = NULL; - - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; - sig->gtime = cputime_zero; - sig->cgtime = cputime_zero; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - sig->prev_utime = sig->prev_stime = cputime_zero; -#endif - sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; - sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; - sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->maxrss = sig->cmaxrss = 0; - task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; - taskstats_tgid_init(sig); - task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); - acct_init_pacct(&sig->pacct); - tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; -- cgit v1.2.1 From 93c59907c6f247d09239135caecf294a106a2ae0 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 10 Mar 2010 15:23:03 -0800 Subject: copy_signal() cleanup: clean thread_group_cputime_init() Remove unneeded initializations in thread_group_cputime_init() and in posix_cpu_timers_init_group(). They are useless after kmem_cache_zalloc() was used in copy_signal(). Signed-off-by: Veaceslav Falico Acked-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index ce2666f84d85..1beb6c303c41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -833,17 +833,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) /* Thread group counters. */ thread_group_cputime_init(sig); - /* Expiration times and increments. */ - sig->it[CPUCLOCK_PROF].expires = cputime_zero; - sig->it[CPUCLOCK_PROF].incr = cputime_zero; - sig->it[CPUCLOCK_VIRT].expires = cputime_zero; - sig->it[CPUCLOCK_VIRT].incr = cputime_zero; - - /* Cached expiration times. */ - sig->cputime_expires.prof_exp = cputime_zero; - sig->cputime_expires.virt_exp = cputime_zero; - sig->cputime_expires.sched_exp = 0; - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); -- cgit v1.2.1 From a3a2e76c77fa22b114e421ac11dec0c56c3503fb Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 6 Apr 2010 14:34:42 -0700 Subject: mm: avoid null-pointer deref in sync_mm_rss() - We weren't zeroing p->rss_stat[] at fork() - Consequently sync_mm_rss() was dereferencing tsk->mm for kernel threads and was oopsing. - Make __sync_task_rss_stat() static, too. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=15648 [akpm@linux-foundation.org: remove the BUG_ON(!mm->rss)] Reported-by: Troels Liebe Bentsen Signed-off-by: KAMEZAWA Hiroyuki "Michael S. Tsirkin" Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 4799c5f0e6d0..44b0791b0a2e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1052,6 +1052,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; #endif +#if defined(SPLIT_RSS_COUNTING) + memset(&p->rss_stat, 0, sizeof(p->rss_stat)); +#endif p->default_timer_slack_ns = current->timer_slack_ns; -- cgit v1.2.1