From 14a40ffccd6163bbcd1d6f32b28a88ffe6149fc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Mar 2013 13:45:20 -0700 Subject: sched: replace PF_THREAD_BOUND with PF_NO_SETAFFINITY PF_THREAD_BOUND was originally used to mark kernel threads which were bound to a specific CPU using kthread_bind(), and a task with the flag set allows cpus_allowed modifications only to itself. Workqueue is currently abusing it to prevent userland from meddling with cpus_allowed of workqueue workers. What we need is a flag to prevent userland from messing with cpus_allowed of certain kernel tasks. In kernel, anyone can (incorrectly) squash the flag, and, for worker-type usages, restricting cpus_allowed modification to the task itself doesn't provide meaningful extra protection as other tasks can inject work items to the task anyway. This patch replaces PF_THREAD_BOUND with PF_NO_SETAFFINITY. sched_setaffinity() checks the flag and returns -EINVAL if it is set. set_cpus_allowed_ptr() is no longer affected by the flag. This will allow simplifying workqueue worker CPU affinity management. Signed-off-by: Tejun Heo Acked-by: Ingo Molnar Reviewed-by: Lai Jiangshan Cc: Peter Zijlstra Cc: Thomas Gleixner --- kernel/sched/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..23606ee961b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4126,6 +4126,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4773,11 +4777,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ -- cgit v1.2.3
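The userland-visible contract of the new check is simple: sched_setaffinity(2) on a task flagged PF_NO_SETAFFINITY now fails up front with -EINVAL. A minimal userspace sketch to observe this against a workqueue worker — the kworker PID must be supplied by hand from ps, and error handling is trimmed:

/* Sketch: try to pin a kworker thread and watch the call fail.
 * Usage (as root): ./pin-kworker <pid-of-a-kworker-thread>
 * The PID argument comes from ps; no specific kworker is assumed. */
#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv)
{
	cpu_set_t set;
	pid_t pid;

	if (argc != 2)
		return 1;
	pid = (pid_t)atoi(argv[1]);

	CPU_ZERO(&set);
	CPU_SET(0, &set);	/* try to pin the thread to CPU 0 */

	if (sched_setaffinity(pid, sizeof(set), &set) < 0)
		/* With PF_NO_SETAFFINITY set, errno is EINVAL */
		printf("sched_setaffinity: %s\n", strerror(errno));
	else
		printf("affinity changed (task not flagged)\n");
	return 0;
}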
From 383efcd00053ec40023010ce5034bd702e7ab373 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 18 Mar 2013 12:22:34 -0700 Subject: sched: Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s try_to_wake_up_local() should only be invoked to wake up another task in the same runqueue, and BUG_ON()s are used to enforce the rule. A missed try_to_wake_up_local() can stall workqueue execution, but such stalls are likely to be finite, either by another work item being queued or by the blocked one getting unblocked. There's no reason to trigger a BUG while holding the rq lock, crashing the whole system. Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s. Signed-off-by: Tejun Heo Acked-by: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130318192234.GD3042@htj.dyndns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7b03cd2d4cd..306943f531a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1488,8 +1488,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { -- cgit v1.2.3 From fd9b86d37a600488dbd80fe60cca46b822bff1cd Mon Sep 17 00:00:00 2001 From: libin Date: Mon, 8 Apr 2013 14:39:12 +0800 Subject: sched/debug: Fix sd->*_idx limit range avoiding overflow Commit 201c373e8e ("sched/debug: Limit sd->*_idx range on sysctl") was an incomplete bug fix. This patch limits the sd->*_idx range to [0, CPU_LOAD_IDX_MAX-1], avoiding the array overflow caused by setting sd->*_idx to CPU_LOAD_IDX_MAX via sysctl. Signed-off-by: Libin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/51626610.2040607@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 306943f531a3..fa077929e315 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4933,7 +4933,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, -- cgit v1.2.3
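Why CPU_LOAD_IDX_MAX-1: the sd->*_idx values index rq->cpu_load[], an array with CPU_LOAD_IDX_MAX entries, so CPU_LOAD_IDX_MAX itself is one past the end. set_table_entry() hands min_load_idx and max_load_idx to proc_dointvec_minmax(), which enforces them as inclusive bounds on sysctl writes. A schematic of that wiring — a sketch, not the kernel's actual (dynamically built) table, with busy_idx standing in for a sched_domain's *_idx field:

/* Sketch: how proc_dointvec_minmax() clamps a sysctl to [min, max].
 * In the kernel the table entries are filled in by set_table_entry();
 * busy_idx here is a stand-in for sd->busy_idx and friends. */
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX - 1;	/* last valid rq->cpu_load[] index */
static int busy_idx;				/* the value being exported */

static struct ctl_table sd_ctl_sketch[] = {
	{
		.procname	= "busy_idx",
		.data		= &busy_idx,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_load_idx,	/* inclusive lower bound */
		.extra2		= &max_load_idx,	/* inclusive upper bound */
	},
	{ }
};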
From 28b4a521f618d9722bc780ea38b44718ce0fe283 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 5 Apr 2013 16:26:46 +0530 Subject: sched: Fix typo inside comment Fix typo: sched_domains_nume_distance -> sched_domains_numa_distance Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: patches@linaro.org Cc: robin.randhawa@arm.com Cc: Steve.Bannister@arm.com Cc: Liviu.Dudau@arm.com Cc: charles.garcia-tobin@arm.com Cc: arvind.chauhan@arm.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/cd8084746ac932106d6fa6be388b8f2d6aa9617c.1365159023.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 849deb96e61e..f5e1aa5b2684 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6252,7 +6252,7 @@ static void sched_init_numa(void) * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). * - * The sched_domains_nume_distance[] array includes the actual distance + * The sched_domains_numa_distance[] array includes the actual distance * numbers. */ -- cgit v1.2.3 From ee761f629d598579594d7e1eb8c552f3c5f71e4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Mar 2013 22:49:32 +0100 Subject: arch: Consolidate tsk_is_polling() Move it to a common place. Preparatory patch for implementing set/clear for the idle need_resched poll implementation. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Rusty Russell Cc: Paul McKenney Cc: Peter Zijlstra Reviewed-by: Srivatsa S. Bhat Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130321215233.446034505@linutronix.de Signed-off-by: Thomas Gleixner --- arch/alpha/include/asm/thread_info.h | 2 -- arch/ia64/include/asm/thread_info.h | 2 -- arch/metag/include/asm/thread_info.h | 2 -- arch/microblaze/include/asm/thread_info.h | 1 - arch/mn10300/include/asm/thread_info.h | 2 -- arch/openrisc/include/asm/thread_info.h | 2 -- arch/parisc/include/asm/thread_info.h | 2 -- arch/powerpc/include/asm/thread_info.h | 2 -- arch/sh/include/asm/thread_info.h | 2 -- arch/sparc/include/asm/thread_info_32.h | 2 -- arch/sparc/include/asm/thread_info_64.h | 2 -- arch/tile/include/asm/thread_info.h | 2 -- arch/x86/include/asm/thread_info.h | 2 -- include/linux/sched.h | 20 ++++++++++++++++++++ kernel/sched/core.c | 5 ----- 15 files changed, 20 insertions(+), 30 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 1f8c72959fb6..52cd2a4a3ff4 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -95,8 +95,6 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TS_POLLING 0x0010 /* idle task polling need_resched, skip sending interrupt */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 020d655ed082..cade13dd0299 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -131,8 +131,6 @@ struct thread_info { #define TS_POLLING 1 /* true if in idle loop and not sleeping */ #define TS_RESTORE_SIGMASK 2 /* restore signal mask in do_signal() */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h index 0ecd34d8b5f6..7c4a33006142 100644 --- a/arch/metag/include/asm/thread_info.h +++ b/arch/metag/include/asm/thread_info.h @@ -150,6 +150,4 @@ static inline int kstack_end(void *addr) #define _TIF_WORK_MASK (_TIF_ALLWORK_MASK & ~(_TIF_SYSCALL_TRACE | \ _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 008f30433d22..de26ea6373de 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -182,7 +182,6 @@ static inline bool test_and_clear_restore_sigmask(void) ti->status &= ~TS_RESTORE_SIGMASK; return true; } -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif #endif /* __KERNEL__ */ diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index f90062b0622d..224b4262486d 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -165,8 +165,6 @@ void arch_release_thread_info(struct thread_info *ti); #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git
a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index 07f3212422ad..d797acc901e4 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -128,8 +128,6 @@ register struct thread_info *current_thread_info_reg asm("r10"); /* For OpenRISC, this is anything in the LSW other than syscall trace */ #define _TIF_WORK_MASK (0xff & ~(_TIF_SYSCALL_TRACE|_TIF_SINGLESTEP)) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index d1fb79a36f3d..6182832e5b6c 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -77,8 +77,6 @@ struct thread_info { #define _TIF_SYSCALL_TRACE_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \ _TIF_BLOCKSTEP) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_PARISC_THREAD_INFO_H */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 406b7b9a1341..8ceea14d6fe4 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -182,8 +182,6 @@ static inline bool test_thread_local_flags(unsigned int flags) #define is_32bit_task() (1) #endif -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index 7d5ac4e48485..45a93669289d 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -207,8 +207,6 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 25849ae3e900..dd3807599bb9 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -132,8 +132,6 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | \ _TIF_SIGPENDING) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 269bd92313df..d5e504251079 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -256,8 +256,6 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #define thread32_stack_is_64bit(__SP) (((__SP) & 0x1) != 0) #define test_thread_64bit_stack(__SP) \ ((test_thread_flag(TIF_32BIT) && !thread32_stack_is_64bit(__SP)) ? 
\ diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index e9c670d7a7fe..ccc8ef37235c 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -153,8 +153,6 @@ extern void _cpu_idle(void); #define TS_POLLING 0x0004 /* in idle loop but not sleeping */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2cd056e3ada3..a1df6e84691f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -241,8 +241,6 @@ static inline struct thread_info *current_thread_info(void) skip sending interrupt */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..6709a5813f27 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2621,6 +2621,26 @@ static inline int spin_needbreak(spinlock_t *lock) #endif } +/* + * Idle thread specific functions to determine the need_resched + * polling state. We have two versions, one based on TS_POLLING in + * thread_info.status and one based on TIF_POLLING_NRFLAG in + * thread_info.flags + */ +#ifdef TS_POLLING +static inline int tsk_is_polling(struct task_struct *p) +{ + return task_thread_info(p)->status & TS_POLLING; +} +#elif defined(TIF_POLLING_NRFLAG) +static inline int tsk_is_polling(struct task_struct *p) +{ + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); +} +#else +static inline int tsk_is_polling(struct task_struct *p) { return 0; } +#endif + /* * Thread group CPU time accounting. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..243a20c5cf91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -512,11 +512,6 @@ static inline void init_hrtick(void) * the target CPU. */ #ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) 0 -#endif - void resched_task(struct task_struct *p) { int cpu; -- cgit v1.2.3
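For context on what the consolidated helper is for: its caller in the scheduler is resched_task(), which uses it to skip the reschedule IPI when the target CPU is already polling need_resched in its idle loop. A condensed sketch of that caller, following the core.c of this era (the hunk above shows its opening lines):

/* Condensed sketch of resched_task() as it looked at this point. */
void resched_task(struct task_struct *p)
{
	int cpu;

	assert_raw_spin_locked(&task_rq(p)->lock);

	if (test_tsk_need_resched(p))
		return;		/* already marked, nothing to do */

	set_tsk_need_resched(p);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;		/* local CPU will notice on its own */

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);	/* a poller sees the flag without an IPI */
}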
From 2e76c24d72372db35f226a49c2b99d0fd8cfd400 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:36:31 +0800 Subject: sched: Split cpuacct code out of core.c Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5155366F.5060404@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 1 + kernel/sched/core.c | 220 ----------------------------------------- kernel/sched/cpuacct.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 220 deletions(-) create mode 100644 kernel/sched/cpuacct.c (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249e103b..deaf90e4a1de 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5e1aa5b2684..c28222f72c80 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8043,226 +8043,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). - */ - -struct cpuacct root_cpuacct; - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms.
- */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, - { } /* terminate */ -}; - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ - void dump_cpu_task(int cpu) { pr_info("Task dump for CPU %d:\n", cpu); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 000000000000..50ec24b6193d --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,227 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). 
+ */ + +struct cpuacct root_cpuacct; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +{ + struct cpuacct *ca; + + if (!cgrp->parent) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; + + return &ca->css; + +out_free_cpuusage: + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void cpuacct_css_free(struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpustat); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + 
.write_u64 = cpuusage_write, }, { .name = "usage_percpu", .read_seq_string = cpuacct_percpu_seq_read, }, { .name = "stat", .read_map = cpuacct_stats_show, }, { } /* terminate */ }; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + if (unlikely(!cpuacct_subsys.active)) + return; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + for (; ca; ca = parent_ca(ca)) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + } + + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, +}; -- cgit v1.2.3 From dbe4b41f9800223949ce72e4289814697e0ea91a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:36:55 +0800 Subject: sched/cpuacct: Add cpuacct_init() So we don't open-code the initialization of cpuacct in core.c. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51553687.1060906@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++------ kernel/sched/cpuacct.c | 7 +++++++ kernel/sched/cpuacct.h | 5 +++++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c28222f72c80..92930a89529d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6936,12 +6936,8 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif + cpuacct_init(); + for_each_possible_cpu(i) { struct rq *rq; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 50ec24b6193d..48b5e9184dcc 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -218,6 +218,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } +void __init cpuacct_init(void) +{ + root_cpuacct.cpustat = &kernel_cpustat; + root_cpuacct.cpuusage = alloc_percpu(u64); + BUG_ON(!root_cpuacct.cpuusage); /* Too early, not expected to fail */ +} + struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .css_alloc = cpuacct_css_alloc, diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index a7f3d4a8f535..551acd729562 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -41,10 +41,15 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return cgroup_ca(ca->css.cgroup->parent); } +extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); #else +static inline void cpuacct_init(void) +{ +} + static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) { } -- cgit v1.2.3
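The files declared in the new cpuacct.c ("usage", "usage_percpu" and "stat") are what userland actually consumes once the controller is mounted. A minimal reader, as a sketch — the mount point below is an assumption, so check /proc/mounts for where the cpuacct controller lives on a given system:

/* Userspace sketch: read the root cpuacct group's accounting files.
 * The /sys/fs/cgroup/cpuacct paths are assumptions about the mount. */
#include <stdio.h>

int main(void)
{
	unsigned long long ns;
	char line[256];
	FILE *f;

	f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.usage", "r");
	if (f) {
		if (fscanf(f, "%llu", &ns) == 1)
			printf("total cpu time: %llu ns\n", ns);	/* summed over CPUs */
		fclose(f);
	}

	f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.stat", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))	/* "user N" / "system N", USER_HZ ticks */
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}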
From 14c6d3c8a47ced185b6375c4940b5b393f1a294e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:44:04 +0800 Subject: sched/cpuacct: Initialize root cpuacct earlier Now we don't need cpuacct_init(); instead, we just initialize root_cpuacct where it is defined. Signed-off-by: Li Zefan Cc: Tejun Heo Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51553834.9090701@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- kernel/sched/cpuacct.c | 11 ++++------- kernel/sched/cpuacct.h | 5 ----- 3 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 92930a89529d..ee8c1bd703fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6936,8 +6936,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ - cpuacct_init(); - for_each_possible_cpu(i) { struct rq *rq; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index a691c4dd65be..04255814a0ed 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -59,7 +59,10 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); -static struct cpuacct root_cpuacct; +static struct cpuacct root_cpuacct = { + .cpustat = &kernel_cpustat, + .cpuusage = &root_cpuacct_cpuusage, +}; /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) @@ -288,12 +291,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -void __init cpuacct_init(void) -{ - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = &root_cpuacct_cpuusage; -} - struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .css_alloc = cpuacct_css_alloc, diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index 51cd76eb4f0f..ed605624a5e7 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,15 +1,10 @@ #ifdef CONFIG_CGROUP_CPUACCT -extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); #else -static inline void cpuacct_init(void) -{ -} - static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) { } -- cgit v1.2.3 From 41fcb9f230bf773656d1768b73000ef720bf00c3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 17 Apr 2013 15:23:11 -0400 Subject: mutex: Move mutex spinning code from sched/core.c back to mutex.c As mentioned by Ingo, the SCHED_FEAT_OWNER_SPIN scheduler feature bit was really just an early hack to make mutex spinning testable with and without the feature, so it is no longer necessary. This patch removes the SCHED_FEAT_OWNER_SPIN feature bit and moves the mutex spinning code from kernel/sched/core.c back to kernel/mutex.c, which is where it belongs. Signed-off-by: Waiman Long Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Chandramouleeswaran Aswin Cc: Davidlohr Bueso Cc: Norton Scott J Cc: Rik van Riel Cc: Paul E.
McKenney Cc: David Howells Cc: Dave Jones Cc: Clark Williams Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1366226594-5506-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/mutex.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 45 --------------------------------------------- kernel/sched/features.h | 7 ------- 4 files changed, 46 insertions(+), 53 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..aefe45d79f53 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -320,7 +320,6 @@ extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); -extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex.c b/kernel/mutex.c index 52f23011b6e0..262d7177adad 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -95,6 +95,52 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + if (lock->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(lock, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. + */ + return lock->owner == NULL; +} +#endif + static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..b37a22b99e0e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2997,51 +2997,6 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. 
- */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} -#endif - #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395f..99399f8e4799 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -45,13 +45,6 @@ SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) -/* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. - */ -SCHED_FEAT(OWNER_SPIN, true) - /* * Decrement CPU power based on time not spent running tasks */ -- cgit v1.2.3 From e6252c3ef4b9cd251b53f7b68035f395d20b044e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 23 Apr 2013 17:27:41 +0900 Subject: sched: Rename load_balance_tmpmask to load_balance_mask This name doesn't convey any specific meaning, so rename it to reflect its purpose. Signed-off-by: Joonsoo Kim Acked-by: Peter Zijlstra Tested-by: Jason Low Cc: Srivatsa Vaddagiri Cc: Davidlohr Bueso Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1366705662-3587-6-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee8c1bd703fe..cb49b2ab0e16 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6873,7 +6873,7 @@ struct task_group root_task_group; LIST_HEAD(task_groups); #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); void __init sched_init(void) { @@ -6910,7 +6910,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; + per_cpu(load_balance_mask, i) = (void *)ptr; ptr += cpumask_size(); } #endif /* CONFIG_CPUMASK_OFFSTACK */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b8ef321641df..5b1e96687b49 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4977,7 +4977,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static int need_active_balance(struct lb_env *env) { @@ -5012,7 +5012,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct cpumask *cpus = __get_cpu_var(load_balance_mask); struct lb_env env = { .sd = sd, -- cgit v1.2.3
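Looping back to the Waiman Long patch above: after the move, mutex_spin_on_owner() is driven from the optimistic-spin section of the mutex slowpath in kernel/mutex.c. A condensed sketch of that loop, simplified from the mutex.c of this period (the function name is invented for the sketch; the real code sits inline in __mutex_lock_common and also handles lockdep bookkeeping and waiter wakeups):

/* Sketch of the optimistic-spin loop; the name and the standalone
 * function form are ours, condensed from __mutex_lock_common().
 * "ip" is the caller's instruction pointer, used for lockdep. */
static int mutex_optimistic_spin_sketch(struct mutex *lock, unsigned long ip)
{
	struct task_struct *task = current;

	for (;;) {
		struct task_struct *owner;

		/* If there's an owner, spin until it releases or sleeps. */
		owner = ACCESS_ONCE(lock->owner);
		if (owner && !mutex_spin_on_owner(lock, owner))
			break;	/* owner went to sleep: stop spinning, block */

		/* Lock looks free: try to take it without touching the wait list. */
		if (atomic_read(&lock->count) == 1 &&
		    atomic_cmpxchg(&lock->count, 1, 0) == 1) {
			lock_acquired(&lock->dep_map, ip);
			mutex_set_owner(lock);
			preempt_enable();
			return 1;
		}

		/* No visible owner: we may have preempted the owner between
		 * acquiring the lock and setting the owner field. Spinning as
		 * an RT task would then live-lock, so give up. */
		if (!owner && (need_resched() || rt_task(task)))
			break;

		arch_mutex_cpu_relax();	/* compiler barrier; re-load loop values */
	}
	return 0;	/* fall back to the blocking slowpath */
}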