From febdbfe8a91ce0d11939d4940b592eb0dba8d663 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Feb 2014 18:16:07 +0100 Subject: arch: Prepare for smp_mb__{before,after}_atomic() Since the smp_mb__{before,after}*() ops are fundamentally dependent on how an arch can implement atomics, it doesn't make sense to have 3 variants of them. They must all be the same. Furthermore, the 3 variants suggest they're only valid for those 3 atomic ops, while we have many more where they could be applied. So move away from smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() and reduce the interface to just the two: smp_mb__{before,after}_atomic(). This patch prepares the way by introducing default implementations in asm-generic/barrier.h that default to a full barrier and providing __deprecated inlines for the previous 6 barriers if they're not provided by the arch. This should allow for a mostly painless transition (lots of deprecation warnings in the interim). Signed-off-by: Peter Zijlstra Acked-by: Paul E. McKenney Link: http://lkml.kernel.org/n/tip-wr59327qdyi9mbzn6x937s4e@git.kernel.org Cc: Arnd Bergmann Cc: "Chen, Gong" Cc: John Sullivan Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Srinivas Pandruvada Cc: "Theodore Ts'o" Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..8a70ec091760 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,6 +90,22 @@ #define CREATE_TRACE_POINTS #include +#ifdef smp_mb__before_atomic +void __smp_mb__before_atomic(void) +{ + smp_mb__before_atomic(); +} +EXPORT_SYMBOL(__smp_mb__before_atomic); +#endif + +#ifdef smp_mb__after_atomic +void __smp_mb__after_atomic(void) +{ + smp_mb__after_atomic(); +} +EXPORT_SYMBOL(__smp_mb__after_atomic); +#endif + void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; -- cgit v1.2.3 From 143e1e28cb40bed836b0a06567208bd7347c9672 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:37 +0200 Subject: sched: Rework sched_domain topology definition We replace the old way to configure the scheduler topology with a new method which enables a platform to declare additional levels (if needed). We still have a default topology table definition that can be used by platforms that don't want more levels than the SMT, MC, CPU and NUMA ones. This table can be overridden by an arch which either wants to add a new level where load balancing makes sense (such as BOOK or a power-gating level) or wants to change the flags configuration of some levels. For each level, we need a function pointer that returns the cpumask for each cpu, a function pointer that returns the flags for the level, and a name. Only flags that describe the topology can be set by an architecture. The current topology flags are: SD_SHARE_CPUPOWER SD_SHARE_PKG_RESOURCES SD_NUMA SD_ASYM_PACKING Then, each level must be a subset of the next one. The build sequence of the sched_domain will take care of removing useless levels, like those with 1 CPU and those with the same CPU span and no more relevant information for load balancing than their children. 
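For illustration (an editorial sketch, not part of this patch), an arch override built from the helpers this series introduces could look as follows; "my_arch" is a made-up name, and the entries simply mirror the default table added below:

	static struct sched_domain_topology_level my_arch_topology[] = {
	#ifdef CONFIG_SCHED_SMT
		{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
	#endif
	#ifdef CONFIG_SCHED_MC
		{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
	#endif
		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
		{ NULL, },
	};

	/* Called from arch setup code, before the scheduler builds its domains: */
	set_sched_topology(my_arch_topology);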
Signed-off-by: Vincent Guittot Tested-by: Dietmar Eggemann Reviewed-by: Preeti U Murthy Reviewed-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Chris Metcalf Cc: Christoph Lameter Cc: David S. Miller Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Hanjun Guo Cc: Heiko Carstens Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Tony Luck Cc: linux390@de.ibm.com Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1397209481-28542-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 24 ---- arch/s390/include/asm/topology.h | 2 - arch/tile/include/asm/topology.h | 33 ------ include/linux/sched.h | 53 +++++++++ include/linux/topology.h | 128 +++------------------ kernel/sched/core.c | 233 ++++++++++++++++++++------------------- 6 files changed, 186 insertions(+), 287 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 5cb55a1e606b..3202aa74e0d6 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -46,30 +46,6 @@ void build_cpu_to_node_map(void); -#define SD_CPU_INIT (struct sched_domain) { \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 2, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_WAKE_AFFINE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 05425b18c0aa..07763bdb408d 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -64,8 +64,6 @@ static inline void s390_init_cpu_topology(void) }; #endif -#define SD_BOOK_INIT SD_CPU_INIT - #include #endif /* _ASM_S390_TOPOLOGY_H */ diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h index d15c0d8d550f..938311844233 100644 --- a/arch/tile/include/asm/topology.h +++ b/arch/tile/include/asm/topology.h @@ -44,39 +44,6 @@ static inline const struct cpumask *cpumask_of_node(int node) /* For now, use numa node -1 for global allocation. */ #define pcibus_to_node(bus) ((void)(bus), -1) -/* - * TILE architecture has many cores integrated in one processor, so we need - * setup bigger balance_interval for both CPU/NODE scheduling domains to - * reduce process scheduling costs. - */ - -/* sched_domains SD_CPU_INIT for TILE architecture */ -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 4, \ - .max_interval = 128, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 0*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - , \ - .last_balance = jiffies, \ - .balance_interval = 32, \ -} - /* By definition, we create nodes based on online memory. 
*/ #define node_has_online_mem(nid) 1 diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a4298fb0d85..656b035c30e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -879,6 +879,27 @@ enum cpu_idle_type { extern int __weak arch_sd_sibiling_asym_packing(void); +#ifdef CONFIG_SCHED_SMT +static inline const int cpu_smt_flags(void) +{ + return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_SCHED_MC +static inline const int cpu_core_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_NUMA +static inline const int cpu_numa_flags(void) +{ + return SD_NUMA; +} +#endif + struct sched_domain_attr { int relax_domain_level; }; @@ -985,6 +1006,38 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const int (*sched_domain_flags_f)(void); + +#define SDTL_OVERLAP 0x01 + +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; +}; + +struct sched_domain_topology_level { + sched_domain_mask_f mask; + sched_domain_flags_f sd_flags; + int flags; + int numa_level; + struct sd_data data; +#ifdef CONFIG_SCHED_DEBUG + char *name; +#endif +}; + +extern struct sched_domain_topology_level *sched_domain_topology; + +extern void set_sched_topology(struct sched_domain_topology_level *tl); + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(type) .name = #type +#else +# define SD_INIT_NAME(type) +#endif + #else /* CONFIG_SMP */ struct sched_domain_attr; diff --git a/include/linux/topology.h b/include/linux/topology.h index 7062330a1329..973671ff9e7d 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -66,121 +66,6 @@ int arch_update_cpu_topology(void); #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif -/* - * Below are the 3 major initializers used in building sched_domains: - * SD_SIBLING_INIT, for SMT domains - * SD_CPU_INIT, for SMP domains - * - * Any architecture that cares to do any tuning to these values should do so - * by defining their own arch-specific initializer in include/asm/topology.h. - * A definition there will automagically override these default initializers - * and allow arch-specific performance tuning of sched_domains. - * (Only non-zero and non-null fields need be specified.) - */ - -#ifdef CONFIG_SCHED_SMT -/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, - * so can't we drop this in favor of CONFIG_SCHED_SMT? - */ -#define ARCH_HAS_SCHED_WAKE_IDLE -/* Common values for SMT siblings */ -#ifndef SD_SIBLING_INIT -#define SD_SIBLING_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 64, \ - .imbalance_pct = 110, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 1*SD_SHARE_CPUPOWER \ - | 1*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - | 0*SD_PREFER_SIBLING \ - | arch_sd_sibling_asym_packing() \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .smt_gain = 1178, /* 15% */ \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif -#endif /* CONFIG_SCHED_SMT */ - -#ifdef CONFIG_SCHED_MC -/* Common values for MC siblings. 
for now mostly derived from SD_CPU_INIT */ -#ifndef SD_MC_INIT -#define SD_MC_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 1*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif -#endif /* CONFIG_SCHED_MC */ - -/* Common values for CPUs */ -#ifndef SD_CPU_INIT -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - | 1*SD_PREFER_SIBLING \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif - -#ifdef CONFIG_SCHED_BOOK -#ifndef SD_BOOK_INIT -#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! -#endif -#endif /* CONFIG_SCHED_BOOK */ - #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); @@ -295,4 +180,17 @@ static inline int cpu_to_mem(int cpu) #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +#ifdef CONFIG_SCHED_SMT +static inline const struct cpumask *cpu_smt_mask(int cpu) +{ + return topology_thread_cpumask(cpu); +} +#endif + +static inline const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} + + #endif /* _LINUX_TOPOLOGY_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13584f1cccfc..7d332b7899cc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5566,17 +5566,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ - return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { - struct sched_domain **__percpu sd; - struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; -}; - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -5589,21 +5578,6 @@ enum s_alloc { sa_none, }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP 0x01 - -struct sched_domain_topology_level { - sched_domain_init_f init; - sched_domain_mask_f mask; - int flags; - int numa_level; - struct sd_data data; -}; - /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. 
@@ -5832,34 +5806,6 @@ int __weak arch_sd_sibling_asym_packing(void) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain * \ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ - *sd = SD_##type##_INIT; \ - SD_INIT_NAME(sd, type); \ - sd->private = &tl->data; \ - return sd; \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif - static int default_relax_domain_level = -1; int sched_domain_level_max; @@ -5947,99 +5893,156 @@ static void claim_allocations(int cpu, struct sched_domain *sd) *per_cpu_ptr(sdd->sgp, cpu) = NULL; } -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ - return topology_thread_cpumask(cpu); -} -#endif - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC - { sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->init; tl++) - #ifdef CONFIG_NUMA - static int sched_domains_numa_levels; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ - if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) - return 0; - - return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. + * + * SD_SHARE_CPUPOWER - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * + * Odd one out: + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUPOWER | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING) static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int level = tl->numa_level; - int sd_weight = cpumask_weight( - sched_domains_numa_masks[level][cpu_to_node(cpu)]); + int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... 
+ */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, .busy_factor = 32, .imbalance_pct = 125, - .cache_nice_tries = 2, - .busy_idx = 3, - .idle_idx = 2, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, .newidle_idx = 0, .wake_idx = 0, .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE - | 0*SD_BALANCE_EXEC - | 0*SD_BALANCE_FORK + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE - | 0*SD_WAKE_AFFINE + | 1*SD_WAKE_AFFINE | 0*SD_SHARE_CPUPOWER | 0*SD_SHARE_PKG_RESOURCES - | 1*SD_SERIALIZE + | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING - | 1*SD_NUMA - | sd_local_flags(level) + | 0*SD_NUMA + | sd_flags , + .last_balance = jiffies, .balance_interval = sd_weight, + .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif }; - SD_INIT_NAME(sd, NUMA); - sd->private = &tl->data; /* - * Ugly hack to pass state to sd_numa_mask()... + * Convert topological properties into behaviour. */ - sched_domains_curr_level = tl->numa_level; + + if (sd->flags & SD_SHARE_CPUPOWER) { + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + sd->flags |= arch_sd_sibling_asym_packing(); + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + sd->private = &tl->data; return sd; } +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif +#ifdef CONFIG_SCHED_BOOK + { cpu_book_mask, SD_INIT_NAME(BOOK) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + static const struct cpumask *sd_numa_mask(int cpu) { return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -6183,7 +6186,10 @@ static void sched_init_numa(void) } } - tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -6191,18 +6197,19 @@ static void sched_init_numa(void) /* * Copy the default topology bits.. */ - for (i = 0; default_topology[i].init; i++) - tl[i] = default_topology[i]; + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; /* * .. 
and append 'j' levels of NUMA goodness. */ for (j = 0; j < level; i++, j++) { tl[i] = (struct sched_domain_topology_level){ - .init = sd_numa_init, .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, .flags = SDTL_OVERLAP, .numa_level = j, + SD_INIT_NAME(NUMA) }; } @@ -6360,7 +6367,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = tl->init(tl, cpu); + struct sched_domain *sd = sd_init(tl, cpu); if (!sd) return child; -- cgit v1.2.3 From 2dfd747629e65f89d2dbceb92fffc763f66228b2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:38 +0200 Subject: sched, s390: Create a dedicated topology table BOOK level is only relevant for s390 so we create a dedicated topology table with BOOK level and remove it from the default table. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Philipp Hachtmann Cc: cmetcalf@tilera.com Cc: benh@kernel.crashing.org Cc: dietmar.eggemann@arm.com Cc: preeti@linux.vnet.ibm.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: linux390@de.ibm.com Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1397209481-28542-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/s390/include/asm/topology.h | 11 +---------- arch/s390/kernel/topology.c | 20 ++++++++++++++++++++ kernel/sched/core.c | 3 --- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 07763bdb408d..56af53093d24 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -26,21 +26,12 @@ extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; #define mc_capable() 1 -static inline const struct cpumask *cpu_coregroup_mask(int cpu) -{ - return &cpu_topology[cpu].core_mask; -} - -static inline const struct cpumask *cpu_book_mask(int cpu) -{ - return &cpu_topology[cpu].book_mask; -} - int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); void store_topology(struct sysinfo_15_1_x *info); void topology_expect_change(void); +const struct cpumask *cpu_coregroup_mask(int cpu); #else /* CONFIG_SCHED_BOOK */ diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 6298fed11ced..1860ad3cbc04 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -443,6 +443,23 @@ int topology_cpu_init(struct cpu *cpu) return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); } +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_topology[cpu].core_mask; +} + +static const struct cpumask *cpu_book_mask(int cpu) +{ + return &cpu_topology[cpu].book_mask; +} + +static struct sched_domain_topology_level s390_topology[] = { + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_book_mask, SD_INIT_NAME(BOOK) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + static int __init topology_init(void) { if (!MACHINE_HAS_TOPOLOGY) { @@ -451,6 +468,9 @@ } set_topology_timer(); out: + + set_sched_topology(s390_topology); + return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); } device_initcall(topology_init); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d332b7899cc..e59e5aec745a 100644 --- 
a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6023,9 +6023,6 @@ static struct sched_domain_topology_level default_topology[] = { #endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -#endif -#ifdef CONFIG_SCHED_BOOK - { cpu_book_mask, SD_INIT_NAME(BOOK) }, #endif { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, -- cgit v1.2.3 From 607b45e9a216e89a63351556e488eea06be0ff48 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:39 +0200 Subject: sched, powerpc: Create a dedicated topology table Create a dedicated topology table for handling the asymmetric SMT feature of powerpc. Signed-off-by: Vincent Guittot Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: Andy Fleming Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Linus Torvalds Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Preeti U. Murthy Cc: Rob Herring Cc: Srivatsa S. Bhat Cc: Toshi Kani Cc: Vasant Hegde Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: schwidefsky@de.ibm.com Cc: cmetcalf@tilera.com Cc: dietmar.eggemann@arm.com Cc: devicetree@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1397209481-28542-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/smp.c | 31 +++++++++++++++++++++++-------- include/linux/sched.h | 2 -- kernel/sched/core.c | 6 ------ 3 files changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index e2a4232c5871..10ffffef0414 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -766,6 +766,28 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } +#ifdef CONFIG_SCHED_SMT +/* cpumask of CPUs with asymetric SMT dependancy */ +static const int powerpc_smt_flags(void) +{ + int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + flags |= SD_ASYM_PACKING; + } + return flags; +} +#endif + +static struct sched_domain_topology_level powerpc_topology[] = { #ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, #endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + void __init smp_cpus_done(unsigned int max_cpus) { cpumask_var_t old_mask; @@ -790,15 +812,8 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); -} + set_sched_topology(powerpc_topology); -int arch_sd_sibling_asym_packing(void) -{ - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); - return SD_ASYM_PACKING; - } - return 0; } #ifdef CONFIG_HOTPLUG_CPU diff --git a/include/linux/sched.h b/include/linux/sched.h index 656b035c30e5..439a153b8403 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -877,8 +877,6 @@ enum cpu_idle_type { #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ #define SD_NUMA 0x4000 /* cross-node balancing */ -extern int __weak arch_sd_sibiling_asym_packing(void); #ifdef CONFIG_SCHED_SMT static inline const int cpu_smt_flags(void) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e59e5aec745a..7e348e238bf1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5796,11 +5796,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); } -int __weak arch_sd_sibling_asym_packing(void) -{ - return 
0*SD_ASYM_PACKING; } - /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -5981,7 +5976,6 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) if (sd->flags & SD_SHARE_CPUPOWER) { sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ - sd->flags |= arch_sd_sibling_asym_packing(); } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; -- cgit v1.2.3 From d77b3ed5c9f8ebedf154b52b5e943c461f3d37e6 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:40 +0200 Subject: sched: Add a new SD_SHARE_POWERDOMAIN for sched_domain A new flag SD_SHARE_POWERDOMAIN is created to reflect whether groups of CPUs in a sched_domain level can or cannot reach different power states. As an example, the flag should be cleared at CPU level if groups of cores can be power gated independently. This information can be used in the load-balance decision or to add a load-balancing level between groups of CPUs that can be power-gated independently. This flag is part of the topology flags that can be set by an arch. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: schwidefsky@de.ibm.com Cc: cmetcalf@tilera.com Cc: benh@kernel.crashing.org Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1397209481-28542-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 439a153b8403..accb66bfd722 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,6 +870,7 @@ enum cpu_idle_type { #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7e348e238bf1..1c9c3b7b26af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5261,7 +5261,8 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { + SD_SHARE_PKG_RESOURCES | + SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) return 0; } @@ -5292,7 +5293,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING); + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5901,6 +5903,7 @@ static int sched_domains_curr_level; * SD_SHARE_CPUPOWER - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -5909,7 +5912,8 @@ static int sched_domains_curr_level; (SD_SHARE_CPUPOWER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING) + SD_ASYM_PACKING | \ + SD_SHARE_POWERDOMAIN) static struct 
sched_domain * sd_init(struct sched_domain_topology_level *tl, int cpu) -- cgit v1.2.3 From fd99f91aa007ba255aac44fe6cf21c1db398243a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Apr 2014 15:35:08 +0200 Subject: sched/idle: Avoid spurious wakeup IPIs Because mwait_idle_with_hints() gets called from !idle context it must call current_clr_polling(). This however means that resched_task() is very likely to send an IPI even when we were polling: CPU0 CPU1 if (current_set_polling_and_test()) goto out; __monitor(&ti->flags); if (!need_resched()) __mwait(eax, ecx); set_tsk_need_resched(p); smp_mb(); out: current_clr_polling(); if (!tsk_is_polling(p)) smp_send_reschedule(cpu); So while it is correct (extra IPIs aren't a problem, whereas a missed IPI would be) it is a performance problem (for some). Avoid this issue by using fetch_or() to atomically set NEED_RESCHED and test if POLLING_NRFLAG is set. Since a CPU stuck in mwait is unlikely to modify the flags word, contention on the cmpxchg is unlikely and thus we should mostly succeed in a single go. Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Andy Lutomirski Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-kf5suce6njh5xf5d3od13rr0@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1c9c3b7b26af..4b82622b6252 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,6 +505,39 @@ static inline void init_hrtick(void) } #endif /* CONFIG_SCHED_HRTICK */ +/* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (val)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) + +#ifdef TIF_POLLING_NRFLAG +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} +#endif + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -521,17 +554,15 @@ void resched_task(struct task_struct *p) if (test_tsk_need_resched(p)) return; - set_tsk_need_resched(p); - cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); set_preempt_need_resched(); return; } - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) + if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); } -- cgit v1.2.3 From ac1bea85781e9004da9b3e8a4b097c18492d857c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 16 Mar 2014 21:36:25 -0700 Subject: sched,rcu: Make cond_resched() report RCU quiescent states Given a CPU running a loop containing cond_resched(), with no other tasks runnable on that CPU, RCU will eventually report RCU CPU stall warnings due to lack of quiescent states. Fortunately, every call to cond_resched() is a perfectly good quiescent state. 
Unfortunately, invoking rcu_note_context_switch() is a bit heavyweight for cond_resched(), especially given the need to disable preemption, and, for RCU-preempt, interrupts as well. This commit therefore maintains a per-CPU counter that causes cond_resched(), cond_resched_lock(), and cond_resched_softirq() to call rcu_note_context_switch(), but only about once per 256 invocations. This ratio was chosen in keeping with the relative time constants of RCU grace periods. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/rcu/update.c | 18 ++++++++++++++++++ kernel/sched/core.c | 7 ++++++- 3 files changed, 60 insertions(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 82973738125b..97cc8d6679b4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,6 +44,7 @@ #include #include #include +#include #include extern int rcu_expedited; /* for sysctl */ @@ -286,6 +287,41 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ +/* + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. + */ + +#define RCU_COND_RESCHED_LIM 256 /* ms vs. 100s of ms. */ +DECLARE_PER_CPU(int, rcu_cond_resched_count); +void rcu_resched(void); + +/* + * Is it time to report RCU quiescent states? + * + * Note unsynchronized access to rcu_cond_resched_count. Yes, we might + * increment some random CPU's count, and possibly also load the result from + * yet another CPU's count. We might even clobber some other CPU's attempt + * to zero its counter. This is all OK because the goal is not precision, + * but rather reasonable amortization of rcu_note_context_switch() overhead + * and extremely high probability of avoiding RCU CPU stall warnings. + * Note that this function has to be preempted in just the wrong place, + * many thousands of times in a row, for anything bad to happen. + */ +static inline bool rcu_should_resched(void) +{ + return raw_cpu_inc_return(rcu_cond_resched_count) >= + RCU_COND_RESCHED_LIM; +} + +/* + * Report quiscent states to RCU if it is time to do so. + */ +static inline void rcu_cond_resched(void) +{ + if (unlikely(rcu_should_resched())) + rcu_resched(); +} + /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c0a9b0af469..ed7a0d72562c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -338,3 +338,21 @@ static int __init check_cpu_stall_init(void) early_initcall(check_cpu_stall_init); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + +/* + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. + */ + +DEFINE_PER_CPU(int, rcu_cond_resched_count); + +/* + * Report a set of RCU quiescent states, for use by cond_resched() + * and friends. Out of line due to being called infrequently. 
+ */ +void rcu_resched(void) +{ + preempt_disable(); + __this_cpu_write(rcu_cond_resched_count, 0); + rcu_note_context_switch(smp_processor_id()); + preempt_enable(); +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..9f530c9ed911 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4051,6 +4051,7 @@ static void __cond_resched(void) int __sched _cond_resched(void) { + rcu_cond_resched(); if (should_resched()) { __cond_resched(); return 1; @@ -4069,15 +4070,18 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { + bool need_rcu_resched = rcu_should_resched(); int resched = should_resched(); int ret = 0; lockdep_assert_held(lock); - if (spin_needbreak(lock) || resched) { + if (spin_needbreak(lock) || resched || need_rcu_resched) { spin_unlock(lock); if (resched) __cond_resched(); + else if (unlikely(need_rcu_resched)) + rcu_resched(); else cpu_relax(); ret = 1; @@ -4091,6 +4095,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); + rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ if (should_resched()) { local_bh_enable(); __cond_resched(); -- cgit v1.2.3 From e78c7bca56dab5ce4f22694f99115ec07e4935f6 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:28 +0200 Subject: sched: Simplify return logic in sched_copy_attr() The logic in this function is a little contorted, clean it up: * Rather than having chained gotos for the -E2BIG case, just return -E2BIG directly. * Now, the label 'out' is no longer needed, and 'ret' must be zero by the time we fall through to this point, so just return 0. Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/536CEC24.9080201@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2551b6db470e..2318fc484d8b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3657,13 +3657,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr, */ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); -out: - return ret; + return 0; err_size: put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; + return -E2BIG; } /** -- cgit v1.2.3 From 22400674945c562cf3c176d6c4178cd545e2dec9 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:33 +0200 Subject: sched: Simplify return logic in sched_read_attr() Gotos are chained pointlessly here, and the 'out' label can be dispensed with. 
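The same pattern in miniature (a hypothetical helper, sketched for illustration only; demo_read_attr() is not part of the patch):

	static int demo_read_attr(struct sched_attr __user *uattr,
				  const struct sched_attr *attr)
	{
		if (copy_to_user(uattr, attr, sizeof(*attr)))
			return -EFAULT;	/* was: ret = -EFAULT; goto out; */

		return 0;		/* was: fall through to the 'out' label */
	}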
Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/536CEC29.9090503@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2318fc484d8b..a78c5b62a470 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3821,7 +3821,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, for (; addr < end; addr++) { if (*addr) - goto err_size; + return -EFBIG; } attr->size = usize; @@ -3831,12 +3831,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, if (ret) return -EFAULT; -out: - return ret; - -err_size: - ret = -E2BIG; - goto out; + return 0; } /** -- cgit v1.2.3 From c515db8cd311ef77b2dc7cbd6b695022655bb0f3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 13 May 2014 11:11:01 +0200 Subject: sched/numa: Fix initialization of sched_domain_topology for NUMA Jet Chen has reported a kernel panic when booting qemu-system-x86_64 with kvm64 cpu. The panic occurred while building the sched_domain. In sched_init_numa, we create a new topology table into which both the default levels and the NUMA levels are copied. The last row of the table must have a null pointer in the mask field. The current implementation doesn't account for this last row when computing the table size, so we add one row to the allocation size; it will be used as the last row of the table. kzalloc() will ensure that its mask field is NULL. Reported-by: Jet Chen Tested-by: Jet Chen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: fengguang.wu@intel.com Link: http://lkml.kernel.org/r/1399972261-25693-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a78c5b62a470..45d077ed24fb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6231,7 +6231,7 @@ static void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level) * + tl = kzalloc((i + level + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; -- cgit v1.2.3 From caffcdd8d27ba78730d5540396ce72ad022aff2c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 30 Apr 2014 14:39:38 +0100 Subject: sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups() There is no need to zero struct sched_group member cpumask and struct sched_group_power member power since both structures are already allocated as zeroed memory in __sdt_alloc(). This patch has been tested with BUG_ON(!cpumask_empty(sched_group_cpus(sg))); and BUG_ON(sg->sgp->power); in build_sched_groups() on ARM TC2 and Intel i5 M520 platforms, including CPU hotplug scenarios. 
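A minimal sketch of why the zeroing is redundant, assuming the __sdt_alloc() behaviour described above (allocation abbreviated from that function):

	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
			  GFP_KERNEL, cpu_to_node(j));
	/*
	 * kzalloc_node() returns zeroed memory: sched_group_cpus(sg) is
	 * already empty and sg->sgp->power (allocated the same way) is
	 * already 0, so clearing them again in build_sched_groups() only
	 * repeated work done at allocation time.
	 */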
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1398865178-12577-1-git-send-email-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 45d077ed24fb..6340c601475d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5794,8 +5794,6 @@ build_sched_groups(struct sched_domain *sd, int cpu) continue; group = get_group(i, sdd, &sg); - cpumask_clear(sched_group_cpus(sg)); - sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { -- cgit v1.2.3 From a9467fa3cd2d5bf39e7cb7d0706d29d7ef4df212 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Thu, 8 May 2014 18:35:15 +0900 Subject: sched: Use clamp() and clamp_val() to make sys_nice() more readable Suggested-by: Kees Cook Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399541715-19568-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6340c601475d..f5605b6deea4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3057,17 +3057,10 @@ SYSCALL_DEFINE1(nice, int, increment) * We don't have to worry. Conceptually one call occurs first * and we have a single winner. */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); nice = task_nice(current) + increment; - if (nice < MIN_NICE) - nice = MIN_NICE; - if (nice > MAX_NICE) - nice = MAX_NICE; + nice = clamp_val(nice, MIN_NICE, MAX_NICE); if (increment < 0 && !can_nice(current, nice)) return -EPERM; -- cgit v1.2.3 From a803f0261bb2bb57aab5542af3174db43b2a3887 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 8 May 2014 13:47:39 -0500 Subject: sched: Initialize rq->age_stamp on processor start If the sched_clock time starts at a large value, the kernel will spin in sched_avg_update for a long time while rq->age_stamp catches up with rq->clock. The comment in kernel/sched/clock.c says that there is no strict promise that it starts at zero. So initialize rq->age_stamp when a cpu starts up to avoid this. I was seeing long delays on a simulator that didn't start the clock at zero. This might also be an issue on reboots on processors that don't re-initialize the timer to zero on reset, and when using kexec. 
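For context, the loop that spins is approximately the following (simplified from this era's kernel/sched/core.c):

	void sched_avg_update(struct rq *rq)
	{
		s64 period = sched_avg_period();

		while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
			/*
			 * The empty asm keeps the compiler from collapsing
			 * the loop into a division; with age_stamp == 0 and
			 * a large starting clock, this iterates once per
			 * elapsed period -- potentially a very long time.
			 */
			asm("" : "+rm" (rq->age_stamp));
			rq->age_stamp += period;
		}
	}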
Signed-off-by: Corey Minyard Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399574859-11714-1-git-send-email-minyard@acm.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5605b6deea4..da302ca98f60 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5089,10 +5089,20 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; +static void __cpuinit set_cpu_rq_start_time(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + rq->age_stamp = sched_clock_cpu(cpu); +} + static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + set_cpu_rq_start_time(); + return NOTIFY_OK; case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -6970,6 +6980,7 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); + set_cpu_rq_start_time(); #endif init_sched_fair_class(); -- cgit v1.2.3 From 7aa2c016db2162defff77f6f5731bff3f25e5175 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Thu, 8 May 2014 18:33:49 +0900 Subject: sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice() Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/a568a1e3cc8e78648f41b5035fa5e381d36274da.1399532322.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- drivers/staging/android/binder.c | 2 +- include/linux/sched/prio.h | 16 ++++++++++++++++ kernel/sched/core.c | 2 +- kernel/sys.c | 6 +++--- 4 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 179b21b66504..9311bb67ec35 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -436,7 +436,7 @@ static void binder_set_nice(long nice) set_user_nice(current, nice); return; } - min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur; + min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur); binder_debug(BINDER_DEBUG_PRIORITY_CAP, "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index ac322583c820..d9cf5a5762d9 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -41,4 +41,20 @@ #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +/* + * Convert nice value [19,-20] to rlimit style value [1,40]. + */ +static inline long nice_to_rlimit(long nice) +{ + return (MAX_NICE - nice + 1); +} + +/* + * Convert rlimit style value [1,40] to nice value [-20, 19]. 
+ */ +static inline long rlimit_to_nice(long prio) +{ + return (MAX_NICE - prio + 1); +} + #endif /* _SCHED_PRIO_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da302ca98f60..321d800e4baa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3033,7 +3033,7 @@ EXPORT_SYMBOL(set_user_nice); int can_nice(const struct task_struct *p, const int nice) { /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; + int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); diff --git a/kernel/sys.c b/kernel/sys.c index fba0f29401ea..66a751ebf9d9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else p = current; if (p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } @@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else pgrp = task_pgrp(current); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); @@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) do_each_thread(g, p) { if (uid_eq(task_uid(p), uid)) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } -- cgit v1.2.3 From aac74dc495456412c4130a1167ce4beb6c1f0b38 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Jun 2014 16:11:40 -0700 Subject: printk: rename printk_sched to printk_deferred After learning we'll need some sort of deferred printk functionality in the timekeeping core, Peter suggested we rename the printk_sched function so it can be reused by needed subsystems. This only changes the function name. No logic changes. Signed-off-by: John Stultz Reviewed-by: Steven Rostedt Cc: Jan Kara Cc: Peter Zijlstra Cc: Jiri Bohac Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 6 +++--- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/rt.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/printk.h b/include/linux/printk.h index 8752f7595b27..7847301e2837 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -128,9 +128,9 @@ asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); /* - * Special printk facility for scheduler use only, _DO_NOT_USE_ ! + * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! */ -__printf(1, 2) __cold int printk_sched(const char *fmt, ...); +__printf(1, 2) __cold int printk_deferred(const char *fmt, ...); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -165,7 +165,7 @@ int printk(const char *s, ...) return 0; } static inline __printf(1, 2) __cold -int printk_sched(const char *s, ...) +int printk_deferred(const char *s, ...) { return 0; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index dc2b8bd9bc1e..35d9db251903 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2585,7 +2585,7 @@ void wake_up_klogd(void) preempt_enable(); } -int printk_sched(const char *fmt, ...) +int printk_deferred(const char *fmt, ...) 
{ va_list args; int r; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 913c6d6cc2c1..caf03e89a068 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1367,7 +1367,7 @@ out: * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", + printk_deferred("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f9ca7d19781a..d17e1c48a79d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -352,7 +352,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, if (!lag_once) { lag_once = true; - printk_sched("sched: DL replenish lagged to much\n"); + printk_deferred("sched: DL replenish lagged to much\n"); } dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0ebfd7a29472..5d7667b37c21 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -896,7 +896,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (!once) { once = true; - printk_sched("sched: RT throttling activated\n"); + printk_deferred("sched: RT throttling activated\n"); } } else { /* -- cgit v1.2.3