From 97fb7a0a8944bd6d2c5634e1e0fa689a5c40bc22 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Mar 2018 14:01:12 +0100 Subject: sched: Clean up and harmonize the coding style of the scheduler code base A good number of small style inconsistencies have accumulated in the scheduler core, so do a pass over them to harmonize all these details: - fix speling in comments, - use curly braces for multi-line statements, - remove unnecessary parentheses from integer literals, - capitalize consistently, - remove stray newlines, - add comments where necessary, - remove invalid/unnecessary comments, - align structure definitions and other data types vertically, - add missing newlines for increased readability, - fix vertical tabulation where it's misaligned, - harmonize preprocessor conditional block labeling and vertical alignment, - remove line-breaks where they uglify the code, - add newline after local variable definitions, No change in functionality: md5: 1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm 1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 88 ++++++++++++++++++++-------------------------------- 1 file changed, 34 insertions(+), 54 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1ca0130ed4f9..7c82a9b88510 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -9,7 +9,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - #include #include #include @@ -274,34 +273,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) if (table == NULL) return NULL; - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); + set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2] , "busy_idx", 
&sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); /* &table[13] is terminator */ return table; @@ -332,8 +316,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) return table; } -static cpumask_var_t sd_sysctl_cpus; -static struct ctl_table_header *sd_sysctl_header; +static cpumask_var_t sd_sysctl_cpus; +static struct ctl_table_header *sd_sysctl_header; void register_sched_domain_sysctl(void) { @@ -413,14 +397,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group { struct sched_entity *se = tg->se[cpu]; -#define P(F) \ - SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) -#define P_SCHEDSTAT(F) \ - SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) -#define PN(F) \ - SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN_SCHEDSTAT(F) \ - SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) +#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) +#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) if (!se) return; @@ -428,6 +408,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); + if (schedstat_enabled()) { PN_SCHEDSTAT(se->statistics.wait_start); PN_SCHEDSTAT(se->statistics.sleep_start); @@ -440,6 +421,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN_SCHEDSTAT(se->statistics.wait_sum); P_SCHEDSTAT(se->statistics.wait_count); } + P(se->load.weight); P(se->runnable_weight); #ifdef CONFIG_SMP @@ -464,6 +446,7 @@ static char *task_group_path(struct task_group *tg) return group_path; cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif @@ -799,9 +782,9 @@ void sysrq_sched_debug_show(void) /* * This itererator needs some explanation. * It returns 1 for the header position. - * This means 2 is cpu 0. - * In a hotplugged system some cpus, including cpu 0, may be missing so we have - * to use cpumask_* to iterate over the cpus. + * This means 2 is CPU 0. + * In a hotplugged system some CPUs, including CPU 0, may be missing so we have + * to use cpumask_* to iterate over the CPUs. 
*/ static void *sched_debug_start(struct seq_file *file, loff_t *offset) { @@ -821,6 +804,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset) if (n < nr_cpu_ids) return (void *)(unsigned long)(n + 2); + return NULL; } @@ -835,10 +819,10 @@ static void sched_debug_stop(struct seq_file *file, void *data) } static const struct seq_operations sched_debug_sops = { - .start = sched_debug_start, - .next = sched_debug_next, - .stop = sched_debug_stop, - .show = sched_debug_show, + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, }; static int sched_debug_release(struct inode *inode, struct file *file) @@ -876,14 +860,10 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); -#define __P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define __PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) +#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From 325ea10c0809406ce23f038602abbc454f3f761d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Mar 2018 12:20:47 +0100 Subject: sched/headers: Simplify and clean up header usage in the scheduler Do the following cleanups and simplifications: - sched/sched.h already includes , so no need to include it in sched/core.c again. - order the headers alphabetically - add all headers to kernel/sched/sched.h - remove all unnecessary includes from the .c files that are already included in kernel/sched/sched.h. Finally, make all scheduler .c files use a single common header: #include "sched.h" ... which now contains a union of the relied upon headers. This makes the various .c files easier to read and easier to handle. 
Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/deadline.h | 6 --- kernel/sched/autogroup.c | 9 ++--- kernel/sched/autogroup.h | 4 -- kernel/sched/clock.c | 14 +------ kernel/sched/completion.c | 5 +-- kernel/sched/core.c | 40 +++++++------------- kernel/sched/cpuacct.c | 13 +------ kernel/sched/cpudeadline.c | 5 +-- kernel/sched/cpudeadline.h | 2 - kernel/sched/cpufreq.c | 1 - kernel/sched/cpufreq_schedutil.c | 8 +--- kernel/sched/cpupri.c | 6 +-- kernel/sched/cpupri.h | 1 - kernel/sched/cputime.c | 10 ++--- kernel/sched/deadline.c | 3 -- kernel/sched/debug.c | 11 +----- kernel/sched/fair.c | 16 +------- kernel/sched/idle.c | 15 +------- kernel/sched/idle_task.c | 5 +-- kernel/sched/isolation.c | 7 ---- kernel/sched/loadavg.c | 4 -- kernel/sched/membarrier.c | 9 +---- kernel/sched/rt.c | 4 -- kernel/sched/sched.h | 81 ++++++++++++++++++++++++++-------------- kernel/sched/stats.c | 13 +++---- kernel/sched/swait.c | 3 +- kernel/sched/topology.c | 4 -- kernel/sched/wait.c | 9 +---- kernel/sched/wait_bit.c | 5 +-- 29 files changed, 94 insertions(+), 219 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index a5bc8728ead7..0cb034331cbb 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -1,8 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SCHED_DEADLINE_H -#define _LINUX_SCHED_DEADLINE_H - -#include /* * SCHED_DEADLINE tasks has negative priorities, reflecting @@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b) { return (s64)(a - b) < 0; } - -#endif /* _LINUX_SCHED_DEADLINE_H */ diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index ff1b7b647b86..6be6c575b6cd 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,10 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - +/* + * Auto-group scheduling implementation: + */ #include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 49e6ec9559cf..b96419974a1f 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,10 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_SCHED_AUTOGROUP -#include -#include -#include - struct autogroup { /* * Reference doesn't mean how many threads attach to this diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 7da6bec8a2ff..10c83e73837a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -52,19 +52,7 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" /* * Scheduler clock - returns current time in nanosec units. diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 0926aef10dad..5d2d56b0817a 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -11,10 +11,7 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. 
*/ - -#include -#include -#include +#include "sched.h" /** * complete: - signals a single thread waiting on this completion diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9427b59551c1..e1e334ba8ff9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5,37 +5,11 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" #include #include -#ifdef CONFIG_PARAVIRT -#include -#endif -#include "sched.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) raw_spin_unlock_irq(&rq->lock); } +/* + * NOP if the arch has not defined these: + */ + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif + +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 1abd325e733a..9fbb10383434 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,22 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sched.h" - /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index cb172b61d191..50316455ea66 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -10,10 +10,7 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include -#include -#include -#include "cpudeadline.h" +#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index c26e7a0e5a66..0adeda93b5fb 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,6 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include -#include #define IDX_INVALID -1 diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index dbc51442ecbc..5e54cbcae673 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -8,7 +8,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - #include "sched.h" DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 0dad8160e00f..feb5f89020f2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -11,14 +11,10 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include - #include "sched.h" +#include + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index f43e14ccb67d..daaadf939ccb 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -26,11 +26,7 @@ * as published by the Free Software Foundation; version 2 * of the License. 
*/ -#include -#include -#include -#include -#include "cpupri.h" +#include "sched.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ static int convert_prio(int prio) diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 141a06c914c6..7dc20a3232e7 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,5 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d3b450b57ade..0796f938c4f0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,10 +1,6 @@ -#include -#include -#include -#include -#include -#include -#include +/* + * Simple CPU accounting cgroup controller + */ #include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 58f8b7b37983..af491f537636 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,9 +17,6 @@ */ #include "sched.h" -#include -#include - struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7c82a9b88510..644d9a464380 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1,7 +1,7 @@ /* * kernel/sched/debug.c * - * Print the CFS rbtree + * Print the CFS rbtree and other debugging details * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar * @@ -9,15 +9,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include -#include -#include -#include -#include - #include "sched.h" static DEFINE_SPINLOCK(sched_debug_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f877de96c9b..f5591071ae98 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,24 +20,10 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" #include -#include "sched.h" - /* * Targeted preemption latency for CPU-bound tasks: * diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 343d25f85477..2760e0357271 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,23 +1,10 @@ /* * Generic entry points for the idle threads */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#include "sched.h" #include -#include "sched.h" - /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index ec73680922f8..488222ac4651 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - /* * idle-task scheduling class. 
* - * (NOTE: these are not related to SCHED_IDLE tasks which are + * (NOTE: these are not related to SCHED_IDLE batch scheduling tasks which are * handled in sched/fair.c) */ +#include "sched.h" #ifdef CONFIG_SMP static int diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index aad5f48a07c6..e6802181900f 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -6,13 +6,6 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ -#include -#include -#include -#include -#include -#include - #include "sched.h" DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a398e7e28a8a..a171c1258109 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,10 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ - -#include -#include - #include "sched.h" /* diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 2c6ae2413fa2..76e0eaf4654e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -13,14 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - -#include -#include -#include -#include -#include - -#include "sched.h" /* for cpu_rq(). */ +#include "sched.h" /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e40498872111..a3d438fec46c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,12 +3,8 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ - #include "sched.h" -#include -#include - int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bd1461ae06e4..23ba4dd76ac4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,39 +3,71 @@ * Scheduler internal types and methods: */ #include + #include -#include -#include -#include -#include #include -#include -#include -#include -#include +#include #include -#include -#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include +#include +#include +#include +#include + +#include -#include -#include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include +#include + +#include #ifdef CONFIG_PARAVIRT -#include +# include #endif #include "cpupri.h" @@ -1357,13 +1389,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) return p->on_rq == TASK_ON_RQ_MIGRATING; } -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_post_lock_switch -# define finish_arch_post_lock_switch() do { } while (0) -#endif - /* * wake flags */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 968c1fe3099a..ab112cbfd7c8 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -1,14 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include - +/* + * /proc/schedstat implementation + */ 
#include "sched.h" /* - * bump this up when changing the output format or the meaning of an existing + * Current schedstat API version. + * + * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ #define SCHEDSTAT_VERSION 15 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index b88ab4e0207f..b6fb2c3b3ff7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,8 +2,7 @@ /* * (simple wait queues ) implementation: */ -#include -#include +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 219eee70e457..64cc564f5255 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,10 +2,6 @@ /* * Scheduler topology setup/handling methods */ -#include -#include -#include - #include "sched.h" DEFINE_MUTEX(sched_domains_mutex); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 7b2a142ae629..928be527477e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -3,14 +3,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 5293c59163a6..4239c78f5cd3 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,10 +1,7 @@ /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include -#include -#include -#include +#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) -- cgit v1.2.3 From 7f65ea42eb00bc902f1c37a71e984e4f4064cfa9 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 9 Mar 2018 09:52:42 +0000 Subject: sched/fair: Add util_est on top of PELT The util_avg signal computed by PELT is too variable for some use-cases. For example, a big task waking up after a long sleep period will have its utilization almost completely decayed. This introduces some latency before schedutil will be able to pick the best frequency to run a task. The same issue can affect task placement. Indeed, since the task utilization is already decayed at wakeup, when the task is enqueued in a CPU, this can result in a CPU running a big task as being temporarily represented as being almost empty. This leads to a race condition where other tasks can be potentially allocated on a CPU which just started to run a big task which slept for a relatively long period. Moreover, the PELT utilization of a task can be updated every [ms], thus making it a continuously changing value for certain longer running tasks. This means that the instantaneous PELT utilization of a RUNNING task is not really meaningful to properly support scheduler decisions. For all these reasons, a more stable signal can do a better job of representing the expected/estimated utilization of a task/cfs_rq. Such a signal can be easily created on top of PELT by still using it as an estimator which produces values to be aggregated on meaningful events. 
This patch adds a simple implementation of util_est, a new signal built on top of PELT's util_avg where: util_est(task) = max(task::util_avg, f(task::util_avg@dequeue)) This allows to remember how big a task has been reported by PELT in its previous activations via f(task::util_avg@dequeue), which is the new _task_util_est(struct task_struct*) function added by this patch. If a task should change its behavior and it runs longer in a new activation, after a certain time its util_est will just track the original PELT signal (i.e. task::util_avg). The estimated utilization of cfs_rq is defined only for root ones. That's because the only sensible consumer of this signal are the scheduler and schedutil when looking for the overall CPU utilization due to FAIR tasks. For this reason, the estimated utilization of a root cfs_rq is simply defined as: util_est(cfs_rq) = max(cfs_rq::util_avg, cfs_rq::util_est::enqueued) where: cfs_rq::util_est::enqueued = sum(_task_util_est(task)) for each RUNNABLE task on that root cfs_rq It's worth noting that the estimated utilization is tracked only for objects of interests, specifically: - Tasks: to better support tasks placement decisions - root cfs_rqs: to better support both tasks placement decisions as well as frequencies selection Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20180309095245.11071-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 29 ++++++++++++ kernel/sched/debug.c | 4 ++ kernel/sched/fair.c | 122 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/features.h | 5 ++ 4 files changed, 154 insertions(+), 6 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 21b1168da951..f228c6033832 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -274,6 +274,34 @@ struct load_weight { u32 inv_weight; }; +/** + * struct util_est - Estimation utilization of FAIR tasks + * @enqueued: instantaneous estimated utilization of a task/cpu + * @ewma: the Exponential Weighted Moving Average (EWMA) + * utilization of a task + * + * Support data structure to track an Exponential Weighted Moving Average + * (EWMA) of a FAIR task's utilization. New samples are added to the moving + * average each time a task completes an activation. Sample's weight is chosen + * so that the EWMA will be relatively insensitive to transient changes to the + * task's workload. + * + * The enqueued attribute has a slightly different meaning for tasks and cpus: + * - task: the task's util_avg at last task dequeue time + * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU + * Thus, the util_est.enqueued of a task represents the contribution on the + * estimated utilization of the CPU where that task is currently enqueued. + * + * Only for tasks we track a moving average of the past instantaneous + * estimated utilization. This allows to absorb sporadic drops in utilization + * of an otherwise almost periodic task. + */ +struct util_est { + unsigned int enqueued; + unsigned int ewma; +#define UTIL_EST_WEIGHT_SHIFT 2 +}; + /* * The load_avg/util_avg accumulates an infinite geometric series * (see __update_load_avg() in kernel/sched/fair.c). 
@@ -335,6 +363,7 @@ struct sched_avg { unsigned long load_avg; unsigned long runnable_load_avg; unsigned long util_avg; + struct util_est util_est; }; struct sched_statistics { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 644d9a464380..332303be4beb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -541,6 +541,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", + cfs_rq->avg.util_est.enqueued); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", @@ -989,6 +991,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); + P(se.avg.util_est.ewma); + P(se.avg.util_est.enqueued); #endif P(policy); P(prio); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3582117e1580..22b59a7facd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3873,6 +3873,113 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) static int idle_balance(struct rq *this_rq, struct rq_flags *rf); +static inline unsigned long task_util(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg); +} + +static inline unsigned long _task_util_est(struct task_struct *p) +{ + struct util_est ue = READ_ONCE(p->se.avg.util_est); + + return max(ue.ewma, ue.enqueued); +} + +static inline unsigned long task_util_est(struct task_struct *p) +{ + return max(task_util(p), _task_util_est(p)); +} + +static inline void util_est_enqueue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued += _task_util_est(p); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); +} + +/* + * Check if a (signed) value is within a specified (unsigned) margin, + * based on the observation that: + * + * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) + * + * NOTE: this only works when value + maring < INT_MAX. + */ +static inline bool within_margin(int value, int margin) +{ + return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); +} + +static void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +{ + long last_ewma_diff; + struct util_est ue; + + if (!sched_feat(UTIL_EST)) + return; + + /* + * Update root cfs_rq's estimated utilization + * + * If *p is the last task then the root cfs_rq's estimated utilization + * of a CPU is 0 by definition. + */ + ue.enqueued = 0; + if (cfs_rq->nr_running) { + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + _task_util_est(p)); + } + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + + /* + * Skip update of task's estimated utilization when the task has not + * yet completed an activation, e.g. being migrated. + */ + if (!task_sleep) + return; + + /* + * Skip update of task's estimated utilization when its EWMA is + * already ~1% close to its last activation value. 
+ */ + ue = p->se.avg.util_est; + ue.enqueued = task_util(p); + last_ewma_diff = ue.enqueued - ue.ewma; + if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) + return; + + /* + * Update Task's estimated utilization + * + * When *p completes an activation we can consolidate another sample + * of the task size. This is done by storing the current PELT value + * as ue.enqueued and by using this value to update the Exponential + * Weighted Moving Average (EWMA): + * + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) + * = w * ( last_ewma_diff ) + ewma(t-1) + * = w * (last_ewma_diff + ewma(t-1) / w) + * + * Where 'w' is the weight of new samples, which is configured to be + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) + */ + ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; + ue.ewma += last_ewma_diff; + ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + WRITE_ONCE(p->se.avg.util_est, ue); +} + #else /* CONFIG_SMP */ static inline int @@ -3902,6 +4009,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) return 0; } +static inline void +util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} + #endif /* CONFIG_SMP */ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -5249,6 +5363,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) add_nr_running(rq, 1); + util_est_enqueue(&rq->cfs, p); hrtick_update(rq); } @@ -5308,6 +5423,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5835,7 +5951,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static inline unsigned long task_util(struct task_struct *p); static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) @@ -6351,11 +6466,6 @@ static unsigned long cpu_util(int cpu) return (util >= capacity) ? capacity : util; } -static inline unsigned long task_util(struct task_struct *p) -{ - return p->se.avg.util_avg; -} - /* * cpu_util_wake: Compute CPU utilization with any contributions from * the waking task p removed. diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9552fd5854bf..c459a4b61544 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) SCHED_FEAT(WA_IDLE, true) SCHED_FEAT(WA_WEIGHT, true) SCHED_FEAT(WA_BIAS, true) + +/* + * UtilEstimation. Use estimated CPU utilization. + */ +SCHED_FEAT(UTIL_EST, false) -- cgit v1.2.3
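To tie the above together, here is a small self-contained userspace sketch of the per-task util_est mechanism that the last patch introduces (an approximation for illustration, not the kernel code: it reuses the names util_est, enqueued, ewma and UTIL_EST_WEIGHT_SHIFT from the patch, assumes every dequeue is a sleep, and drops the cfs_rq aggregation, the READ_ONCE()/WRITE_ONCE() annotations and the sched_feat(UTIL_EST) gate; the sample utilization values are made up):

    #include <stdio.h>

    #define UTIL_EST_WEIGHT_SHIFT	2	/* EWMA weight w = 1/4, as in the patch */
    #define SCHED_CAPACITY_SCALE	1024

    struct util_est {
    	unsigned int enqueued;	/* util_avg sampled at the last dequeue */
    	unsigned int ewma;	/* slow-moving average of those samples */
    };

    /*
     * max(ewma, last dequeue sample): how big the task "usually" is.
     * The kernel's task_util_est() additionally takes the max with the
     * live util_avg of the task.
     */
    static unsigned int _task_util_est(const struct util_est *ue)
    {
    	return ue->ewma > ue->enqueued ? ue->ewma : ue->enqueued;
    }

    /* Roughly what util_est_dequeue() does for the task when it goes to sleep */
    static void util_est_update(struct util_est *ue, unsigned int util_at_dequeue)
    {
    	struct util_est tmp = *ue;
    	int margin = SCHED_CAPACITY_SCALE / 100;	/* ~1% of full scale */
    	int last_ewma_diff;

    	tmp.enqueued = util_at_dequeue;
    	last_ewma_diff = (int)tmp.enqueued - (int)tmp.ewma;

    	/* Skip the update when the new sample is already within ~1% of the EWMA */
    	if (last_ewma_diff > -margin && last_ewma_diff < margin)
    		return;

    	/* ewma(t) = w * sample + (1 - w) * ewma(t-1), with w = 1/4 via shifts */
    	tmp.ewma <<= UTIL_EST_WEIGHT_SHIFT;
    	tmp.ewma += last_ewma_diff;
    	tmp.ewma >>= UTIL_EST_WEIGHT_SHIFT;

    	*ue = tmp;
    }

    int main(void)
    {
    	struct util_est ue = { 0, 0 };
    	/* Made-up util_avg values seen at successive dequeues of one task */
    	unsigned int samples[] = { 600, 620, 80, 610, 590 };

    	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
    		util_est_update(&ue, samples[i]);
    		printf("dequeue %u: sample=%3u ewma=%3u -> estimate=%3u\n",
    		       i + 1, samples[i], ue.ewma, _task_util_est(&ue));
    	}

    	return 0;
    }

Running this shows the property described in the changelog: after the one low sample (80) the estimate only drops to the slow-moving EWMA (~220) rather than to the instantaneous value, so a single short activation does not make an otherwise big, almost-periodic task look small.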