diff options
-rw-r--r-- | Documentation/sched-stats.txt | 195 | ||||
-rw-r--r-- | include/linux/preempt.h | 44 | ||||
-rw-r--r-- | include/linux/sched.h | 23 | ||||
-rw-r--r-- | kernel/Kconfig.preempt | 3 | ||||
-rw-r--r-- | kernel/sched.c | 204 | ||||
-rw-r--r-- | kernel/sched_debug.c | 2 |
6 files changed, 365 insertions, 106 deletions
diff --git a/Documentation/sched-stats.txt b/Documentation/sched-stats.txt index 6f72021aae51..442e14d35dea 100644 --- a/Documentation/sched-stats.txt +++ b/Documentation/sched-stats.txt @@ -1,10 +1,11 @@ -Version 10 of schedstats includes support for sched_domains, which -hit the mainline kernel in 2.6.7. Some counters make more sense to be -per-runqueue; other to be per-domain. Note that domains (and their associated -information) will only be pertinent and available on machines utilizing -CONFIG_SMP. - -In version 10 of schedstat, there is at least one level of domain +Version 14 of schedstats includes support for sched_domains, which hit the +mainline kernel in 2.6.20 although it is identical to the stats from version +12 which was in the kernel from 2.6.13-2.6.19 (version 13 never saw a kernel +release). Some counters make more sense to be per-runqueue; other to be +per-domain. Note that domains (and their associated information) will only +be pertinent and available on machines utilizing CONFIG_SMP. + +In version 14 of schedstat, there is at least one level of domain statistics for each cpu listed, and there may well be more than one domain. Domains have no particular names in this implementation, but the highest numbered one typically arbitrates balancing across all the @@ -27,7 +28,7 @@ to write their own scripts, the fields are described here. CPU statistics -------------- -cpu<N> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 +cpu<N> 1 2 3 4 5 6 7 8 9 10 11 12 NOTE: In the sched_yield() statistics, the active queue is considered empty if it has only one process in it, since obviously the process calling @@ -39,48 +40,20 @@ First four fields are sched_yield() statistics: 3) # of times just the expired queue was empty 4) # of times sched_yield() was called -Next four are schedule() statistics: - 5) # of times the active queue had at least one other process on it - 6) # of times we switched to the expired queue and reused it - 7) # of times schedule() was called - 8) # of times schedule() left the processor idle - -Next four are active_load_balance() statistics: - 9) # of times active_load_balance() was called - 10) # of times active_load_balance() caused this cpu to gain a task - 11) # of times active_load_balance() caused this cpu to lose a task - 12) # of times active_load_balance() tried to move a task and failed - -Next three are try_to_wake_up() statistics: - 13) # of times try_to_wake_up() was called - 14) # of times try_to_wake_up() successfully moved the awakening task - 15) # of times try_to_wake_up() attempted to move the awakening task - -Next two are wake_up_new_task() statistics: - 16) # of times wake_up_new_task() was called - 17) # of times wake_up_new_task() successfully moved the new task - -Next one is a sched_migrate_task() statistic: - 18) # of times sched_migrate_task() was called +Next three are schedule() statistics: + 5) # of times we switched to the expired queue and reused it + 6) # of times schedule() was called + 7) # of times schedule() left the processor idle -Next one is a sched_balance_exec() statistic: - 19) # of times sched_balance_exec() was called +Next two are try_to_wake_up() statistics: + 8) # of times try_to_wake_up() was called + 9) # of times try_to_wake_up() was called to wake up the local cpu Next three are statistics describing scheduling latency: - 20) sum of all time spent running by tasks on this processor (in ms) - 21) sum of all time spent waiting to run by tasks on this processor (in ms) - 22) # of tasks (not necessarily unique) given to the processor - -The last six are statistics dealing with pull_task(): - 23) # of times pull_task() moved a task to this cpu when newly idle - 24) # of times pull_task() stole a task from this cpu when another cpu - was newly idle - 25) # of times pull_task() moved a task to this cpu when idle - 26) # of times pull_task() stole a task from this cpu when another cpu - was idle - 27) # of times pull_task() moved a task to this cpu when busy - 28) # of times pull_task() stole a task from this cpu when another cpu - was busy + 10) sum of all time spent running by tasks on this processor (in jiffies) + 11) sum of all time spent waiting to run by tasks on this processor (in + jiffies) + 12) # of timeslices run on this cpu Domain statistics @@ -89,65 +62,95 @@ One of these is produced per domain for each cpu described. (Note that if CONFIG_SMP is not defined, *no* domains are utilized and these lines will not appear in the output.) -domain<N> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 +domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 The first field is a bit mask indicating what cpus this domain operates over. -The next fifteen are a variety of load_balance() statistics: - - 1) # of times in this domain load_balance() was called when the cpu - was idle - 2) # of times in this domain load_balance() was called when the cpu - was busy - 3) # of times in this domain load_balance() was called when the cpu - was just becoming idle - 4) # of times in this domain load_balance() tried to move one or more - tasks and failed, when the cpu was idle - 5) # of times in this domain load_balance() tried to move one or more - tasks and failed, when the cpu was busy - 6) # of times in this domain load_balance() tried to move one or more - tasks and failed, when the cpu was just becoming idle - 7) sum of imbalances discovered (if any) with each call to - load_balance() in this domain when the cpu was idle - 8) sum of imbalances discovered (if any) with each call to - load_balance() in this domain when the cpu was busy - 9) sum of imbalances discovered (if any) with each call to - load_balance() in this domain when the cpu was just becoming idle - 10) # of times in this domain load_balance() was called but did not find - a busier queue while the cpu was idle - 11) # of times in this domain load_balance() was called but did not find - a busier queue while the cpu was busy - 12) # of times in this domain load_balance() was called but did not find - a busier queue while the cpu was just becoming idle - 13) # of times in this domain a busier queue was found while the cpu was - idle but no busier group was found - 14) # of times in this domain a busier queue was found while the cpu was - busy but no busier group was found - 15) # of times in this domain a busier queue was found while the cpu was - just becoming idle but no busier group was found - -Next two are sched_balance_exec() statistics: - 17) # of times in this domain sched_balance_exec() successfully pushed - a task to a new cpu - 18) # of times in this domain sched_balance_exec() tried but failed to - push a task to a new cpu - -Next two are try_to_wake_up() statistics: - 19) # of times in this domain try_to_wake_up() tried to move a task based - on affinity and cache warmth - 20) # of times in this domain try_to_wake_up() tried to move a task based - on load balancing - +The next 24 are a variety of load_balance() statistics in grouped into types +of idleness (idle, busy, and newly idle): + + 1) # of times in this domain load_balance() was called when the + cpu was idle + 2) # of times in this domain load_balance() checked but found + the load did not require balancing when the cpu was idle + 3) # of times in this domain load_balance() tried to move one or + more tasks and failed, when the cpu was idle + 4) sum of imbalances discovered (if any) with each call to + load_balance() in this domain when the cpu was idle + 5) # of times in this domain pull_task() was called when the cpu + was idle + 6) # of times in this domain pull_task() was called even though + the target task was cache-hot when idle + 7) # of times in this domain load_balance() was called but did + not find a busier queue while the cpu was idle + 8) # of times in this domain a busier queue was found while the + cpu was idle but no busier group was found + + 9) # of times in this domain load_balance() was called when the + cpu was busy + 10) # of times in this domain load_balance() checked but found the + load did not require balancing when busy + 11) # of times in this domain load_balance() tried to move one or + more tasks and failed, when the cpu was busy + 12) sum of imbalances discovered (if any) with each call to + load_balance() in this domain when the cpu was busy + 13) # of times in this domain pull_task() was called when busy + 14) # of times in this domain pull_task() was called even though the + target task was cache-hot when busy + 15) # of times in this domain load_balance() was called but did not + find a busier queue while the cpu was busy + 16) # of times in this domain a busier queue was found while the cpu + was busy but no busier group was found + + 17) # of times in this domain load_balance() was called when the + cpu was just becoming idle + 18) # of times in this domain load_balance() checked but found the + load did not require balancing when the cpu was just becoming idle + 19) # of times in this domain load_balance() tried to move one or more + tasks and failed, when the cpu was just becoming idle + 20) sum of imbalances discovered (if any) with each call to + load_balance() in this domain when the cpu was just becoming idle + 21) # of times in this domain pull_task() was called when newly idle + 22) # of times in this domain pull_task() was called even though the + target task was cache-hot when just becoming idle + 23) # of times in this domain load_balance() was called but did not + find a busier queue while the cpu was just becoming idle + 24) # of times in this domain a busier queue was found while the cpu + was just becoming idle but no busier group was found + + Next three are active_load_balance() statistics: + 25) # of times active_load_balance() was called + 26) # of times active_load_balance() tried to move a task and failed + 27) # of times active_load_balance() successfully moved a task + + Next three are sched_balance_exec() statistics: + 28) sbe_cnt is not used + 29) sbe_balanced is not used + 30) sbe_pushed is not used + + Next three are sched_balance_fork() statistics: + 31) sbf_cnt is not used + 32) sbf_balanced is not used + 33) sbf_pushed is not used + + Next three are try_to_wake_up() statistics: + 34) # of times in this domain try_to_wake_up() awoke a task that + last ran on a different cpu in this domain + 35) # of times in this domain try_to_wake_up() moved a task to the + waking cpu because it was cache-cold on its own cpu anyway + 36) # of times in this domain try_to_wake_up() started passive balancing /proc/<pid>/schedstat ---------------- schedstats also adds a new /proc/<pid/schedstat file to include some of the same information on a per-process level. There are three fields in -this file correlating to fields 20, 21, and 22 in the CPU fields, but -they only apply for that process. +this file correlating for that process to: + 1) time spent on the cpu + 2) time spent waiting on a runqueue + 3) # of timeslices run on this cpu A program could be easily written to make use of these extra fields to report on how well a particular process or set of processes is faring under the scheduler's policies. A simple version of such a program is available at - http://eaglet.rain.com/rick/linux/schedstat/v10/latency.c + http://eaglet.rain.com/rick/linux/schedstat/v12/latency.c diff --git a/include/linux/preempt.h b/include/linux/preempt.h index d0926d63406c..484988ed301e 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -8,6 +8,7 @@ #include <linux/thread_info.h> #include <linux/linkage.h> +#include <linux/list.h> #ifdef CONFIG_DEBUG_PREEMPT extern void fastcall add_preempt_count(int val); @@ -60,4 +61,47 @@ do { \ #endif +#ifdef CONFIG_PREEMPT_NOTIFIERS + +struct preempt_notifier; + +/** + * preempt_ops - notifiers called when a task is preempted and rescheduled + * @sched_in: we're about to be rescheduled: + * notifier: struct preempt_notifier for the task being scheduled + * cpu: cpu we're scheduled on + * @sched_out: we've just been preempted + * notifier: struct preempt_notifier for the task being preempted + * next: the task that's kicking us out + */ +struct preempt_ops { + void (*sched_in)(struct preempt_notifier *notifier, int cpu); + void (*sched_out)(struct preempt_notifier *notifier, + struct task_struct *next); +}; + +/** + * preempt_notifier - key for installing preemption notifiers + * @link: internal use + * @ops: defines the notifier functions to be called + * + * Usually used in conjunction with container_of(). + */ +struct preempt_notifier { + struct hlist_node link; + struct preempt_ops *ops; +}; + +void preempt_notifier_register(struct preempt_notifier *notifier); +void preempt_notifier_unregister(struct preempt_notifier *notifier); + +static inline void preempt_notifier_init(struct preempt_notifier *notifier, + struct preempt_ops *ops) +{ + INIT_HLIST_NODE(¬ifier->link); + notifier->ops = ops; +} + +#endif + #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 33b9b4841ee7..2e490271acf6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -681,7 +681,7 @@ enum cpu_idle_type { #define SCHED_LOAD_SHIFT 10 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) -#define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 5) +#define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1) #ifdef CONFIG_SMP #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ @@ -786,6 +786,22 @@ extern int partition_sched_domains(cpumask_t *partition1, #endif /* CONFIG_SMP */ +/* + * A runqueue laden with a single nice 0 task scores a weighted_cpuload of + * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a + * task of nice 0 or enough lower priority tasks to bring up the + * weighted_cpuload + */ +static inline int above_background_load(void) +{ + unsigned long cpu; + + for_each_online_cpu(cpu) { + if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) + return 1; + } + return 0; +} struct io_context; /* See blkdev.h */ struct cpuset; @@ -935,6 +951,11 @@ struct task_struct { struct sched_class *sched_class; struct sched_entity se; +#ifdef CONFIG_PREEMPT_NOTIFIERS + /* list of struct preempt_notifier: */ + struct hlist_head preempt_notifiers; +#endif + unsigned short ioprio; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c64ce9c14207..6b066632e40c 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -63,3 +63,6 @@ config PREEMPT_BKL Say Y here if you are building a kernel for a desktop system. Say N if you are unsure. +config PREEMPT_NOTIFIERS + bool + diff --git a/kernel/sched.c b/kernel/sched.c index 93cf241cfbe9..5c51d7e5dcc1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -53,6 +53,7 @@ #include <linux/percpu.h> #include <linux/kthread.h> #include <linux/seq_file.h> +#include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> #include <linux/tsacct_kern.h> @@ -263,8 +264,6 @@ struct rq { unsigned int clock_warps, clock_overflows; unsigned int clock_unstable_events; - struct sched_class *load_balance_class; - atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -385,13 +384,12 @@ static inline unsigned long long rq_clock(struct rq *rq) */ unsigned long long cpu_clock(int cpu) { - struct rq *rq = cpu_rq(cpu); unsigned long long now; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); - now = rq_clock(rq); - spin_unlock_irqrestore(&rq->lock, flags); + local_irq_save(flags); + now = rq_clock(cpu_rq(cpu)); + local_irq_restore(flags); return now; } @@ -1592,6 +1590,10 @@ static void __sched_fork(struct task_struct *p) INIT_LIST_HEAD(&p->run_list); p->se.on_rq = 0; +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1673,6 +1675,63 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) task_rq_unlock(rq, &flags); } +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being being preempted + * and rescheduled + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * + * This is safe to call from within a preemption notifier. + */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +#else + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -1685,8 +1744,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * prepare_task_switch sets up locking and calls architecture specific * hooks. */ -static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) { + fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -1728,6 +1790,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); + fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { @@ -1768,7 +1831,7 @@ context_switch(struct rq *rq, struct task_struct *prev, { struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, next); + prepare_task_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -5140,10 +5203,129 @@ static void migrate_dead_tasks(unsigned int dead_cpu) if (!next) break; migrate_dead(dead_cpu, next); + } } #endif /* CONFIG_HOTPLUG_CPU */ +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, }, + {0,}, +}; + +static struct ctl_table sd_ctl_root[] = { + {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, }, + {0,}, +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + + BUG_ON(!entry); + memset(entry, 0, n * sizeof(struct ctl_table)); + + return entry; +} + +static void +set_table_entry(struct ctl_table *entry, int ctl_name, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->ctl_name = ctl_name; + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time, + sizeof(long long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[10], 11, "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[12], 13, "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->ctl_name = i + 1; + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0755; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void init_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + sd_ctl_dir[0].child = entry; + + for (i = 0; i < cpu_num; i++, entry++) { + snprintf(buf, 32, "cpu%d", i); + entry->ctl_name = i + 1; + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0755; + entry->child = sd_alloc_ctl_cpu_table(i); + } + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} +#else +static void init_sched_domain_sysctl(void) +{ +} +#endif + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -6249,6 +6431,8 @@ void __init sched_init_smp(void) /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_sched_domain_sysctl(); + /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); @@ -6335,6 +6519,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + #ifdef CONFIG_SMP nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 29f2c21e7da2..42970f723a97 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -186,7 +186,7 @@ static int sched_debug_show(struct seq_file *m, void *v) return 0; } -void sysrq_sched_debug_show(void) +static void sysrq_sched_debug_show(void) { sched_debug_show(NULL, NULL); } |