summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArjan van de Ven <arjan@linux.intel.com>2008-01-25 21:08:34 +0100
committerIngo Molnar <mingo@elte.hu>2008-01-25 21:08:34 +0100
commit9745512ce79de686df354dc70a8d1a74d801892d (patch)
tree9b64e2b2e6d2ae534beef136922082f21701c7b9
parent326587b840785c60f5dc18557235a23bafefd620 (diff)
downloadtalos-obmc-linux-9745512ce79de686df354dc70a8d1a74d801892d.tar.gz
talos-obmc-linux-9745512ce79de686df354dc70a8d1a74d801892d.zip
sched: latencytop support
LatencyTOP kernel infrastructure; it measures latencies in the scheduler and tracks it system wide and per process. Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--arch/x86/kernel/stacktrace.c27
-rw-r--r--fs/proc/base.c78
-rw-r--r--include/linux/latencytop.h44
-rw-r--r--include/linux/sched.h5
-rw-r--r--include/linux/stacktrace.h3
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/latencytop.c239
-rw-r--r--kernel/sched_fair.c8
-rw-r--r--kernel/sysctl.c10
-rw-r--r--lib/Kconfig.debug14
11 files changed, 429 insertions, 1 deletions
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6fa6cf036c70..55771fd7e545 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -33,6 +33,19 @@ static void save_stack_address(void *data, unsigned long addr)
trace->entries[trace->nr_entries++] = addr;
}
+static void save_stack_address_nosched(void *data, unsigned long addr)
+{
+ struct stack_trace *trace = (struct stack_trace *)data;
+ if (in_sched_functions(addr))
+ return;
+ if (trace->skip > 0) {
+ trace->skip--;
+ return;
+ }
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = addr;
+}
+
static const struct stacktrace_ops save_stack_ops = {
.warning = save_stack_warning,
.warning_symbol = save_stack_warning_symbol,
@@ -40,6 +53,13 @@ static const struct stacktrace_ops save_stack_ops = {
.address = save_stack_address,
};
+static const struct stacktrace_ops save_stack_ops_nosched = {
+ .warning = save_stack_warning,
+ .warning_symbol = save_stack_warning_symbol,
+ .stack = save_stack_stack,
+ .address = save_stack_address_nosched,
+};
+
/*
* Save stack-backtrace addresses into a stack_trace buffer.
*/
@@ -50,3 +70,10 @@ void save_stack_trace(struct stack_trace *trace)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL(save_stack_trace);
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+ dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7411bfb0b7cc..91fa8e6ce8ad 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
}
#endif
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+ int i;
+ struct task_struct *task = m->private;
+ seq_puts(m, "Latency Top version : v0.1\n");
+
+ for (i = 0; i < 32; i++) {
+ if (task->latency_record[i].backtrace[0]) {
+ int q;
+ seq_printf(m, "%i %li %li ",
+ task->latency_record[i].count,
+ task->latency_record[i].time,
+ task->latency_record[i].max);
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+ char sym[KSYM_NAME_LEN];
+ char *c;
+ if (!task->latency_record[i].backtrace[q])
+ break;
+ if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+ break;
+ sprint_symbol(sym, task->latency_record[i].backtrace[q]);
+ c = strchr(sym, '+');
+ if (c)
+ *c = 0;
+ seq_printf(m, "%s ", sym);
+ }
+ seq_printf(m, "\n");
+ }
+
+ }
+ return 0;
+}
+
+static int lstats_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ struct seq_file *m;
+ struct task_struct *task = get_proc_task(inode);
+
+ ret = single_open(file, lstats_show_proc, NULL);
+ if (!ret) {
+ m = file->private_data;
+ m->private = task;
+ }
+ return ret;
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offs)
+{
+ struct seq_file *m;
+ struct task_struct *task;
+
+ m = file->private_data;
+ task = m->private;
+ clear_all_latency_tracing(task);
+
+ return count;
+}
+
+static const struct file_operations proc_lstats_operations = {
+ .open = lstats_open,
+ .read = seq_read,
+ .write = lstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif
+
/* The badness from the OOM killer */
unsigned long badness(struct task_struct *p, unsigned long uptime);
static int proc_oom_score(struct task_struct *task, char *buffer)
@@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = {
};
#endif
+
#ifdef CONFIG_SCHED_DEBUG
/*
* Print out various scheduling related per-task fields:
@@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif
+#ifdef CONFIG_LATENCYTOP
+ REG("latency", S_IRUGO, lstats),
+#endif
#ifdef CONFIG_PROC_PID_CPUSET
REG("cpuset", S_IRUGO, cpuset),
#endif
@@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif
+#ifdef CONFIG_LATENCYTOP
+ REG("latency", S_IRUGO, lstats),
+#endif
#ifdef CONFIG_PROC_PID_CPUSET
REG("cpuset", S_IRUGO, cpuset),
#endif
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
new file mode 100644
index 000000000000..901c2d6377a8
--- /dev/null
+++ b/include/linux/latencytop.h
@@ -0,0 +1,44 @@
+/*
+ * latencytop.h: Infrastructure for displaying latency
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ */
+
+#ifndef _INCLUDE_GUARD_LATENCYTOP_H_
+#define _INCLUDE_GUARD_LATENCYTOP_H_
+
+#ifdef CONFIG_LATENCYTOP
+
+#define LT_SAVECOUNT 32
+#define LT_BACKTRACEDEPTH 12
+
+struct latency_record {
+ unsigned long backtrace[LT_BACKTRACEDEPTH];
+ unsigned int count;
+ unsigned long time;
+ unsigned long max;
+};
+
+
+struct task_struct;
+
+void account_scheduler_latency(struct task_struct *task, int usecs, int inter);
+
+void clear_all_latency_tracing(struct task_struct *p);
+
+#else
+
+static inline void
+account_scheduler_latency(struct task_struct *task, int usecs, int inter)
+{
+}
+
+static inline void clear_all_latency_tracing(struct task_struct *p)
+{
+}
+
+#endif
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index acadcab89ef9..dfc76e172f3f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -88,6 +88,7 @@ struct sched_param {
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
#include <linux/kobject.h>
+#include <linux/latencytop.h>
#include <asm/processor.h>
@@ -1220,6 +1221,10 @@ struct task_struct {
int make_it_fail;
#endif
struct prop_local_single dirties;
+#ifdef CONFIG_LATENCYTOP
+ int latency_record_count;
+ struct latency_record latency_record[LT_SAVECOUNT];
+#endif
};
/*
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index e7fa657d0c49..5da9794b2d78 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -9,10 +9,13 @@ struct stack_trace {
};
extern void save_stack_trace(struct stack_trace *trace);
+extern void save_stack_trace_tsk(struct task_struct *tsk,
+ struct stack_trace *trace);
extern void print_stack_trace(struct stack_trace *trace, int spaces);
#else
# define save_stack_trace(trace) do { } while (0)
+# define save_stack_trace_tsk(tsk, trace) do { } while (0)
# define print_stack_trace(trace, spaces) do { } while (0)
#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 68755cd9a7e4..390d42146267 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_LATENCYTOP) += latencytop.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c969f4fade0..39d22b3357de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1205,6 +1205,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
+ clear_all_latency_tracing(p);
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
new file mode 100644
index 000000000000..b4e3c85abe74
--- /dev/null
+++ b/kernel/latencytop.c
@@ -0,0 +1,239 @@
+/*
+ * latencytop.c: Latency display infrastructure
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/latencytop.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+
+static DEFINE_SPINLOCK(latency_lock);
+
+#define MAXLR 128
+static struct latency_record latency_record[MAXLR];
+
+int latencytop_enabled;
+
+void clear_all_latency_tracing(struct task_struct *p)
+{
+ unsigned long flags;
+
+ if (!latencytop_enabled)
+ return;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ memset(&p->latency_record, 0, sizeof(p->latency_record));
+ p->latency_record_count = 0;
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void clear_global_latency_tracing(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ memset(&latency_record, 0, sizeof(latency_record));
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void __sched
+account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
+{
+ int firstnonnull = MAXLR + 1;
+ int i;
+
+ if (!latencytop_enabled)
+ return;
+
+ /* skip kernel threads for now */
+ if (!tsk->mm)
+ return;
+
+ for (i = 0; i < MAXLR; i++) {
+ int q;
+ int same = 1;
+ /* Nothing stored: */
+ if (!latency_record[i].backtrace[0]) {
+ if (firstnonnull > i)
+ firstnonnull = i;
+ continue;
+ }
+ for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+ if (latency_record[i].backtrace[q] !=
+ lat->backtrace[q])
+ same = 0;
+ if (same && lat->backtrace[q] == 0)
+ break;
+ if (same && lat->backtrace[q] == ULONG_MAX)
+ break;
+ }
+ if (same) {
+ latency_record[i].count++;
+ latency_record[i].time += lat->time;
+ if (lat->time > latency_record[i].max)
+ latency_record[i].max = lat->time;
+ return;
+ }
+ }
+
+ i = firstnonnull;
+ if (i >= MAXLR - 1)
+ return;
+
+ /* Allocted a new one: */
+ memcpy(&latency_record[i], lat, sizeof(struct latency_record));
+}
+
+static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
+{
+ struct stack_trace trace;
+
+ memset(&trace, 0, sizeof(trace));
+ trace.max_entries = LT_BACKTRACEDEPTH;
+ trace.entries = &lat->backtrace[0];
+ trace.skip = 0;
+ save_stack_trace_tsk(tsk, &trace);
+}
+
+void __sched
+account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
+{
+ unsigned long flags;
+ int i, q;
+ struct latency_record lat;
+
+ if (!latencytop_enabled)
+ return;
+
+ /* Long interruptible waits are generally user requested... */
+ if (inter && usecs > 5000)
+ return;
+
+ memset(&lat, 0, sizeof(lat));
+ lat.count = 1;
+ lat.time = usecs;
+ lat.max = usecs;
+ store_stacktrace(tsk, &lat);
+
+ spin_lock_irqsave(&latency_lock, flags);
+
+ account_global_scheduler_latency(tsk, &lat);
+
+ /*
+ * short term hack; if we're > 32 we stop; future we recycle:
+ */
+ tsk->latency_record_count++;
+ if (tsk->latency_record_count >= LT_SAVECOUNT)
+ goto out_unlock;
+
+ for (i = 0; i < LT_SAVECOUNT ; i++) {
+ struct latency_record *mylat;
+ int same = 1;
+ mylat = &tsk->latency_record[i];
+ for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+ if (mylat->backtrace[q] !=
+ lat.backtrace[q])
+ same = 0;
+ if (same && lat.backtrace[q] == 0)
+ break;
+ if (same && lat.backtrace[q] == ULONG_MAX)
+ break;
+ }
+ if (same) {
+ mylat->count++;
+ mylat->time += lat.time;
+ if (lat.time > mylat->max)
+ mylat->max = lat.time;
+ goto out_unlock;
+ }
+ }
+
+ /* Allocated a new one: */
+ i = tsk->latency_record_count;
+ memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
+
+out_unlock:
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static int lstats_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ seq_puts(m, "Latency Top version : v0.1\n");
+
+ for (i = 0; i < MAXLR; i++) {
+ if (latency_record[i].backtrace[0]) {
+ int q;
+ seq_printf(m, "%i %li %li ",
+ latency_record[i].count,
+ latency_record[i].time,
+ latency_record[i].max);
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+ char sym[KSYM_NAME_LEN];
+ char *c;
+ if (!latency_record[i].backtrace[q])
+ break;
+ if (latency_record[i].backtrace[q] == ULONG_MAX)
+ break;
+ sprint_symbol(sym, latency_record[i].backtrace[q]);
+ c = strchr(sym, '+');
+ if (c)
+ *c = 0;
+ seq_printf(m, "%s ", sym);
+ }
+ seq_printf(m, "\n");
+ }
+ }
+ return 0;
+}
+
+static ssize_t
+lstats_write(struct file *file, const char __user *buf, size_t count,
+ loff_t *offs)
+{
+ clear_global_latency_tracing();
+
+ return count;
+}
+
+static int lstats_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, lstats_show, NULL);
+}
+
+static struct file_operations lstats_fops = {
+ .open = lstats_open,
+ .read = seq_read,
+ .write = lstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init init_lstats_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = create_proc_entry("latency_stats", 0644, NULL);
+ if (!pe)
+ return -ENOMEM;
+
+ pe->proc_fops = &lstats_fops;
+
+ return 0;
+}
+__initcall(init_lstats_procfs);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3dab1ff83c4f..1b3b40ad7c54 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -20,6 +20,8 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*/
+#include <linux/latencytop.h>
+
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -434,6 +436,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_SCHEDSTATS
if (se->sleep_start) {
u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+ struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -443,9 +446,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sleep_start = 0;
se->sum_sleep_runtime += delta;
+
+ account_scheduler_latency(tsk, delta >> 10, 1);
}
if (se->block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+ struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -462,11 +468,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
- struct task_struct *tsk = task_of(se);
profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
delta >> 20);
}
+ account_scheduler_latency(tsk, delta >> 10, 0);
}
#endif
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3afbd25f43eb..5418ef61e16e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -81,6 +81,7 @@ extern int compat_log;
extern int maps_protect;
extern int sysctl_stat_interval;
extern int audit_argv_kb;
+extern int latencytop_enabled;
/* Constants used for minimum and maximum */
#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -416,6 +417,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec_taint,
},
#endif
+#ifdef CONFIG_LATENCYTOP
+ {
+ .procname = "latencytop",
+ .data = &latencytop_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
#ifdef CONFIG_SECURITY_CAPABILITIES
{
.procname = "cap-bound",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a60109307d32..14fb355e3caa 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -517,4 +517,18 @@ config FAULT_INJECTION_STACKTRACE_FILTER
help
Provide stacktrace filter for fault-injection capabilities
+config LATENCYTOP
+ bool "Latency measuring infrastructure"
+ select FRAME_POINTER if !MIPS
+ select KALLSYMS
+ select KALLSYMS_ALL
+ select STACKTRACE
+ select SCHEDSTATS
+ select SCHED_DEBUG
+ depends on X86 || X86_64
+ help
+ Enable this option if you want to use the LatencyTOP tool
+ to find out which userspace is blocking on what kernel operations.
+
+
source "samples/Kconfig"
OpenPOWER on IntegriCloud