From 34b753007d646482a4125a7095e1d1986d395f95 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sun, 16 Oct 2016 21:08:02 +0200 Subject: perf bench futex: Cache align the worker struct It popped up in perf testing that the worker consumes some amount of CPU. It boils down to the increment of `ops`, which causes cache line bouncing between the individual threads. This patch aligns the struct to 256 bytes to ensure that no cache line is shared among CPUs. 128 bytes is the x86 worst case and grep says that L1_CACHE_SHIFT is set to 8 on s390. Signed-off-by: Sebastian Andrzej Siewior Cc: Davidlohr Bueso Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20161016190803.3392-1-bigeasy@linutronix.de Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/futex-hash.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools/perf/bench/futex-hash.c') diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index 8024cd5febd2..d9e5e80bb4d0 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -39,12 +39,15 @@ static unsigned int threads_starting; static struct stats throughput_stats; static pthread_cond_t thread_parent, thread_worker; +#define SMP_CACHE_BYTES 256 +#define __cacheline_aligned __attribute__ ((aligned (SMP_CACHE_BYTES))) + struct worker { int tid; u_int32_t *futex; pthread_t thread; unsigned long ops; -}; +} __cacheline_aligned; static const struct option options[] = { OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), -- cgit v1.2.1 From e2e1680fda1573ebfdd6bba5d58f978044746993 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 24 Oct 2016 13:56:52 -0700 Subject: perf bench futex: Avoid worker cacheline bouncing Sebastian noted that overhead for worker thread ops (throughput) accounting was causing 'perf' to appear in the profiles, consuming a non-trivial (i.e. 13%) amount of CPU. This is due to cacheline bouncing caused by the increment of w->ops. 
We can easily fix this by just working on a local copy and updating the actual worker once done running, and ready to show the program summary. There is no danger of the worker being concurrent, so we can trust that no stale value is being seen by another thread. This also gets rid of the unnecessary cache alignment hack; it's not worth it. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Davidlohr Bueso Acked-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/1477342613-9938-2-git-send-email-dave@stgolabs.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/futex-hash.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'tools/perf/bench/futex-hash.c') diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index d9e5e80bb4d0..da04b8c5568a 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -39,15 +39,12 @@ static unsigned int threads_starting; static struct stats throughput_stats; static pthread_cond_t thread_parent, thread_worker; -#define SMP_CACHE_BYTES 256 -#define __cacheline_aligned __attribute__ ((aligned (SMP_CACHE_BYTES))) - struct worker { int tid; u_int32_t *futex; pthread_t thread; unsigned long ops; -} __cacheline_aligned; +}; static const struct option options[] = { OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), @@ -66,8 +63,9 @@ static const char * const bench_futex_hash_usage[] = { static void *workerfn(void *arg) { int ret; - unsigned int i; struct worker *w = (struct worker *) arg; + unsigned int i; + unsigned long ops = w->ops; /* avoid cacheline bouncing */ pthread_mutex_lock(&thread_lock); threads_starting--; @@ -77,7 +75,7 @@ static void *workerfn(void *arg) pthread_mutex_unlock(&thread_lock); do { - for (i = 0; i < nfutexes; i++, w->ops++) { + for (i = 0; i < nfutexes; i++, ops++) { /* * We want the futex calls to fail in order to stress * the hashing of uaddr and not measure other steps, @@ -91,6 +89,7 @@ static void 
*workerfn(void *arg) } } while (!done); + w->ops = ops; return NULL; } -- cgit v1.2.1 From 60758d6668b3e2fa8e5fd143d24d0425203d007e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 24 Oct 2016 13:56:53 -0700 Subject: perf bench futex: Sanitize numeric parameters This gets rid of oddities such as: perf bench futex hash -t -4 perf: calloc: Cannot allocate memory Runtime (and many more) are equally busted, i.e. they run for bogus amounts of time. Just use the absolute value instead of, for example, erroring out. Committer note: After the patch: $ perf bench futex hash -t -4 # Running 'futex/hash' benchmark: Run summary [PID 10178]: 4 threads, each operating on 1024 [private] futexes for 10 secs. [thread 0] futexes: 0x34f9fa0 ... 0x34faf9c [ 4702208 ops/sec ] [thread 1] futexes: 0x34fb140 ... 0x34fc13c [ 4707020 ops/sec ] [thread 2] futexes: 0x34fc2e0 ... 0x34fd2dc [ 4711526 ops/sec ] [thread 3] futexes: 0x34fd480 ... 0x34fe47c [ 4709683 ops/sec ] Averaged 4707609 operations/sec (+- 0.04%), total secs = 10 $ Signed-off-by: Davidlohr Bueso Tested-by: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1477342613-9938-3-git-send-email-dave@stgolabs.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/futex-hash.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools/perf/bench/futex-hash.c') diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index da04b8c5568a..bfbb6b5f609c 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -130,6 +130,8 @@ int bench_futex_hash(int argc, const char **argv, } ncpus = sysconf(_SC_NPROCESSORS_ONLN); + nsecs = futexbench_sanitize_numeric(nsecs); + nfutexes = futexbench_sanitize_numeric(nfutexes); sigfillset(&act.sa_mask); act.sa_sigaction = toggle_done; @@ -137,6 +139,8 @@ int bench_futex_hash(int argc, const char **argv, if (!nthreads) /* default to the number of CPUs */ nthreads = ncpus; + else + nthreads = futexbench_sanitize_numeric(nthreads); worker = 
calloc(nthreads, sizeof(*worker)); if (!worker) -- cgit v1.2.1