From 27d7be1801a4824ecccbc735593101d72c038f13 Mon Sep 17 00:00:00 2001
From: Manfred Spraul
Date: Mon, 27 Feb 2017 14:28:15 -0800
Subject: ipc/sem.c: avoid using spin_unlock_wait()

a) The ACQUIRE in spin_lock() applies to the read, not to the store, at
least for powerpc. This forces us to add an smp_mb() into the fast path.

b) The memory barrier provided by spin_unlock_wait() is right now arch
dependent.

Therefore: Use spin_lock()/spin_unlock() instead of spin_unlock_wait().

Advantage: faster single op semop() calls, observed +8.9% on x86.
(The other solution would be arch dependencies in ipc/sem.)

Disadvantage: slower complex op semop() calls, if (and only if) there are
no sleeping operations.

The next patch adds hysteresis; this further reduces the probability that
the slow path is used.

Link: http://lkml.kernel.org/r/1476851896-3590-2-git-send-email-manfred@colorfullife.com
Signed-off-by: Manfred Spraul
Cc: Peter Zijlstra
Cc: Davidlohr Bueso
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: H. Peter Anvin
Cc: <1vier1@web.de>
Cc: kernel test robot
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 ipc/sem.c | 25 +++----------------------
 1 file changed, 3 insertions(+), 22 deletions(-)

diff --git a/ipc/sem.c b/ipc/sem.c
index 3ec5742b5640..fe5db1ed081b 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -278,24 +278,13 @@ static void complexmode_enter(struct sem_array *sma)
 		return;
 	}
 
-	/* We need a full barrier after seting complex_mode:
-	 * The write to complex_mode must be visible
-	 * before we read the first sem->lock spinlock state.
-	 */
-	smp_store_mb(sma->complex_mode, true);
+	sma->complex_mode = true;
 
 	for (i = 0; i < sma->sem_nsems; i++) {
 		sem = sma->sem_base + i;
-		spin_unlock_wait(&sem->lock);
+		spin_lock(&sem->lock);
+		spin_unlock(&sem->lock);
 	}
-	/*
-	 * spin_unlock_wait() is not a memory barriers, it is only a
-	 * control barrier. The code must pair with spin_unlock(&sem->lock),
-	 * thus just the control barrier is insufficient.
-	 *
-	 * smp_rmb() is sufficient, as writes cannot pass the control barrier.
-	 */
-	smp_rmb();
 }
 
 /*
@@ -361,14 +350,6 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 	 */
 	spin_lock(&sem->lock);
 
-	/*
-	 * See 51d7d5205d33
-	 * ("powerpc: Add smp_mb() to arch_spin_is_locked()"):
-	 * A full barrier is required: the write of sem->lock
-	 * must be visible before the read is executed
-	 */
-	smp_mb();
-
 	if (!smp_load_acquire(&sma->complex_mode)) {
 		/* fast path successful! */
 		return sops->sem_num;
--
cgit v1.2.3
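The pattern that replaces spin_unlock_wait() above, setting the flag and then
taking and immediately releasing every per-semaphore lock, both drains
in-flight simple operations and guarantees that later lockers observe the
flag. The following minimal user-space sketch illustrates the idea with C11
atomics and pthread mutexes standing in for the kernel's spinlocks and
smp_load_acquire(); the names (NSEMS, enter_complex_mode(), simple_op_lock(),
init_locks()) are invented for the example and do not appear in ipc/sem.c.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define NSEMS 16

static pthread_mutex_t sem_lock[NSEMS];	/* per-semaphore locks */
static atomic_bool complex_mode;	/* "no parallel simple ops" */

static void init_locks(void)
{
	int i;

	for (i = 0; i < NSEMS; i++)
		pthread_mutex_init(&sem_lock[i], NULL);
}

/* Called with the array-wide (global) lock held: switch to complex mode. */
static void enter_complex_mode(void)
{
	int i;

	atomic_store(&complex_mode, true);

	/*
	 * Lock+unlock of each per-semaphore lock waits for any simple op
	 * that already holds its lock; a simple op that takes the lock
	 * afterwards synchronizes with our unlock and therefore sees
	 * complex_mode == true in its re-check below.
	 */
	for (i = 0; i < NSEMS; i++) {
		pthread_mutex_lock(&sem_lock[i]);
		pthread_mutex_unlock(&sem_lock[i]);
	}
}

/* Fast path of a simple (single-semaphore) operation. */
static bool simple_op_lock(int semnum)
{
	/* Optimistic check, no ordering needed; re-checked under the lock. */
	if (atomic_load_explicit(&complex_mode, memory_order_relaxed))
		return false;

	pthread_mutex_lock(&sem_lock[semnum]);

	if (!atomic_load_explicit(&complex_mode, memory_order_acquire))
		return true;	/* fast path: caller holds sem_lock[semnum] */

	pthread_mutex_unlock(&sem_lock[semnum]);
	return false;		/* caller must fall back to the global lock */
}

int main(void)
{
	init_locks();

	if (simple_op_lock(0))			/* fast path succeeds ... */
		pthread_mutex_unlock(&sem_lock[0]);

	enter_complex_mode();			/* ... until complex mode is entered */
	return simple_op_lock(0) ? 1 : 0;	/* now the fast path is refused */
}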
From 9de5ab8a2eeea9ae4b63b6f6353b415b93e020c0 Mon Sep 17 00:00:00 2001
From: Manfred Spraul
Date: Mon, 27 Feb 2017 14:28:18 -0800
Subject: ipc/sem: add hysteresis

sysv sem has two lock modes: one with per-semaphore locks, one with a
single global lock for the whole array. When switching from the
per-semaphore locks to the global lock, all per-semaphore locks must be
scanned for ongoing operations.

The patch adds a hysteresis for switching from the global lock to the
per-semaphore locks. This reduces how often the per-semaphore locks must
be scanned.

Compared to the initial patch, this is a simplified solution: Setting
USE_GLOBAL_LOCK_HYSTERESIS to 1 restores the current behavior.

In theory, a workload with exactly 10 simple sops and then one complex op
now scales a bit worse, but this is pure theory: If there is concurrency,
then it won't be exactly 10:1:10:1:10:1:... If there is no concurrency,
then there is no need for scalability.

Link: http://lkml.kernel.org/r/1476851896-3590-3-git-send-email-manfred@colorfullife.com
Signed-off-by: Manfred Spraul
Cc: Peter Zijlstra
Cc: Davidlohr Bueso
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: H. Peter Anvin
Cc: <1vier1@web.de>
Cc: kernel test robot
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sem.h |  2 +-
 ipc/sem.c           | 86 +++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 62 insertions(+), 26 deletions(-)

diff --git a/include/linux/sem.h b/include/linux/sem.h
index d0efd6e6c20a..4fc222f8755d 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -21,7 +21,7 @@ struct sem_array {
 	struct list_head	list_id;	/* undo requests on this array */
 	int			sem_nsems;	/* no. of semaphores in array */
 	int			complex_count;	/* pending complex operations */
-	bool			complex_mode;	/* no parallel simple ops */
+	unsigned int		use_global_lock;/* >0: global lock required */
 };
 
 #ifdef CONFIG_SYSVIPC
diff --git a/ipc/sem.c b/ipc/sem.c
index fe5db1ed081b..e468cd1c12f0 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -158,23 +158,43 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 #define SEMMSL_FAST	256 /* 512 bytes on stack */
 #define SEMOPM_FAST	64  /* ~ 372 bytes on stack */
 
+/*
+ * Switching from the mode suitable for simple ops
+ * to the mode for complex ops is costly. Therefore:
+ * use some hysteresis
+ */
+#define USE_GLOBAL_LOCK_HYSTERESIS	10
+
 /*
  * Locking:
  * a) global sem_lock() for read/write
  *	sem_undo.id_next,
  *	sem_array.complex_count,
- *	sem_array.complex_mode
 *	sem_array.pending{_alter,_const},
 *	sem_array.sem_undo
 *
 * b) global or semaphore sem_lock() for read/write:
 *	sem_array.sem_base[i].pending_{const,alter}:
- *	sem_array.complex_mode (for read)
 *
 * c) special:
 *	sem_undo_list.list_proc:
 *	* undo_list->lock for write
 *	* rcu for read
+ *	use_global_lock:
+ *	* global sem_lock() for write
+ *	* either local or global sem_lock() for read.
+ *
+ * Memory ordering:
+ * Most ordering is enforced by using spin_lock() and spin_unlock().
+ * The special case is use_global_lock:
+ * Setting it from non-zero to 0 is a RELEASE, this is ensured by
+ * using smp_store_release().
+ * Testing if it is non-zero is an ACQUIRE, this is ensured by using
+ * smp_load_acquire().
+ * Setting it from 0 to non-zero must be ordered with regards to
+ * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
+ * is inside a spin_lock() and after a write from 0 to non-zero a
+ * spin_lock()+spin_unlock() is done.
 */
 
 #define sc_semmsl	sem_ctls[0]
@@ -273,12 +293,16 @@ static void complexmode_enter(struct sem_array *sma)
 	int i;
 	struct sem *sem;
 
-	if (sma->complex_mode)  {
-		/* We are already in complex_mode. Nothing to do */
+	if (sma->use_global_lock > 0)  {
+		/*
+		 * We are already in global lock mode.
+		 * Nothing to do, just reset the
+		 * counter until we return to simple mode.
+		 */
+		sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 		return;
 	}
-
-	sma->complex_mode = true;
+	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 
 	for (i = 0; i < sma->sem_nsems; i++) {
 		sem = sma->sem_base + i;
@@ -299,13 +323,17 @@ static void complexmode_tryleave(struct sem_array *sma)
 		 */
 		return;
 	}
-	/*
-	 * Immediately after setting complex_mode to false,
-	 * a simple op can start. Thus: all memory writes
-	 * performed by the current operation must be visible
-	 * before we set complex_mode to false.
-	 */
-	smp_store_release(&sma->complex_mode, false);
+	if (sma->use_global_lock == 1) {
+		/*
+		 * Immediately after setting use_global_lock to 0,
+		 * a simple op can start. Thus: all memory writes
+		 * performed by the current operation must be visible
+		 * before we set use_global_lock to 0.
+		 */
+		smp_store_release(&sma->use_global_lock, 0);
+	} else {
+		sma->use_global_lock--;
+	}
 }
 
 #define SEM_GLOBAL_LOCK	(-1)
@@ -335,22 +363,23 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 	 * Optimized locking is possible if no complex operation
 	 * is either enqueued or processed right now.
 	 *
-	 * Both facts are tracked by complex_mode.
+	 * Both facts are tracked by use_global_mode.
 	 */
 	sem = sma->sem_base + sops->sem_num;
 
 	/*
-	 * Initial check for complex_mode. Just an optimization,
+	 * Initial check for use_global_lock. Just an optimization,
 	 * no locking, no memory barrier.
 	 */
-	if (!sma->complex_mode) {
+	if (!sma->use_global_lock) {
 		/*
 		 * It appears that no complex operation is around.
 		 * Acquire the per-semaphore lock.
 		 */
 		spin_lock(&sem->lock);
 
-		if (!smp_load_acquire(&sma->complex_mode)) {
+		/* pairs with smp_store_release() */
+		if (!smp_load_acquire(&sma->use_global_lock)) {
 			/* fast path successful! */
 			return sops->sem_num;
 		}
@@ -360,19 +389,26 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 	/* slow path: acquire the full lock */
 	ipc_lock_object(&sma->sem_perm);
 
-	if (sma->complex_count == 0) {
-		/* False alarm:
-		 * There is no complex operation, thus we can switch
-		 * back to the fast path.
+	if (sma->use_global_lock == 0) {
+		/*
+		 * The use_global_lock mode ended while we waited for
+		 * sma->sem_perm.lock. Thus we must switch to locking
+		 * with sem->lock.
+		 * Unlike in the fast path, there is no need to recheck
+		 * sma->use_global_lock after we have acquired sem->lock:
+		 * We own sma->sem_perm.lock, thus use_global_lock cannot
+		 * change.
		 */
 		spin_lock(&sem->lock);
+
 		ipc_unlock_object(&sma->sem_perm);
 		return sops->sem_num;
 	} else {
-		/* Not a false alarm, thus complete the sequence for a
-		 * full lock.
+		/*
+		 * Not a false alarm, thus continue to use the global lock
+		 * mode. No need for complexmode_enter(), this was done by
+		 * the caller that has set use_global_mode to non-zero.
 		 */
-		complexmode_enter(sma);
 		return SEM_GLOBAL_LOCK;
 	}
 }
@@ -476,7 +512,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	}
 
 	sma->complex_count = 0;
-	sma->complex_mode = true; /* dropped by sem_unlock below */
+	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 	INIT_LIST_HEAD(&sma->pending_alter);
 	INIT_LIST_HEAD(&sma->pending_const);
 	INIT_LIST_HEAD(&sma->list_id);
--
cgit v1.2.3
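The new "Memory ordering" comment boils down to one release/acquire pair: the
1 -> 0 transition of use_global_lock is a release store, the re-check under
the per-semaphore lock is an acquire load, and the hysteresis counter itself
only changes under the global lock. Below is a compact user-space sketch of
that part, continuing the previous example; the struct layout and the helper
names complexmode_tryleave() and try_fast_path() are illustrative and are not
the kernel code (entering global mode would work as in the earlier sketch,
plus re-arming the counter).

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define USE_GLOBAL_LOCK_HYSTERESIS 10
#define NSEMS 16

struct sem_array {
	pthread_mutex_t sem_lock[NSEMS];	/* per-semaphore locks */
	atomic_uint use_global_lock;		/* >0: global lock required */
};

/* Called just before the global lock is dropped. */
void complexmode_tryleave(struct sem_array *sma)
{
	unsigned int v = atomic_load_explicit(&sma->use_global_lock,
					      memory_order_relaxed);

	if (v == 1) {
		/*
		 * RELEASE: everything written while the global lock was
		 * required must be visible before a simple op is allowed
		 * to take the fast path again.
		 */
		atomic_store_explicit(&sma->use_global_lock, 0,
				      memory_order_release);
	} else if (v > 1) {
		/* Hysteresis: stay in global mode for a few more rounds. */
		atomic_store_explicit(&sma->use_global_lock, v - 1,
				      memory_order_relaxed);
	}
}

/* Fast path of a simple op; returns true with sem_lock[semnum] held. */
bool try_fast_path(struct sem_array *sma, int semnum)
{
	/* Unlocked pre-check, just an optimization. */
	if (atomic_load_explicit(&sma->use_global_lock, memory_order_relaxed))
		return false;

	pthread_mutex_lock(&sma->sem_lock[semnum]);

	/* ACQUIRE: pairs with the release store in complexmode_tryleave(). */
	if (!atomic_load_explicit(&sma->use_global_lock, memory_order_acquire))
		return true;

	pthread_mutex_unlock(&sma->sem_lock[semnum]);
	return false;	/* still (or again) in global-lock mode */
}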
From 84f001e15737f8214b0f5f0f7dfec0fb1027938f Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 1 Feb 2017 16:36:40 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/wake_q.h>

We are going to split <linux/sched/wake_q.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/wake_q.h> file that just maps to
<linux/sched.h> to make this patch obviously correct and bisectable.

Include the new header in the files that are going to need it.
Acked-by: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 include/linux/sched/wake_q.h    | 6 ++++++
 ipc/mqueue.c                    | 1 +
 ipc/msg.c                       | 1 +
 ipc/sem.c                       | 1 +
 kernel/futex.c                  | 1 +
 kernel/locking/mutex.c          | 1 +
 kernel/locking/rtmutex.c        | 1 +
 kernel/locking/rtmutex_common.h | 1 +
 kernel/locking/rwsem-xadd.c     | 3 ++-
 kernel/sched/sched.h            | 1 +
 10 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/wake_q.h

diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
new file mode 100644
index 000000000000..6383b35e4eba
--- /dev/null
+++ b/include/linux/sched/wake_q.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_WAKE_Q_H
+#define _LINUX_SCHED_WAKE_Q_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_WAKE_Q_H */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 4fdd97031431..40e448de8a7e 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include <linux/sched/wake_q.h>
 
 #include
 #include "util.h"
diff --git a/ipc/msg.c b/ipc/msg.c
index e3e52ce01123..ecc387e573f6 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include <linux/sched/wake_q.h>
 #include
 #include
 #include
diff --git a/ipc/sem.c b/ipc/sem.c
index e468cd1c12f0..947dc2348271 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -82,6 +82,7 @@
 #include
 #include
 #include
+#include <linux/sched/wake_q.h>
 
 #include
 #include "util.h"
diff --git a/kernel/futex.c b/kernel/futex.c
index b687cb22301c..8ddcf9ea953c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,7 @@
 #include
 #include
 #include
+#include <linux/sched/wake_q.h>
 #include
 #include
 #include
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ad2d9e22697b..57f6311e2405 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <linux/sched/wake_q.h>
 #include
 #include
 #include
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index d340be3a488f..d4f798491361 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include <linux/sched/wake_q.h>
 #include
 
 #include "rtmutex_common.h"
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 990134617b4c..856dfff5c33a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -13,6 +13,7 @@
 #define __KERNEL_RTMUTEX_COMMON_H
 
 #include
+#include <linux/sched/wake_q.h>
 
 /*
  * This is the control structure for tasks blocked on a rt_mutex,
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2ad8d8dc3bb1..4fe8d8ad4396 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -10,10 +10,11 @@
  * and Davidlohr Bueso . Based on mutexes.
  */
 #include
-#include
 #include
 #include
+#include
 #include
+#include
 #include
 
 #include "rwsem.h"
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9b9cb260d9cf..683570739f46 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include <linux/sched/wake_q.h>
 #include
 #include
 #include
--
cgit v1.2.3
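The placeholder trick described in the changelog, introducing the new header
as a one-line forwarder to the old one and converting the include sites
first, is a general way to split a large header while keeping every
intermediate commit buildable and bisectable. A generic sketch of the
sequence follows; the file names big_header.h/widget.h and the declaration
are invented for illustration and are not kernel code.

/* widget.h, step 1: a pure placeholder that just maps to the old header. */
#ifndef MYLIB_WIDGET_H
#define MYLIB_WIDGET_H

#include "big_header.h"		/* everything still lives here */

#endif /* MYLIB_WIDGET_H */

/* user.c, step 2: users of the widget API switch their include first.
 * This compiles unchanged, because widget.h pulls in big_header.h.
 */
#include "widget.h"

/* Step 3, a later patch: move the widget declarations (e.g.
 * void widget_init(struct widget *w);) from big_header.h into widget.h
 * and drop the forwarding #include. Every user already includes the
 * right header, so each step builds and bisects cleanly.
 */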