26 files changed, 746 insertions, 735 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
 }
 late_initcall(register_array_map);
 
-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
 {
-	/* only bpf_prog file descriptors can be stored in prog_array map */
+	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
 		return ERR_PTR(-EINVAL);
 	return array_map_alloc(attr);
 }
 
-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
 
 	/* make sure it's empty */
 	for (i = 0; i < array->map.max_entries; i++)
-		BUG_ON(array->prog[i] != NULL);
+		BUG_ON(array->ptrs[i] != NULL);
 	kvfree(array);
 }
 
-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	return NULL;
 }
 
 /* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
-				      void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+				    void *value, u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct bpf_prog *prog, *old_prog;
+	void *new_ptr, *old_ptr;
 	u32 index = *(u32 *)key, ufd;
 
 	if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
 		return -E2BIG;
 
 	ufd = *(u32 *)value;
-	prog = bpf_prog_get(ufd);
-	if (IS_ERR(prog))
-		return PTR_ERR(prog);
-
-	if (!bpf_prog_array_compatible(array, prog)) {
-		bpf_prog_put(prog);
-		return -EINVAL;
-	}
+	new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+	if (IS_ERR(new_ptr))
+		return PTR_ERR(new_ptr);
 
-	old_prog = xchg(array->prog + index, prog);
-	if (old_prog)
-		bpf_prog_put_rcu(old_prog);
+	old_ptr = xchg(array->ptrs + index, new_ptr);
+	if (old_ptr)
+		map->ops->map_fd_put_ptr(old_ptr);
 
 	return 0;
 }
 
-static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct bpf_prog *old_prog;
+	void *old_ptr;
 	u32 index = *(u32 *)key;
 
 	if (index >= array->map.max_entries)
 		return -E2BIG;
 
-	old_prog = xchg(array->prog + index, NULL);
-	if (old_prog) {
-		bpf_prog_put_rcu(old_prog);
+	old_ptr = xchg(array->ptrs + index, NULL);
+	if (old_ptr) {
+		map->ops->map_fd_put_ptr(old_ptr);
 		return 0;
 	} else {
 		return -ENOENT;
 	}
 }
 
+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog = bpf_prog_get(fd);
+	if (IS_ERR(prog))
+		return prog;
+
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return ERR_PTR(-EINVAL);
+	}
+	return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+	struct bpf_prog *prog = ptr;
+
+	bpf_prog_put_rcu(prog);
+}
+
 /* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_prog_array_map_clear(struct bpf_map *map)
+void bpf_fd_array_map_clear(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
 	for (i = 0; i < array->map.max_entries; i++)
-		prog_array_map_delete_elem(map, &i);
+		fd_array_map_delete_elem(map, &i);
 }
 
 static const struct bpf_map_ops prog_array_ops = {
-	.map_alloc = prog_array_map_alloc,
-	.map_free = prog_array_map_free,
+	.map_alloc = fd_array_map_alloc,
+	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
-	.map_lookup_elem = prog_array_map_lookup_elem,
-	.map_update_elem = prog_array_map_update_elem,
-	.map_delete_elem = prog_array_map_delete_elem,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_update_elem = fd_array_map_update_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = prog_fd_array_get_ptr,
+	.map_fd_put_ptr = prog_fd_array_put_ptr,
 };
 
 static struct bpf_map_type_list prog_array_type __read_mostly = {
@@ -255,3 +273,60 @@ static int __init register_prog_array_map(void)
 	return 0;
 }
 late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+	bpf_fd_array_map_clear(map);
+	fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+	struct perf_event *event;
+	const struct perf_event_attr *attr;
+
+	event = perf_event_get(fd);
+	if (IS_ERR(event))
+		return event;
+
+	attr = perf_event_attrs(event);
+	if (IS_ERR(attr))
+		return (void *)attr;
+
+	if (attr->type != PERF_TYPE_RAW &&
+	    attr->type != PERF_TYPE_HARDWARE) {
+		perf_event_release_kernel(event);
+		return ERR_PTR(-EINVAL);
+	}
+	return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+	struct perf_event *event = ptr;
+
+	perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+	.map_alloc = fd_array_map_alloc,
+	.map_free = perf_event_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_update_elem = fd_array_map_update_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
+	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+	.ops = &perf_event_array_ops,
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+	bpf_register_map_type(&perf_event_array_type);
+	return 0;
+}
+late_initcall(register_perf_event_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c5bedc82bc1c..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__bpf_call_base);
 
 /**
  *	__bpf_prog_run - run eBPF program on a given context
@@ -449,11 +450,15 @@ select_insn:
 
 		tail_call_cnt++;
 
-		prog = READ_ONCE(array->prog[index]);
+		prog = READ_ONCE(array->ptrs[index]);
 		if (unlikely(!prog))
 			goto out;
 
-		ARG1 = BPF_R1;
+		/* ARG1 at this point is guaranteed to point to CTX from
+		 * the verifier side due to the fact that the tail call is
+		 * handeled like a helper, that is, bpf_tail_call_proto,
+		 * where arg1_type is ARG_PTR_TO_CTX.
+		 */
 		insn = prog->insnsi;
 		goto select_insn;
 out:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..dc9b464fefa9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 		/* prog_array stores refcnt-ed bpf_prog pointers
 		 * release them all when user space closes prog_array_fd
 		 */
-		bpf_prog_array_map_clear(map);
+		bpf_fd_array_map_clear(map);
 
 	bpf_map_put(map);
 	return 0;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866fd36a..ed12e385fb75 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
 	[CONST_IMM]		= "imm",
 };
 
+static const struct {
+	int map_type;
+	int func_id;
+} func_limit[] = {
+	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
 static void print_verifier_state(struct verifier_env *env)
 {
 	enum bpf_reg_type t;
@@ -648,6 +656,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 	struct verifier_state *state = &env->cur_state;
 	int size, err = 0;
 
+	if (state->regs[regno].type == PTR_TO_STACK)
+		off += state->regs[regno].imm;
+
 	size = bpf_size_to_bytes(bpf_size);
 	if (size < 0)
 		return size;
@@ -667,7 +678,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
 
-	} else if (state->regs[regno].type == FRAME_PTR) {
+	} else if (state->regs[regno].type == FRAME_PTR ||
+		   state->regs[regno].type == PTR_TO_STACK) {
 		if (off >= 0 || off < -MAX_BPF_STACK) {
 			verbose("invalid stack off=%d size=%d\n", off, size);
 			return -EACCES;
@@ -833,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 	return err;
 }
 
+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+	bool bool_map, bool_func;
+	int i;
+
+	if (!map)
+		return 0;
+
+	for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
+		bool_map = (map->map_type == func_limit[i].map_type);
+		bool_func = (func_id == func_limit[i].func_id);
+		/* only when map & func pair match it can continue.
+		 * don't allow any other map type to be passed into
+		 * the special func;
+		 */
+		if (bool_map != bool_func)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -908,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id)
 		return -EINVAL;
 	}
 
-	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
-	    func_id != BPF_FUNC_tail_call)
-		/* prog_array map type needs extra care:
-		 * only allow to pass it into bpf_tail_call() for now.
-		 * bpf_map_delete_elem() can be allowed in the future,
-		 * while bpf_map_update_elem() must only be done via syscall
-		 */
-		return -EINVAL;
-
-	if (func_id == BPF_FUNC_tail_call &&
-	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
-		/* don't allow any other map type to be passed into
-		 * bpf_tail_call()
-		 */
-		return -EINVAL;
+	err = check_map_func_compatibility(map, func_id);
+	if (err)
+		return err;
 
 	return 0;
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f3f5cd5e2c0d..a8538e443784 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
 	if (root != &cgrp_dfl_root)
 		for_each_subsys(ss, ssid)
 			if (root->subsys_mask & (1 << ssid))
-				seq_printf(seq, ",%s", ss->legacy_name);
+				seq_show_option(seq, ss->name, NULL);
 	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
 	if (root->flags & CGRP_ROOT_XATTR)
@@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
 
 	spin_lock(&release_agent_path_lock);
 	if (strlen(root->release_agent_path))
-		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+		seq_show_option(seq, "release_agent",
+				root->release_agent_path);
 	spin_unlock(&release_agent_path_lock);
 
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
-		seq_printf(seq, ",name=%s", root->name);
+		seq_show_option(seq, "name", root->name);
 	return 0;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ae16867670a9..e8183895691c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3222,6 +3222,59 @@ static inline u64 perf_event_count(struct perf_event *event)
 	return __perf_event_count(event);
 }
 
+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ *   - either for the current task, or for this CPU
+ *   - does not have inherit set, for inherited task events
+ *     will not be local and we cannot read them atomically
+ *   - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+	unsigned long flags;
+	u64 val;
+
+	/*
+	 * Disabling interrupts avoids all counter scheduling (context
+	 * switches, timer based rotation and IPIs).
+	 */
+	local_irq_save(flags);
+
+	/* If this is a per-task event, it must be for current */
+	WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+		     event->hw.target != current);
+
+	/* If this is a per-CPU event, it must be for this CPU */
+	WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+		     event->cpu != smp_processor_id());
+
+	/*
+	 * It must not be an event with inherit set, we cannot read
+	 * all child counters from atomic context.
+	 */
+	WARN_ON_ONCE(event->attr.inherit);
+
+	/*
+	 * It must not have a pmu::count method, those are not
+	 * NMI safe.
+	 */
+	WARN_ON_ONCE(event->pmu->count);
+
+	/*
+	 * If the event is currently on this CPU, its either a per-task event,
+	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
+	 * oncpu == -1).
+	 */
+	if (event->oncpu == smp_processor_id())
+		event->pmu->read(event);
+
+	val = local64_read(&event->count);
+	local_irq_restore(flags);
+
+	return val;
+}
+
 static u64 perf_event_read(struct perf_event *event)
 {
 	/*
@@ -8718,6 +8771,31 @@ void perf_event_delayed_put(struct task_struct *task)
 		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
 }
 
+struct perf_event *perf_event_get(unsigned int fd)
+{
+	int err;
+	struct fd f;
+	struct perf_event *event;
+
+	err = perf_fget_light(fd, &f);
+	if (err)
+		return ERR_PTR(err);
+
+	event = f.file->private_data;
+	atomic_long_inc(&event->refcount);
+	fdput(f);
+
+	return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+	if (!event)
+		return ERR_PTR(-EINVAL);
+
+	return &event->attr;
+}
+
 /*
  * inherit a event from parent task to child task:
  */
diff --git a/kernel/fork.c b/kernel/fork.c
index 03aa2e6de7a4..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		tmp->vm_mm = mm;
 		if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &= ~VM_LOCKED;
+		tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
 		tmp->vm_next = tmp->vm_prev = NULL;
+		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		file = tmp->vm_file;
 		if (file) {
 			struct inode *inode = file_inode(file);
diff --git a/kernel/futex.c b/kernel/futex.c
index c4a182f5357e..6e443efc65f4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -64,6 +64,7 @@
 #include <linux/hugetlb.h>
 #include <linux/freezer.h>
 #include <linux/bootmem.h>
+#include <linux/fault-inject.h>
 
 #include <asm/futex.h>
 
@@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize;
 
 static struct futex_hash_bucket *futex_queues;
 
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+	struct fault_attr attr;
+
+	u32 ignore_private;
+} fail_futex = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_private = 0,
+};
+
+static int __init setup_fail_futex(char *str)
+{
+	return setup_fault_attr(&fail_futex.attr, str);
+}
+__setup("fail_futex=", setup_fail_futex);
+
+static bool should_fail_futex(bool fshared)
+{
+	if (fail_futex.ignore_private && !fshared)
+		return false;
+
+	return should_fail(&fail_futex.attr, 1);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)
+{
+	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *dir;
+
+	dir = fault_create_debugfs_attr("fail_futex", NULL,
+					&fail_futex.attr);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
+
+	if (!debugfs_create_bool("ignore-private", mode, dir,
+				 &fail_futex.ignore_private)) {
+		debugfs_remove_recursive(dir);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else
+static inline bool should_fail_futex(bool fshared)
+{
+	return false;
+}
+#endif /* CONFIG_FAIL_FUTEX */
+
 static inline void futex_get_mm(union futex_key *key)
 {
 	atomic_inc(&key->private.mm->mm_count);
@@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 	if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
 		return -EFAULT;
 
+	if (unlikely(should_fail_futex(fshared)))
+		return -EFAULT;
+
 	/*
 	 * PROCESS_PRIVATE futexes are fast.
 	 * As the mm cannot disappear under us and the 'key' only needs
@@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 	}
 
 again:
+	/* Ignore any VERIFY_READ mapping (futex common case) */
+	if (unlikely(should_fail_futex(fshared)))
+		return -EFAULT;
+
 	err = get_user_pages_fast(address, 1, 1, &page);
 	/*
 	 * If write access is not required (eg. FUTEX_WAIT), try
@@ -516,7 +584,7 @@ again:
 		 * A RO anonymous page will never change and thus doesn't make
 		 * sense for futex operations.
 		 */
-		if (ro) {
+		if (unlikely(should_fail_futex(fshared)) || ro) {
 			err = -EFAULT;
 			goto out;
 		}
@@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
 {
 	u32 uninitialized_var(curval);
 
+	if (unlikely(should_fail_futex(true)))
+		return -EFAULT;
+
 	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
 		return -EFAULT;
 
@@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 	if (get_futex_value_locked(&uval, uaddr))
 		return -EFAULT;
 
+	if (unlikely(should_fail_futex(true)))
+		return -EFAULT;
+
 	/*
 	 * Detect deadlocks.
 	 */
 	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
 		return -EDEADLK;
 
+	if ((unlikely(should_fail_futex(true))))
+		return -EDEADLK;
+
 	/*
 	 * Lookup existing state first. If it exists, try to attach to
 	 * its pi_state.
@@ -1155,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 	 */
 	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
+	if (unlikely(should_fail_futex(true)))
+		ret = -EFAULT;
+
 	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
 		ret = -EFAULT;
 	else if (curval != uval)
@@ -1457,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 	if (get_futex_value_locked(&curval, pifutex))
 		return -EFAULT;
 
+	if (unlikely(should_fail_futex(true)))
+		return -EFAULT;
+
 	/*
 	 * Find the top_waiter and determine if there are additional waiters.
 	 * If the caller intends to requeue more than 1 waiter to pifutex,
@@ -2268,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart)
 /*
  * Userspace tried a 0 -> TID atomic transition of the futex value
  * and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block, it does PI, etc. (Due to
- * races the kernel might see a 0 value of the futex too.)
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.).
+ *
+ * Also serves as futex trylock_pi()'ing, and due semantics.
  */
 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 			 ktime_t *time, int trylock)
@@ -2300,6 +2386,10 @@ retry_private:
 
 	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
 	if (unlikely(ret)) {
+		/*
+		 * Atomic work succeeded and we got the lock,
+		 * or failed. Either way, we do _not_ block.
+		 */
 		switch (ret) {
 		case 1:
 			/* We got the lock. */
@@ -2530,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
  * @uaddr:	the futex we initially wait on (non-pi)
  * @flags:	futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- * 		the same type, no requeueing from private to shared, etc.
+ *		the same type, no requeueing from private to shared, etc.
  * @val:	the expected value of uaddr
  * @abs_time:	absolute timeout
  * @bitset:	32 bit wakeup bitset set by userspace, defaults to all
@@ -3005,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
 		      cmd == FUTEX_WAIT_BITSET ||
 		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
+		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+			return -EFAULT;
 		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&ts))
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 52ebaca1b9fc..f7dd15d537f9 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
 	sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
 }
 
-static void jump_label_update(struct static_key *key, int enable);
+static void jump_label_update(struct static_key *key);
 
 void static_key_slow_inc(struct static_key *key)
 {
@@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key)
 		return;
 
 	jump_label_lock();
-	if (atomic_read(&key->enabled) == 0) {
-		if (!jump_label_get_branch_default(key))
-			jump_label_update(key, JUMP_LABEL_ENABLE);
-		else
-			jump_label_update(key, JUMP_LABEL_DISABLE);
-	}
-	atomic_inc(&key->enabled);
+	if (atomic_inc_return(&key->enabled) == 1)
+		jump_label_update(key);
 	jump_label_unlock();
 }
 EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key,
 		atomic_inc(&key->enabled);
 		schedule_delayed_work(work, rate_limit);
 	} else {
-		if (!jump_label_get_branch_default(key))
-			jump_label_update(key, JUMP_LABEL_DISABLE);
-		else
-			jump_label_update(key, JUMP_LABEL_ENABLE);
+		jump_label_update(key);
 	}
 	jump_label_unlock();
 }
@@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
 	return 0;
 }
 
-/* 
+/*
  * Update code which is definitely not currently executing.
  * Architectures which need heavyweight synchronization to modify
  * running code can override this to make the non-live update case
@@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
 void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
 					    enum jump_label_type type)
 {
-	arch_jump_label_transform(entry, type);	
+	arch_jump_label_transform(entry, type);
+}
+
+static inline struct jump_entry *static_key_entries(struct static_key *key)
+{
+	return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+}
+
+static inline bool static_key_type(struct static_key *key)
+{
+	return (unsigned long)key->entries & JUMP_TYPE_MASK;
+}
+
+static inline struct static_key *jump_entry_key(struct jump_entry *entry)
+{
+	return (struct static_key *)((unsigned long)entry->key & ~1UL);
+}
+
+static bool jump_entry_branch(struct jump_entry *entry)
+{
+	return (unsigned long)entry->key & 1UL;
+}
+
+static enum jump_label_type jump_label_type(struct jump_entry *entry)
+{
+	struct static_key *key = jump_entry_key(entry);
+	bool enabled = static_key_enabled(key);
+	bool branch = jump_entry_branch(entry);
+
+	/* See the comment in linux/jump_label.h */
+	return enabled ^ branch;
 }
 
 static void __jump_label_update(struct static_key *key,
 				struct jump_entry *entry,
-				struct jump_entry *stop, int enable)
+				struct jump_entry *stop)
 {
-	for (; (entry < stop) &&
-	      (entry->key == (jump_label_t)(unsigned long)key);
-	      entry++) {
+	for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
 		/*
 		 * entry->code set to 0 invalidates module init text sections
 		 * kernel_text_address() verifies we are not in core kernel
 		 * init code, see jump_label_invalidate_module_init().
 		 */
 		if (entry->code && kernel_text_address(entry->code))
-			arch_jump_label_transform(entry, enable);
+			arch_jump_label_transform(entry, jump_label_type(entry));
 	}
 }
 
-static enum jump_label_type jump_label_type(struct static_key *key)
-{
-	bool true_branch = jump_label_get_branch_default(key);
-	bool state = static_key_enabled(key);
-
-	if ((!true_branch && state) || (true_branch && !state))
-		return JUMP_LABEL_ENABLE;
-
-	return JUMP_LABEL_DISABLE;
-}
-
 void __init jump_label_init(void)
 {
 	struct jump_entry *iter_start = __start___jump_table;
@@ -202,8 +211,11 @@ void __init jump_label_init(void)
 	for (iter = iter_start; iter < iter_stop; iter++) {
 		struct static_key *iterk;
 
-		iterk = (struct static_key *)(unsigned long)iter->key;
-		arch_jump_label_transform_static(iter, jump_label_type(iterk));
+		/* rewrite NOPs */
+		if (jump_label_type(iter) == JUMP_LABEL_NOP)
+			arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
+
+		iterk = jump_entry_key(iter);
 		if (iterk == key)
 			continue;
 
@@ -222,6 +234,16 @@ void __init jump_label_init(void)
 
 #ifdef CONFIG_MODULES
 
+static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+{
+	struct static_key *key = jump_entry_key(entry);
+	bool type = static_key_type(key);
+	bool branch = jump_entry_branch(entry);
+
+	/* See the comment in linux/jump_label.h */
+	return type ^ branch;
+}
+
 struct static_key_mod {
 	struct static_key_mod *next;
 	struct jump_entry *entries;
@@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
 				start, end);
 }
 
-static void __jump_label_mod_update(struct static_key *key, int enable)
+static void __jump_label_mod_update(struct static_key *key)
 {
-	struct static_key_mod *mod = key->next;
+	struct static_key_mod *mod;
 
-	while (mod) {
+	for (mod = key->next; mod; mod = mod->next) {
 		struct module *m = mod->mod;
 
 		__jump_label_update(key, mod->entries,
-				    m->jump_entries + m->num_jump_entries,
-				    enable);
-		mod = mod->next;
+				    m->jump_entries + m->num_jump_entries);
 	}
 }
 
@@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod)
 		return;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+		/* Only write NOPs for arch_branch_static(). */
+		if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
+			arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
 	}
 }
 
@@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod)
 	for (iter = iter_start; iter < iter_stop; iter++) {
 		struct static_key *iterk;
 
-		iterk = (struct static_key *)(unsigned long)iter->key;
+		iterk = jump_entry_key(iter);
 		if (iterk == key)
 			continue;
 
@@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod)
 		jlm->next = key->next;
 		key->next = jlm;
 
-		if (jump_label_type(key) == JUMP_LABEL_ENABLE)
-			__jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
+		/* Only update if we've changed from our initial state */
+		if (jump_label_type(iter) != jump_label_init_type(iter))
+			__jump_label_update(key, iter, iter_stop);
 	}
 
 	return 0;
@@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod)
 	struct static_key_mod *jlm, **prev;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		if (iter->key == (jump_label_t)(unsigned long)key)
+		if (jump_entry_key(iter) == key)
 			continue;
 
-		key = (struct static_key *)(unsigned long)iter->key;
+		key = jump_entry_key(iter);
 
 		if (within_module(iter->key, mod))
 			continue;
@@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end)
 	return ret;
 }
 
-static void jump_label_update(struct static_key *key, int enable)
+static void jump_label_update(struct static_key *key)
 {
 	struct jump_entry *stop = __stop___jump_table;
-	struct jump_entry *entry = jump_label_get_entries(key);
+	struct jump_entry *entry = static_key_entries(key);
 #ifdef CONFIG_MODULES
 	struct module *mod;
 
-	__jump_label_mod_update(key, enable);
+	__jump_label_mod_update(key);
 
 	preempt_disable();
 	mod = __module_address((unsigned long)key);
@@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable)
 #endif
 	/* if there are no users, entry can be NULL */
 	if (entry)
-		__jump_label_update(key, entry, stop, enable);
+		__jump_label_update(key, entry, stop);
 }
 
-#endif
+#ifdef CONFIG_STATIC_KEYS_SELFTEST
+static DEFINE_STATIC_KEY_TRUE(sk_true);
+static DEFINE_STATIC_KEY_FALSE(sk_false);
+
+static __init int jump_label_test(void)
+{
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		WARN_ON(static_key_enabled(&sk_true.key) != true);
+		WARN_ON(static_key_enabled(&sk_false.key) != false);
+
+		WARN_ON(!static_branch_likely(&sk_true));
+		WARN_ON(!static_branch_unlikely(&sk_true));
+		WARN_ON(static_branch_likely(&sk_false));
+		WARN_ON(static_branch_unlikely(&sk_false));
+
+		static_branch_disable(&sk_true);
+		static_branch_enable(&sk_false);
+
+		WARN_ON(static_key_enabled(&sk_true.key) == true);
+		WARN_ON(static_key_enabled(&sk_false.key) == false);
+
+		WARN_ON(static_branch_likely(&sk_true));
+		WARN_ON(static_branch_unlikely(&sk_true));
+		WARN_ON(!static_branch_likely(&sk_false));
+		WARN_ON(!static_branch_unlikely(&sk_false));
+
+		static_branch_enable(&sk_true);
+		static_branch_disable(&sk_false);
+	}
+
+	return 0;
+}
+late_initcall(jump_label_test);
+#endif /* STATIC_KEYS_SELFTEST */
+
+#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490924cc9e7c..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
  * kthread_create_on_node - create a kthread.
  * @threadfn: the function to run until signal_pending(current).
  * @data: data ptr for @threadfn.
- * @node: memory node number.
+ * @node: task and thread structures for the thread are allocated on this node
  * @namefmt: printf-style name for the thread.
  *
  * Description: This helper function creates and names a kernel
  * thread.  The thread will be stopped: use wake_up_process() to start
- * it.  See also kthread_run().
+ * it.  See also kthread_run().  The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
  *
  * If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
  * When woken, the thread will run @threadfn() with @data as its
  * argument. @threadfn() can either call do_exit() directly if it is a
  * standalone thread for which no one will call kthread_stop(), or
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 7dd5c9918e4c..36942047ffc0 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 6c5da483966b..f17a3e3b3550 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
 {
 	while ((cnts & _QW_WMASK) == _QW_LOCKED) {
 		cpu_relax_lowlatency();
-		cnts = smp_load_acquire((u32 *)&lock->cnts);
+		cnts = atomic_read_acquire(&lock->cnts);
 	}
 }
 
 /**
- * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * queued_read_lock_slowpath - acquire read lock of a queue rwlock
  * @lock: Pointer to queue rwlock structure
+ * @cnts: Current qrwlock lock value
  */
-void queue_read_lock_slowpath(struct qrwlock *lock)
+void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 {
-	u32 cnts;
-
 	/*
 	 * Readers come here when they cannot get the lock without waiting
 	 */
 	if (unlikely(in_interrupt())) {
 		/*
-		 * Readers in interrupt context will spin until the lock is
-		 * available without waiting in the queue.
+		 * Readers in interrupt context will get the lock immediately
+		 * if the writer is just waiting (not holding the lock yet).
+		 * The rspin_until_writer_unlock() function returns immediately
+		 * in this case. Otherwise, they will spin (with ACQUIRE
+		 * semantics) until the lock is available without waiting in
+		 * the queue.
 		 */
-		cnts = smp_load_acquire((u32 *)&lock->cnts);
 		rspin_until_writer_unlock(lock, cnts);
 		return;
 	}
@@ -87,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
 	arch_spin_lock(&lock->lock);
 
 	/*
-	 * At the head of the wait queue now, wait until the writer state
-	 * goes to 0 and then try to increment the reader count and get
-	 * the lock. It is possible that an incoming writer may steal the
-	 * lock in the interim, so it is necessary to check the writer byte
-	 * to make sure that the write lock isn't taken.
+	 * The ACQUIRE semantics of the following spinning code ensure
+	 * that accesses can't leak upwards out of our subsequent critical
+	 * section in the case that the lock is currently held for write.
 	 */
-	while (atomic_read(&lock->cnts) & _QW_WMASK)
-		cpu_relax_lowlatency();
-
-	cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+	cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
 	rspin_until_writer_unlock(lock, cnts);
 
 	/*
@@ -104,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
 	 */
 	arch_spin_unlock(&lock->lock);
 }
-EXPORT_SYMBOL(queue_read_lock_slowpath);
+EXPORT_SYMBOL(queued_read_lock_slowpath);
 
 /**
- * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * queued_write_lock_slowpath - acquire write lock of a queue rwlock
  * @lock : Pointer to queue rwlock structure
  */
-void queue_write_lock_slowpath(struct qrwlock *lock)
+void queued_write_lock_slowpath(struct qrwlock *lock)
 {
 	u32 cnts;
 
@@ -119,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
 
 	/* Try to acquire the lock directly if no reader is present */
 	if (!atomic_read(&lock->cnts) &&
-	    (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+	    (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
 		goto unlock;
 
 	/*
@@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
 		struct __qrwlock *l = (struct __qrwlock *)lock;
 
 		if (!READ_ONCE(l->wmode) &&
-		   (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
+		   (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
 			break;
 
 		cpu_relax_lowlatency();
@@ -140,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
 	for (;;) {
 		cnts = atomic_read(&lock->cnts);
 		if ((cnts == _QW_WAITING) &&
-		    (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
-				    _QW_LOCKED) == _QW_WAITING))
+		    (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
+					    _QW_LOCKED) == _QW_WAITING))
 			break;
 
 		cpu_relax_lowlatency();
@@ -149,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
 unlock:
 	arch_spin_unlock(&lock->lock);
 }
-EXPORT_SYMBOL(queue_write_lock_slowpath);
+EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38c49202d532..337c8818541d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
-
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+					   struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_head(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
 
@@ -440,7 +440,7 @@ queue:
 		cpu_relax();
 
 	arch_mcs_spin_unlock_contended(&next->locked);
-	pv_kick_node(next);
+	pv_kick_node(lock, next);
 
 release:
 	/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index df19ae4debd0..c8e6e9a596f5 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -22,9 +22,14 @@
 
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
 enum vcpu_state {
 	vcpu_running = 0,
-	vcpu_halted,
+	vcpu_halted,		/* Used only in pv_wait_node */
+	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
 };
 
 struct pv_node {
@@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
 
 /*
  * Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
  */
 static void pv_wait_node(struct mcs_spinlock *node)
 {
@@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 *
 		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
 		 *     MB			      MB
-		 * [L] pn->locked		[RmW] pn->state = vcpu_running
+		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
 		 *
-		 * Matches the xchg() from pv_kick_node().
+		 * Matches the cmpxchg() from pv_kick_node().
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
@@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
 			pv_wait(&pn->state, vcpu_halted);
 
 		/*
-		 * Reset the vCPU state to avoid unncessary CPU kicking
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that value
+		 * so that pv_wait_head() knows to not also try to hash this lock.
 		 */
-		WRITE_ONCE(pn->state, vcpu_running);
+		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
@@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 * MCS lock will be released soon.
 		 */
 	}
+
 	/*
 	 * By now our node->locked should be 1 and our caller will not actually
 	 * spin-wait for it. We do however rely on our caller to do a
@@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
 }
 
 /*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state such
+ * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
  */
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct __qspinlock *l = (void *)lock;
 
 	/*
-	 * Note that because node->locked is already set, this actual
-	 * mcs_spinlock entry could be re-used already.
+	 * If the vCPU is indeed halted, advance its state to match that of
+	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+	 * observe its next->locked value and advance itself.
 	 *
-	 * This should be fine however, kicking people for no reason is
-	 * harmless.
+	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+	 */
+	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+		return;
+
+	/*
+	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
 	 *
-	 * See the comment in pv_wait_node().
+	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+	 * the hash table later on at unlock time, no atomic instruction is
+	 * needed.
 	 */
-	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
-		pv_kick(pn->cpu);
+	WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+	(void)pv_hash(lock, pn);
 }
 
 /*
@@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	struct qspinlock **lp = NULL;
 	int loop;
 
+	/*
+	 * If pv_kick_node() already advanced our state, we don't need to
+	 * insert ourselves into the hash table anymore.
+	 */
+	if (READ_ONCE(pn->state) == vcpu_hashed)
+		lp = (struct qspinlock **)1;
+
 	for (;;) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
@@ -240,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 			cpu_relax();
 		}
 
-		WRITE_ONCE(pn->state, vcpu_halted);
 		if (!lp) { /* ONCE */
+			WRITE_ONCE(pn->state, vcpu_hashed);
 			lp = pv_hash(lock, pn);
+
 			/*
-			 * lp must be set before setting _Q_SLOW_VAL
+			 * We must hash before setting _Q_SLOW_VAL, such that
+			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+			 * we'll be sure to be able to observe our hash entry.
 			 *
-			 * [S] lp = lock                [RmW] l = l->locked = 0
-			 *     MB                             MB
-			 * [S] l->locked = _Q_SLOW_VAL  [L]   lp
+			 *   [S] pn->state
+			 *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
+			 *       MB                           RMB
+			 * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
+			 *                                [L] pn->state
 			 *
-			 * Matches the cmpxchg() in __pv_queued_spin_unlock().
+			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 			 */
 			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
 				/*
@@ -287,24 +318,34 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
 	struct pv_node *node;
-	u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+	u8 locked;
 
 	/*
 	 * We must not unlock if SLOW, because in that case we must first
 	 * unhash. Otherwise it would be possible to have multiple @lock
 	 * entries, which would be BAD.
 	 */
-	if (likely(lockval == _Q_LOCKED_VAL))
+	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+	if (likely(locked == _Q_LOCKED_VAL))
 		return;
 
-	if (unlikely(lockval != _Q_SLOW_VAL)) {
-		if (debug_locks_silent)
-			return;
-		WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val));
+	if (unlikely(locked != _Q_SLOW_VAL)) {
+		WARN(!debug_locks_silent,
+		     "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+		     (unsigned long)lock, atomic_read(&lock->val));
 		return;
 	}
 
 	/*
+	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+	 * so we need a barrier to order the read of the node data in
+	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+	 *
+	 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+	 */
+	smp_rmb();
+
+	/*
 	 * Since the above failed to release, this must be the SLOW path.
 	 * Therefore start by looking up the blocked node and unhashing it.
 	 */
@@ -319,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	/*
 	 * At this point the memory pointed at by lock can be freed/reused,
 	 * however we can still use the pv_node to kick the CPU.
+	 * The other vCPU may not really be halted, but kicking an active
+	 * vCPU is harmless other than the additional latency in completing
+	 * the unlock.
 	 */
-	if (READ_ONCE(node->state) == vcpu_halted)
+	if (READ_ONCE(node->state) == vcpu_hashed)
 		pv_kick(node->cpu);
 }
 /*
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
deleted file mode 100644
index 1d96dd0d93c1..000000000000
--- a/kernel/locking/rtmutex-tester.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * RT-Mutex-tester: scriptable tester for rt mutexes
- *
- * started by Thomas Gleixner:
- *
- *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- */
-#include <linux/device.h>
-#include <linux/kthread.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/freezer.h>
-#include <linux/stat.h>
-
-#include "rtmutex.h"
-
-#define MAX_RT_TEST_THREADS	8
-#define MAX_RT_TEST_MUTEXES	8
-
-static spinlock_t rttest_lock;
-static atomic_t rttest_event;
-
-struct test_thread_data {
-	int			opcode;
-	int			opdata;
-	int			mutexes[MAX_RT_TEST_MUTEXES];
-	int			event;
-	struct device		dev;
-};
-
-static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
-static struct task_struct *threads[MAX_RT_TEST_THREADS];
-static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
-
-enum test_opcodes {
-	RTTEST_NOP = 0,
-	RTTEST_SCHEDOT,		/* 1 Sched other, data = nice */
-	RTTEST_SCHEDRT,		/* 2 Sched fifo, data = prio */
-	RTTEST_LOCK,		/* 3 Lock uninterruptible, data = lockindex */
-	RTTEST_LOCKNOWAIT,	/* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
-	RTTEST_LOCKINT,		/* 5 Lock interruptible, data = lockindex */
-	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
-	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
-	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
-	/* 9, 10 - reserved for BKL commemoration */
-	RTTEST_SIGNAL = 11,	/* 11 Signal other test thread, data = thread id */
-	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
-	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
-};
-
-static int handle_op(struct test_thread_data *td, int lockwakeup)
-{
-	int i, id, ret = -EINVAL;
-
-	switch(td->opcode) {
-
-	case RTTEST_NOP:
-		return 0;
-
-	case RTTEST_LOCKCONT:
-		td->mutexes[td->opdata] = 1;
-		td->event = atomic_add_return(1, &rttest_event);
-		return 0;
-
-	case RTTEST_RESET:
-		for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
-			if (td->mutexes[i] == 4) {
-				rt_mutex_unlock(&mutexes[i]);
-				td->mutexes[i] = 0;
-			}
-		}
-		return 0;
-
-	case RTTEST_RESETEVENT:
-		atomic_set(&rttest_event, 0);
-		return 0;
-
-	default:
-		if (lockwakeup)
-			return ret;
-	}
-
-	switch(td->opcode) {
-
-	case RTTEST_LOCK:
-	case RTTEST_LOCKNOWAIT:
-		id = td->opdata;
-		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
-			return ret;
-
-		td->mutexes[id] = 1;
-		td->event = atomic_add_return(1, &rttest_event);
-		rt_mutex_lock(&mutexes[id]);
-		td->event = atomic_add_return(1, &rttest_event);
-		td->mutexes[id] = 4;
-		return 0;
-
-	case RTTEST_LOCKINT:
-	case RTTEST_LOCKINTNOWAIT:
-		id = td->opdata;
-		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
-			return ret;
-
-		td->mutexes[id] = 1;
-		td->event = atomic_add_return(1, &rttest_event);
-		ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
-		td->event = atomic_add_return(1, &rttest_event);
-		td->mutexes[id] = ret ? 0 : 4;
-		return ret ? -EINTR : 0;
-
-	case RTTEST_UNLOCK:
-		id = td->opdata;
-		if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
-			return ret;
-
-		td->event = atomic_add_return(1, &rttest_event);
-		rt_mutex_unlock(&mutexes[id]);
-		td->event = atomic_add_return(1, &rttest_event);
-		td->mutexes[id] = 0;
-		return 0;
-
-	default:
-		break;
-	}
-	return ret;
-}
-
-/*
- * Schedule replacement for rtsem_down(). Only called for threads with
- * PF_MUTEX_TESTER set.
- *
- * This allows us to have finegrained control over the event flow.
- *
- */
-void schedule_rt_mutex_test(struct rt_mutex *mutex)
-{
-	int tid, op, dat;
-	struct test_thread_data *td;
-
-	/* We have to lookup the task */
-	for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
-		if (threads[tid] == current)
-			break;
-	}
-
-	BUG_ON(tid == MAX_RT_TEST_THREADS);
-
-	td = &thread_data[tid];
-
-	op = td->opcode;
-	dat = td->opdata;
-
-	switch (op) {
-	case RTTEST_LOCK:
-	case RTTEST_LOCKINT:
-	case RTTEST_LOCKNOWAIT:
-	case RTTEST_LOCKINTNOWAIT:
-		if (mutex != &mutexes[dat])
-			break;
-
-		if (td->mutexes[dat] != 1)
-			break;
-
-		td->mutexes[dat] = 2;
-		td->event = atomic_add_return(1, &rttest_event);
-		break;
-
-	default:
-		break;
-	}
-
-	schedule();
-
-
-	switch (op) {
-	case RTTEST_LOCK:
-	case RTTEST_LOCKINT:
-		if (mutex != &mutexes[dat])
-			return;
-
-		if (td->mutexes[dat] != 2)
-			return;
-
-		td->mutexes[dat] = 3;
-		td->event = atomic_add_return(1, &rttest_event);
-		break;
-
-	case RTTEST_LOCKNOWAIT:
-	case RTTEST_LOCKINTNOWAIT:
-		if (mutex != &mutexes[dat])
-			return;
-
-		if (td->mutexes[dat] != 2)
-			return;
-
-		td->mutexes[dat] = 1;
-		td->event = atomic_add_return(1, &rttest_event);
-		return;
-
-	default:
-		return;
-	}
-
-	td->opcode = 0;
-
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		if (td->opcode > 0) {
-			int ret;
-
-			set_current_state(TASK_RUNNING);
-			ret = handle_op(td, 1);
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (td->opcode == RTTEST_LOCKCONT)
-				break;
-			td->opcode = ret;
-		}
-
-		/* Wait for the next command to be executed */
-		schedule();
-	}
-
-	/* Restore previous command and data */
-	td->opcode = op;
-	td->opdata = dat;
-}
-
-static int test_func(void *data)
-{
-	struct test_thread_data *td = data;
-	int ret;
-
-	current->flags |= PF_MUTEX_TESTER;
-	set_freezable();
-	allow_signal(SIGHUP);
-
-	for(;;) {
-
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		if (td->opcode > 0) {
-			set_current_state(TASK_RUNNING);
-			ret = handle_op(td, 0);
-			set_current_state(TASK_INTERRUPTIBLE);
-			td->opcode = ret;
-		}
-
-		/* Wait for the next command to be executed */
-		schedule();
-		try_to_freeze();
-
-		if (signal_pending(current))
-			flush_signals(current);
-
-		if(kthread_should_stop())
-			break;
-	}
-	return 0;
-}
-
-/**
- * sysfs_test_command - interface for test commands
- * @dev:	thread reference
- * @buf:	command for actual step
- * @count:	length of buffer
- *
- * command syntax:
- *
- * opcode:data
- */
-static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
-				  const char *buf, size_t count)
-{
-	struct sched_param schedpar;
-	struct test_thread_data *td;
-	char cmdbuf[32];
-	int op, dat, tid, ret;
-
-	td = container_of(dev, struct test_thread_data, dev);
-	tid = td->dev.id;
-
-	/* strings from sysfs write are not 0 terminated! */
-	if (count >= sizeof(cmdbuf))
-		return -EINVAL;
-
-	/* strip of \n: */
-	if (buf[count-1] == '\n')
-		count--;
-	if (count < 1)
-		return -EINVAL;
-
-	memcpy(cmdbuf, buf, count);
-	cmdbuf[count] = 0;
-
-	if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
-		return -EINVAL;
-
-	switch (op) {
-	case RTTEST_SCHEDOT:
-		schedpar.sched_priority = 0;
-		ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
-		if (ret)
-			return ret;
-		set_user_nice(current, 0);
-		break;
-
-	case RTTEST_SCHEDRT:
-		schedpar.sched_priority = dat;
-		ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
-		if (ret)
-			return ret;
-		break;
-
-	case RTTEST_SIGNAL:
-		send_sig(SIGHUP, threads[tid], 0);
-		break;
-
-	default:
-		if (td->opcode > 0)
-			return -EBUSY;
-		td->opdata = dat;
-		td->opcode = op;
-		wake_up_process(threads[tid]);
-	}
-
-	return count;
-}
-
-/**
- * sysfs_test_status - sysfs interface for rt tester
- * @dev:	thread to query
- * @buf:	char buffer to be filled with thread status info
- */
-static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
-				 char *buf)
-{
-	struct test_thread_data *td;
-	struct task_struct *tsk;
-	char *curr = buf;
-	int i;
-
-	td = container_of(dev, struct test_thread_data, dev);
-	tsk = threads[td->dev.id];
-
-	spin_lock(&rttest_lock);
-
-	curr += sprintf(curr,
-		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
-		td->opcode, td->event, tsk->state,
-			(MAX_RT_PRIO - 1) - tsk->prio,
-			(MAX_RT_PRIO - 1) - tsk->normal_prio,
-		tsk->pi_blocked_on);
-
-	for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
-		curr += sprintf(curr, "%d", td->mutexes[i]);
-
-	spin_unlock(&rttest_lock);
-
-	curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
-			mutexes[td->dev.id].owner);
-
-	return curr - buf;
-}
-
-static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
-
-static struct bus_type rttest_subsys = {
-	.name = "rttest",
-	.dev_name = "rttest",
-};
-
-static int init_test_thread(int id)
-{
-	thread_data[id].dev.bus = &rttest_subsys;
-	thread_data[id].dev.id = id;
-
-	threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
-	if (IS_ERR(threads[id]))
-		return PTR_ERR(threads[id]);
-
-	return device_register(&thread_data[id].dev);
-}
-
-static int init_rttest(void)
-{
-	int ret, i;
-
-	spin_lock_init(&rttest_lock);
-
-	for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
-		rt_mutex_init(&mutexes[i]);
-
-	ret = subsys_system_register(&rttest_subsys, NULL);
-	if (ret)
-		return ret;
-
-	for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
-		ret = init_test_thread(i);
-		if (ret)
-			break;
-		ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
-		if (ret)
-			break;
-		ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
-		if (ret)
-			break;
-	}
-
-	printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
-
-	return ret;
-}
-
-device_initcall(init_rttest);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5674b073473c..7781d801212f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
 		debug_rt_mutex_print_deadlock(waiter);
 
-		schedule_rt_mutex(lock);
+		schedule();
 
 		raw_spin_lock(&lock->wait_lock);
 		set_current_state(state);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7844f8f0e639..4f5f83c7d2d3 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -15,28 +15,6 @@
 #include <linux/rtmutex.h>
 
 /*
- * The rtmutex in kernel tester is independent of rtmutex debugging. We
- * call schedule_rt_mutex_test() instead of schedule() for the tasks which
- * belong to the tester. That way we can delay the wakeup path of those
- * threads to provoke lock stealing and testing of  complex boosting scenarios.
- */
-#ifdef CONFIG_RT_MUTEX_TESTER
-
-extern void schedule_rt_mutex_test(struct rt_mutex *lock);
-
-#define schedule_rt_mutex(_lock)				\
-  do {								\
-	if (!(current->flags & PF_MUTEX_TESTER))		\
-		schedule();					\
-	else							\
-		schedule_rt_mutex_test(_lock);			\
-  } while (0)
-
-#else
-# define schedule_rt_mutex(_lock)			schedule()
-#endif
-
-/*
  * This is the control structure for tasks blocked on a rt_mutex,
  * which is allocated on the kernel stack on of the blocked task.
  *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8420c233ff7..3595403921bd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 
 static void sched_feat_disable(int i)
 {
-	if (static_key_enabled(&sched_feat_keys[i]))
-		static_key_slow_dec(&sched_feat_keys[i]);
+	static_key_disable(&sched_feat_keys[i]);
 }
 
 static void sched_feat_enable(int i)
 {
-	if (!static_key_enabled(&sched_feat_keys[i]))
-		static_key_slow_inc(&sched_feat_keys[i]);
+	static_key_enable(&sched_feat_keys[i]);
 }
 #else
 static void sched_feat_disable(int i) { };
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+			  void *key)
 {
-	__wake_up_common(q, mode, 1, 0, key);
+	__wake_up_common(q, mode, nr, 0, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_locked_key(q, mode, key);
+		__wake_up_locked_key(q, mode, 1, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
 		if (kthread_should_stop()) {
 			__set_current_state(TASK_RUNNING);
 			preempt_enable();
-			if (ht->cleanup)
+			/* cleanup must mirror setup */
+			if (ht->cleanup && td->status != HP_THREAD_NONE)
 				ht->cleanup(td->cpu, cpu_online(td->cpu));
 			kfree(td);
 			return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 {
 	unsigned int cpu;
 
-	/* Unpark any threads that were voluntarily parked. */
-	for_each_cpu_not(cpu, ht->cpumask) {
-		if (cpu_online(cpu)) {
-			struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
-			if (tsk)
-				kthread_unpark(tsk);
-		}
-	}
-
 	/* We need to destroy also the parked threads of offline cpus */
 	for_each_possible_cpu(cpu) {
 		struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 }
 
 /**
- * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * 					    to hotplug
  * @plug_thread:	Hotplug thread descriptor
+ * @cpumask:		The cpumask where threads run
  *
  * Creates and starts the threads on all online cpus.
  */
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+					   const struct cpumask *cpumask)
 {
 	unsigned int cpu;
 	int ret = 0;
 
 	if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
 		return -ENOMEM;
-	cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+	cpumask_copy(plug_thread->cpumask, cpumask);
 
 	get_online_cpus();
 	mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
 		ret = __smpboot_create_thread(plug_thread, cpu);
 		if (ret) {
 			smpboot_destroy_threads(plug_thread);
+			free_cpumask_var(plug_thread->cpumask);
 			goto out;
 		}
-		smpboot_unpark_thread(plug_thread, cpu);
+		if (cpumask_test_cpu(cpu, cpumask))
+			smpboot_unpark_thread(plug_thread, cpu);
 	}
 	list_add(&plug_thread->list, &hotplug_threads);
 out:
@@ -311,7 +308,7 @@ out:
 	put_online_cpus();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
 
 /**
  * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ca7d84f438f1..03c3875d9958 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
 cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
 
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 8727032e3a6f..53fa971d000d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
  * This is like the signal handler which runs in kernel mode, but it doesn't
  * try to wake up the @task.
  *
+ * Note: there is no ordering guarantee on works queued here.
+ *
  * RETURNS:
  * 0 if succeeds or -ESRCH.
  */
@@ -108,16 +110,6 @@ void task_work_run(void)
 		raw_spin_unlock_wait(&task->pi_lock);
 		smp_mb();
 
-		/* Reverse the list to run the works in fifo order */
-		head = NULL;
-		do {
-			next = work->next;
-			work->next = head;
-			head = work;
-			work = next;
-		} while (work);
-
-		work = head;
 		do {
 			next = work->next;
 			work->func(work);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041adee90..0fe96c7c8803 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 
 /*
  * limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
  */
 static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
 {
 	char *fmt = (char *) (long) r1;
+	bool str_seen = false;
 	int mod[3] = {};
 	int fmt_cnt = 0;
+	u64 unsafe_addr;
+	char buf[64];
 	int i;
 
 	/*
@@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
 		if (fmt[i] == 'l') {
 			mod[fmt_cnt]++;
 			i++;
-		} else if (fmt[i] == 'p') {
+		} else if (fmt[i] == 'p' || fmt[i] == 's') {
 			mod[fmt_cnt]++;
 			i++;
 			if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
 				return -EINVAL;
 			fmt_cnt++;
+			if (fmt[i - 1] == 's') {
+				if (str_seen)
+					/* allow only one '%s' per fmt string */
+					return -EINVAL;
+				str_seen = true;
+
+				switch (fmt_cnt) {
+				case 1:
+					unsafe_addr = r3;
+					r3 = (long) buf;
+					break;
+				case 2:
+					unsafe_addr = r4;
+					r4 = (long) buf;
+					break;
+				case 3:
+					unsafe_addr = r5;
+					r5 = (long) buf;
+					break;
+				}
+				buf[0] = 0;
+				strncpy_from_unsafe(buf,
+						    (void *) (long) unsafe_addr,
+						    sizeof(buf));
+			}
 			continue;
 		}
 
@@ -158,6 +186,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct perf_event *event;
+
+	if (unlikely(index >= array->map.max_entries))
+		return -E2BIG;
+
+	event = (struct perf_event *)array->ptrs[index];
+	if (!event)
+		return -ENOENT;
+
+	/*
+	 * we don't know if the function is run successfully by the
+	 * return value. It can be judged in other places, such as
+	 * eBPF programs.
+	 */
+	return perf_event_read_local(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+	.func		= bpf_perf_event_read,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -183,6 +240,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return bpf_get_trace_printk_proto();
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_perf_event_read:
+		return &bpf_perf_event_read_proto;
 	default:
 		return NULL;
 	}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b7d0cdd9906c..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
 static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
 					    void *addr, void *dest)
 {
-	long ret;
 	int maxlen = get_rloc_len(*(u32 *)dest);
 	u8 *dst = get_rloc_data(dest);
-	u8 *src = addr;
-	mm_segment_t old_fs = get_fs();
+	long ret;
 
 	if (!maxlen)
 		return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
 	 * Try to get string again, since the string can be changed while
 	 * probing.
 	 */
-	set_fs(KERNEL_DS);
-	pagefault_disable();
-
-	do
-		ret = __copy_from_user_inatomic(dst++, src++, 1);
-	while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
-
-	dst[-1] = '\0';
-	pagefault_enable();
-	set_fs(old_fs);
+	ret = strncpy_from_unsafe(dst, addr, maxlen);
 
 	if (ret < 0) {	/* Failed to fetch string */
-		((u8 *)get_rloc_data(dest))[0] = '\0';
+		dst[0] = '\0';
 		*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
 	} else {
-		*(u32 *)dest = make_data_rloc(src - (u8 *)addr,
-					      get_rloc_offs(*(u32 *)dest));
+		*(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
 	}
 }
 NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f65a0a06a8c0..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 	cred->cap_inheritable = CAP_EMPTY_SET;
 	cred->cap_permitted = CAP_FULL_SET;
 	cred->cap_effective = CAP_FULL_SET;
+	cred->cap_ambient = CAP_EMPTY_SET;
 	cred->cap_bset = CAP_FULL_SET;
 #ifdef CONFIG_KEYS
 	key_put(cred->request_key_auth);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
 #include <linux/perf_event.h>
+#include <linux/kthread.h>
 
 /*
  * The run state of the lockup detectors is controlled by the content of the
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
+/*
+ * The 'watchdog_running' variable is set to 1 when the watchdog threads
+ * are registered/started and is set to 0 when the watchdog threads are
+ * unregistered/stopped, so it is an indicator whether the threads exist.
+ */
 static int __read_mostly watchdog_running;
+/*
+ * If a subsystem has a need to deactivate the watchdog temporarily, it
+ * can use the suspend/resume interface to achieve this. The content of
+ * the 'watchdog_suspended' variable reflects this state. Existing threads
+ * are parked/unparked by the lockup_detector_{suspend|resume} functions
+ * (see comment blocks pertaining to those functions for further details).
+ *
+ * 'watchdog_suspended' also prevents threads from being registered/started
+ * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
+ * of 'watchdog_running' cannot change while the watchdog is deactivated
+ * temporarily (see related code in 'proc' handlers).
+ */
+static int __read_mostly watchdog_suspended;
+
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
 	}
 }
 
-void watchdog_nmi_enable_all(void)
-{
-	int cpu;
-
-	mutex_lock(&watchdog_proc_mutex);
-
-	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-		goto unlock;
-
-	get_online_cpus();
-	for_each_watchdog_cpu(cpu)
-		watchdog_nmi_enable(cpu);
-	put_online_cpus();
-
-unlock:
-	mutex_unlock(&watchdog_proc_mutex);
-}
-
-void watchdog_nmi_disable_all(void)
-{
-	int cpu;
-
-	mutex_lock(&watchdog_proc_mutex);
-
-	if (!watchdog_running)
-		goto unlock;
-
-	get_online_cpus();
-	for_each_watchdog_cpu(cpu)
-		watchdog_nmi_disable(cpu);
-	put_online_cpus();
-
-unlock:
-	mutex_unlock(&watchdog_proc_mutex);
-}
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
 static void watchdog_nmi_disable(unsigned int cpu) { return; }
-void watchdog_nmi_enable_all(void) {}
-void watchdog_nmi_disable_all(void) {}
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = {
 	.unpark			= watchdog_enable,
 };
 
-static void restart_watchdog_hrtimer(void *info)
+/*
+ * park all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static int watchdog_park_threads(void)
 {
-	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
-	int ret;
+	int cpu, ret = 0;
 
+	get_online_cpus();
+	for_each_watchdog_cpu(cpu) {
+		ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
+		if (ret)
+			break;
+	}
+	if (ret) {
+		for_each_watchdog_cpu(cpu)
+			kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+	}
+	put_online_cpus();
+
+	return ret;
+}
+
+/*
+ * unpark all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static void watchdog_unpark_threads(void)
+{
+	int cpu;
+
+	get_online_cpus();
+	for_each_watchdog_cpu(cpu)
+		kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+	put_online_cpus();
+}
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+	int ret = 0;
+
+	mutex_lock(&watchdog_proc_mutex);
 	/*
-	 * No need to cancel and restart hrtimer if it is currently executing
-	 * because it will reprogram itself with the new period now.
-	 * We should never see it unqueued here because we are running per-cpu
-	 * with interrupts disabled.
+	 * Multiple suspend requests can be active in parallel (counted by
+	 * the 'watchdog_suspended' variable). If the watchdog threads are
+	 * running, the first caller takes care that they will be parked.
+	 * The state of 'watchdog_running' cannot change while a suspend
+	 * request is active (see related code in 'proc' handlers).
 	 */
-	ret = hrtimer_try_to_cancel(hrtimer);
-	if (ret == 1)
-		hrtimer_start(hrtimer, ns_to_ktime(sample_period),
-				HRTIMER_MODE_REL_PINNED);
+	if (watchdog_running && !watchdog_suspended)
+		ret = watchdog_park_threads();
+
+	if (ret == 0)
+		watchdog_suspended++;
+
+	mutex_unlock(&watchdog_proc_mutex);
+
+	return ret;
 }
 
-static void update_watchdog(int cpu)
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
 {
+	mutex_lock(&watchdog_proc_mutex);
+
+	watchdog_suspended--;
 	/*
-	 * Make sure that perf event counter will adopt to a new
-	 * sampling period. Updating the sampling period directly would
-	 * be much nicer but we do not have an API for that now so
-	 * let's use a big hammer.
-	 * Hrtimer will adopt the new period on the next tick but this
-	 * might be late already so we have to restart the timer as well.
+	 * The watchdog threads are unparked if they were previously running
+	 * and if there is no more active suspend request.
 	 */
-	watchdog_nmi_disable(cpu);
-	smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
-	watchdog_nmi_enable(cpu);
+	if (watchdog_running && !watchdog_suspended)
+		watchdog_unpark_threads();
+
+	mutex_unlock(&watchdog_proc_mutex);
 }
 
 static void update_watchdog_all_cpus(void)
 {
-	int cpu;
-
-	get_online_cpus();
-	for_each_watchdog_cpu(cpu)
-		update_watchdog(cpu);
-	put_online_cpus();
+	watchdog_park_threads();
+	watchdog_unpark_threads();
 }
 
 static int watchdog_enable_all_cpus(void)
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void)
 	int err = 0;
 
 	if (!watchdog_running) {
-		err = smpboot_register_percpu_thread(&watchdog_threads);
+		err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+							     &watchdog_cpumask);
 		if (err)
 			pr_err("Failed to create watchdog threads, disabled\n");
-		else {
-			if (smpboot_update_cpumask_percpu_thread(
-				    &watchdog_threads, &watchdog_cpumask))
-				pr_err("Failed to set cpumask for watchdog threads\n");
+		else
 			watchdog_running = 1;
-		}
 	} else {
 		/*
 		 * Enable/disable the lockup detectors or
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 
 	mutex_lock(&watchdog_proc_mutex);
 
+	if (watchdog_suspended) {
+		/* no parameter changes allowed while watchdog is suspended */
+		err = -EAGAIN;
+		goto out;
+	}
+
 	/*
 	 * If the parameter is being read return the state of the corresponding
 	 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 
 	mutex_lock(&watchdog_proc_mutex);
 
+	if (watchdog_suspended) {
+		/* no parameter changes allowed while watchdog is suspended */
+		err = -EAGAIN;
+		goto out;
+	}
+
 	old = ACCESS_ONCE(watchdog_thresh);
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 	int err;
 
 	mutex_lock(&watchdog_proc_mutex);
+
+	if (watchdog_suspended) {
+		/* no parameter changes allowed while watchdog is suspended */
+		err = -EAGAIN;
+		goto out;
+	}
+
 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
 	if (!err && write) {
 		/* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 				pr_err("cpumask update failed\n");
 		}
 	}
+out:
 	mutex_unlock(&watchdog_proc_mutex);
 	return err;
 }
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void)
 
 #ifdef CONFIG_NO_HZ_FULL
 	if (tick_nohz_full_enabled()) {
-		if (!cpumask_empty(tick_nohz_full_mask))
-			pr_info("Disabling watchdog on nohz_full cores by default\n");
-		cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
-			       tick_nohz_full_mask);
+		pr_info("Disabling watchdog on nohz_full cores by default\n");
+		cpumask_copy(&watchdog_cpumask, housekeeping_mask);
 	} else
 		cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
 #else