diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/arraymap.c | 137 | ||||
-rw-r--r-- | kernel/bpf/core.c | 9 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 2 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 54 | ||||
-rw-r--r-- | kernel/cgroup.c | 7 | ||||
-rw-r--r-- | kernel/events/core.c | 78 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/futex.c | 100 | ||||
-rw-r--r-- | kernel/jump_label.c | 158 | ||||
-rw-r--r-- | kernel/kthread.c | 7 | ||||
-rw-r--r-- | kernel/locking/Makefile | 1 | ||||
-rw-r--r-- | kernel/locking/qrwlock.c | 47 | ||||
-rw-r--r-- | kernel/locking/qspinlock.c | 6 | ||||
-rw-r--r-- | kernel/locking/qspinlock_paravirt.h | 102 | ||||
-rw-r--r-- | kernel/locking/rtmutex-tester.c | 420 | ||||
-rw-r--r-- | kernel/locking/rtmutex.c | 2 | ||||
-rw-r--r-- | kernel/locking/rtmutex_common.h | 22 | ||||
-rw-r--r-- | kernel/sched/core.c | 6 | ||||
-rw-r--r-- | kernel/sched/wait.c | 7 | ||||
-rw-r--r-- | kernel/smpboot.c | 27 | ||||
-rw-r--r-- | kernel/sys_ni.c | 1 | ||||
-rw-r--r-- | kernel/task_work.c | 12 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 63 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 20 | ||||
-rw-r--r-- | kernel/user_namespace.c | 1 | ||||
-rw-r--r-- | kernel/watchdog.c | 189 |
26 files changed, 746 insertions, 735 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cb31229a6fa4..29ace107f236 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -150,15 +150,15 @@ static int __init register_array_map(void) } late_initcall(register_array_map); -static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { - /* only bpf_prog file descriptors can be stored in prog_array map */ + /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); return array_map_alloc(attr); } -static void prog_array_map_free(struct bpf_map *map) +static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map) /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) - BUG_ON(array->prog[i] != NULL); + BUG_ON(array->ptrs[i] != NULL); kvfree(array); } -static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return NULL; } /* only called from syscall */ -static int prog_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static int fd_array_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *prog, *old_prog; + void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) @@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - prog = bpf_prog_get(ufd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - - if (!bpf_prog_array_compatible(array, prog)) { - bpf_prog_put(prog); - return -EINVAL; - } + new_ptr = map->ops->map_fd_get_ptr(map, ufd); + if (IS_ERR(new_ptr)) + return PTR_ERR(new_ptr); - old_prog = xchg(array->prog + index, prog); - if (old_prog) - bpf_prog_put_rcu(old_prog); + old_ptr = xchg(array->ptrs + index, new_ptr); + if (old_ptr) + map->ops->map_fd_put_ptr(old_ptr); return 0; } -static int prog_array_map_delete_elem(struct bpf_map *map, void *key) +static int fd_array_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *old_prog; + void *old_ptr; u32 index = *(u32 *)key; if (index >= array->map.max_entries) return -E2BIG; - old_prog = xchg(array->prog + index, NULL); - if (old_prog) { - bpf_prog_put_rcu(old_prog); + old_ptr = xchg(array->ptrs + index, NULL); + if (old_ptr) { + map->ops->map_fd_put_ptr(old_ptr); return 0; } else { return -ENOENT; } } +static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) + return prog; + + if (!bpf_prog_array_compatible(array, prog)) { + bpf_prog_put(prog); + return ERR_PTR(-EINVAL); + } + return prog; +} + +static void prog_fd_array_put_ptr(void *ptr) +{ + struct bpf_prog *prog = ptr; + + bpf_prog_put_rcu(prog); +} + /* decrement refcnt of all bpf_progs that are stored in this map */ -void bpf_prog_array_map_clear(struct bpf_map *map) +void bpf_fd_array_map_clear(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) - prog_array_map_delete_elem(map, &i); + fd_array_map_delete_elem(map, &i); } static const struct bpf_map_ops prog_array_ops = { - .map_alloc = prog_array_map_alloc, - .map_free = prog_array_map_free, + .map_alloc = fd_array_map_alloc, + .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, - .map_lookup_elem = prog_array_map_lookup_elem, - .map_update_elem = prog_array_map_update_elem, - .map_delete_elem = prog_array_map_delete_elem, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = prog_fd_array_get_ptr, + .map_fd_put_ptr = prog_fd_array_put_ptr, }; static struct bpf_map_type_list prog_array_type __read_mostly = { @@ -255,3 +273,60 @@ static int __init register_prog_array_map(void) return 0; } late_initcall(register_prog_array_map); + +static void perf_event_array_map_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct perf_event *event; + const struct perf_event_attr *attr; + + event = perf_event_get(fd); + if (IS_ERR(event)) + return event; + + attr = perf_event_attrs(event); + if (IS_ERR(attr)) + return (void *)attr; + + if (attr->type != PERF_TYPE_RAW && + attr->type != PERF_TYPE_HARDWARE) { + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); + } + return event; +} + +static void perf_event_fd_array_put_ptr(void *ptr) +{ + struct perf_event *event = ptr; + + perf_event_release_kernel(event); +} + +static const struct bpf_map_ops perf_event_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = perf_event_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = perf_event_fd_array_get_ptr, + .map_fd_put_ptr = perf_event_fd_array_put_ptr, +}; + +static struct bpf_map_type_list perf_event_array_type __read_mostly = { + .ops = &perf_event_array_ops, + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, +}; + +static int __init register_perf_event_array_map(void) +{ + bpf_register_map_type(&perf_event_array_type); + return 0; +} +late_initcall(register_perf_event_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c5bedc82bc1c..67c380cfa9ca 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } +EXPORT_SYMBOL_GPL(__bpf_call_base); /** * __bpf_prog_run - run eBPF program on a given context @@ -449,11 +450,15 @@ select_insn: tail_call_cnt++; - prog = READ_ONCE(array->prog[index]); + prog = READ_ONCE(array->ptrs[index]); if (unlikely(!prog)) goto out; - ARG1 = BPF_R1; + /* ARG1 at this point is guaranteed to point to CTX from + * the verifier side due to the fact that the tail call is + * handeled like a helper, that is, bpf_tail_call_proto, + * where arg1_type is ARG_PTR_TO_CTX. + */ insn = prog->insnsi; goto select_insn; out: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a1b14d197a4f..dc9b464fefa9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp) /* prog_array stores refcnt-ed bpf_prog pointers * release them all when user space closes prog_array_fd */ - bpf_prog_array_map_clear(map); + bpf_fd_array_map_clear(map); bpf_map_put(map); return 0; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 039d866fd36a..ed12e385fb75 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,14 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; +static const struct { + int map_type; + int func_id; +} func_limit[] = { + {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, +}; + static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -648,6 +656,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, struct verifier_state *state = &env->cur_state; int size, err = 0; + if (state->regs[regno].type == PTR_TO_STACK) + off += state->regs[regno].imm; + size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; @@ -667,7 +678,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); - } else if (state->regs[regno].type == FRAME_PTR) { + } else if (state->regs[regno].type == FRAME_PTR || + state->regs[regno].type == PTR_TO_STACK) { if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; @@ -833,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return err; } +static int check_map_func_compatibility(struct bpf_map *map, int func_id) +{ + bool bool_map, bool_func; + int i; + + if (!map) + return 0; + + for (i = 0; i < ARRAY_SIZE(func_limit); i++) { + bool_map = (map->map_type == func_limit[i].map_type); + bool_func = (func_id == func_limit[i].func_id); + /* only when map & func pair match it can continue. + * don't allow any other map type to be passed into + * the special func; + */ + if (bool_map != bool_func) + return -EINVAL; + } + + return 0; +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = &env->cur_state; @@ -908,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } - if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && - func_id != BPF_FUNC_tail_call) - /* prog_array map type needs extra care: - * only allow to pass it into bpf_tail_call() for now. - * bpf_map_delete_elem() can be allowed in the future, - * while bpf_map_update_elem() must only be done via syscall - */ - return -EINVAL; - - if (func_id == BPF_FUNC_tail_call && - map->map_type != BPF_MAP_TYPE_PROG_ARRAY) - /* don't allow any other map type to be passed into - * bpf_tail_call() - */ - return -EINVAL; + err = check_map_func_compatibility(map, func_id); + if (err) + return err; return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f3f5cd5e2c0d..a8538e443784 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq, if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) - seq_printf(seq, ",%s", ss->legacy_name); + seq_show_option(seq, ss->name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq, spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) - seq_printf(seq, ",release_agent=%s", root->release_agent_path); + seq_show_option(seq, "release_agent", + root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) - seq_printf(seq, ",name=%s", root->name); + seq_show_option(seq, "name", root->name); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index ae16867670a9..e8183895691c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3222,6 +3222,59 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } +/* + * NMI-safe method to read a local event, that is an event that + * is: + * - either for the current task, or for this CPU + * - does not have inherit set, for inherited task events + * will not be local and we cannot read them atomically + * - must not have a pmu::count method + */ +u64 perf_event_read_local(struct perf_event *event) +{ + unsigned long flags; + u64 val; + + /* + * Disabling interrupts avoids all counter scheduling (context + * switches, timer based rotation and IPIs). + */ + local_irq_save(flags); + + /* If this is a per-task event, it must be for current */ + WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && + event->hw.target != current); + + /* If this is a per-CPU event, it must be for this CPU */ + WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && + event->cpu != smp_processor_id()); + + /* + * It must not be an event with inherit set, we cannot read + * all child counters from atomic context. + */ + WARN_ON_ONCE(event->attr.inherit); + + /* + * It must not have a pmu::count method, those are not + * NMI safe. + */ + WARN_ON_ONCE(event->pmu->count); + + /* + * If the event is currently on this CPU, its either a per-task event, + * or local to this CPU. Furthermore it means its ACTIVE (otherwise + * oncpu == -1). + */ + if (event->oncpu == smp_processor_id()) + event->pmu->read(event); + + val = local64_read(&event->count); + local_irq_restore(flags); + + return val; +} + static u64 perf_event_read(struct perf_event *event) { /* @@ -8718,6 +8771,31 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } +struct perf_event *perf_event_get(unsigned int fd) +{ + int err; + struct fd f; + struct perf_event *event; + + err = perf_fget_light(fd, &f); + if (err) + return ERR_PTR(err); + + event = f.file->private_data; + atomic_long_inc(&event->refcount); + fdput(f); + + return event; +} + +const struct perf_event_attr *perf_event_attrs(struct perf_event *event) +{ + if (!event) + return ERR_PTR(-EINVAL); + + return &event->attr; +} + /* * inherit a event from parent task to child task: */ diff --git a/kernel/fork.c b/kernel/fork.c index 03aa2e6de7a4..7d5f0f118a63 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; + tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); diff --git a/kernel/futex.c b/kernel/futex.c index c4a182f5357e..6e443efc65f4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -64,6 +64,7 @@ #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/bootmem.h> +#include <linux/fault-inject.h> #include <asm/futex.h> @@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize; static struct futex_hash_bucket *futex_queues; +/* + * Fault injections for futexes. + */ +#ifdef CONFIG_FAIL_FUTEX + +static struct { + struct fault_attr attr; + + u32 ignore_private; +} fail_futex = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_private = 0, +}; + +static int __init setup_fail_futex(char *str) +{ + return setup_fault_attr(&fail_futex.attr, str); +} +__setup("fail_futex=", setup_fail_futex); + +static bool should_fail_futex(bool fshared) +{ + if (fail_futex.ignore_private && !fshared) + return false; + + return should_fail(&fail_futex.attr, 1); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_futex_debugfs(void) +{ + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_futex", NULL, + &fail_futex.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + if (!debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +late_initcall(fail_futex_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else +static inline bool should_fail_futex(bool fshared) +{ + return false; +} +#endif /* CONFIG_FAIL_FUTEX */ + static inline void futex_get_mm(union futex_key *key) { atomic_inc(&key->private.mm->mm_count); @@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) } again: + /* Ignore any VERIFY_READ mapping (futex common case) */ + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + err = get_user_pages_fast(address, 1, 1, &page); /* * If write access is not required (eg. FUTEX_WAIT), try @@ -516,7 +584,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (ro) { + if (unlikely(should_fail_futex(fshared)) || ro) { err = -EFAULT; goto out; } @@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { u32 uninitialized_var(curval); + if (unlikely(should_fail_futex(true))) + return -EFAULT; + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; @@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, if (get_futex_value_locked(&uval, uaddr)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Detect deadlocks. */ if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; + if ((unlikely(should_fail_futex(true)))) + return -EDEADLK; + /* * Lookup existing state first. If it exists, try to attach to * its pi_state. @@ -1155,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + if (unlikely(should_fail_futex(true))) + ret = -EFAULT; + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) @@ -1457,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Find the top_waiter and determine if there are additional waiters. * If the caller intends to requeue more than 1 waiter to pifutex, @@ -2268,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart) /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: - * if there are waiters then it will block, it does PI, etc. (Due to - * races the kernel might see a 0 value of the futex too.) + * if there are waiters then it will block as a consequence of relying + * on rt-mutexes, it does PI, etc. (Due to races the kernel might see + * a 0 value of the futex too.). + * + * Also serves as futex trylock_pi()'ing, and due semantics. */ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) @@ -2300,6 +2386,10 @@ retry_private: ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { + /* + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. + */ switch (ret) { case 1: /* We got the lock. */ @@ -2530,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be - * the same type, no requeueing from private to shared, etc. + * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all @@ -3005,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 52ebaca1b9fc..f7dd15d537f9 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct static_key *key, int enable); +static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { @@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) { - if (!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_ENABLE); - else - jump_label_update(key, JUMP_LABEL_DISABLE); - } - atomic_inc(&key->enabled); + if (atomic_inc_return(&key->enabled) == 1) + jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key, atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); } else { - if (!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_DISABLE); - else - jump_label_update(key, JUMP_LABEL_ENABLE); + jump_label_update(key); } jump_label_unlock(); } @@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, return 0; } -/* +/* * Update code which is definitely not currently executing. * Architectures which need heavyweight synchronization to modify * running code can override this to make the non-live update case @@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - arch_jump_label_transform(entry, type); + arch_jump_label_transform(entry, type); +} + +static inline struct jump_entry *static_key_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK); +} + +static inline bool static_key_type(struct static_key *key) +{ + return (unsigned long)key->entries & JUMP_TYPE_MASK; +} + +static inline struct static_key *jump_entry_key(struct jump_entry *entry) +{ + return (struct static_key *)((unsigned long)entry->key & ~1UL); +} + +static bool jump_entry_branch(struct jump_entry *entry) +{ + return (unsigned long)entry->key & 1UL; +} + +static enum jump_label_type jump_label_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool enabled = static_key_enabled(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return enabled ^ branch; } static void __jump_label_update(struct static_key *key, struct jump_entry *entry, - struct jump_entry *stop, int enable) + struct jump_entry *stop) { - for (; (entry < stop) && - (entry->key == (jump_label_t)(unsigned long)key); - entry++) { + for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { /* * entry->code set to 0 invalidates module init text sections * kernel_text_address() verifies we are not in core kernel * init code, see jump_label_invalidate_module_init(). */ if (entry->code && kernel_text_address(entry->code)) - arch_jump_label_transform(entry, enable); + arch_jump_label_transform(entry, jump_label_type(entry)); } } -static enum jump_label_type jump_label_type(struct static_key *key) -{ - bool true_branch = jump_label_get_branch_default(key); - bool state = static_key_enabled(key); - - if ((!true_branch && state) || (true_branch && !state)) - return JUMP_LABEL_ENABLE; - - return JUMP_LABEL_DISABLE; -} - void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; @@ -202,8 +211,11 @@ void __init jump_label_init(void) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_type(iterk)); + /* rewrite NOPs */ + if (jump_label_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); + + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -222,6 +234,16 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES +static enum jump_label_type jump_label_init_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool type = static_key_type(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return type ^ branch; +} + struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; @@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct static_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key) { - struct static_key_mod *mod = key->next; + struct static_key_mod *mod; - while (mod) { + for (mod = key->next; mod; mod = mod->next) { struct module *m = mod->mod; __jump_label_update(key, mod->entries, - m->jump_entries + m->num_jump_entries, - enable); - mod = mod->next; + m->jump_entries + m->num_jump_entries); } } @@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); + /* Only write NOPs for arch_branch_static(). */ + if (jump_label_init_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); } } @@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod) jlm->next = key->next; key->next = jlm; - if (jump_label_type(key) == JUMP_LABEL_ENABLE) - __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); + /* Only update if we've changed from our initial state */ + if (jump_label_type(iter) != jump_label_init_type(iter)) + __jump_label_update(key, iter, iter_stop); } return 0; @@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod) struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) + if (jump_entry_key(iter) == key) continue; - key = (struct static_key *)(unsigned long)iter->key; + key = jump_entry_key(iter); if (within_module(iter->key, mod)) continue; @@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct static_key *key, int enable) +static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; - struct jump_entry *entry = jump_label_get_entries(key); + struct jump_entry *entry = static_key_entries(key); #ifdef CONFIG_MODULES struct module *mod; - __jump_label_mod_update(key, enable); + __jump_label_mod_update(key); preempt_disable(); mod = __module_address((unsigned long)key); @@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable) #endif /* if there are no users, entry can be NULL */ if (entry) - __jump_label_update(key, entry, stop, enable); + __jump_label_update(key, entry, stop); } -#endif +#ifdef CONFIG_STATIC_KEYS_SELFTEST +static DEFINE_STATIC_KEY_TRUE(sk_true); +static DEFINE_STATIC_KEY_FALSE(sk_false); + +static __init int jump_label_test(void) +{ + int i; + + for (i = 0; i < 2; i++) { + WARN_ON(static_key_enabled(&sk_true.key) != true); + WARN_ON(static_key_enabled(&sk_false.key) != false); + + WARN_ON(!static_branch_likely(&sk_true)); + WARN_ON(!static_branch_unlikely(&sk_true)); + WARN_ON(static_branch_likely(&sk_false)); + WARN_ON(static_branch_unlikely(&sk_false)); + + static_branch_disable(&sk_true); + static_branch_enable(&sk_false); + + WARN_ON(static_key_enabled(&sk_true.key) == true); + WARN_ON(static_key_enabled(&sk_false.key) == false); + + WARN_ON(static_branch_likely(&sk_true)); + WARN_ON(static_branch_unlikely(&sk_true)); + WARN_ON(!static_branch_likely(&sk_false)); + WARN_ON(!static_branch_unlikely(&sk_false)); + + static_branch_enable(&sk_true); + static_branch_disable(&sk_false); + } + + return 0; +} +late_initcall(jump_label_test); +#endif /* STATIC_KEYS_SELFTEST */ + +#endif /* HAVE_JUMP_LABEL */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 490924cc9e7c..9ff173dca1ae 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create) * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. - * @node: memory node number. + * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). + * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give -1. + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which no one will call kthread_stop(), or diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 7dd5c9918e4c..36942047ffc0 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -20,7 +20,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 6c5da483966b..f17a3e3b3550 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { cpu_relax_lowlatency(); - cnts = smp_load_acquire((u32 *)&lock->cnts); + cnts = atomic_read_acquire(&lock->cnts); } } /** - * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure + * @cnts: Current qrwlock lock value */ -void queue_read_lock_slowpath(struct qrwlock *lock) +void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) { - u32 cnts; - /* * Readers come here when they cannot get the lock without waiting */ if (unlikely(in_interrupt())) { /* - * Readers in interrupt context will spin until the lock is - * available without waiting in the queue. + * Readers in interrupt context will get the lock immediately + * if the writer is just waiting (not holding the lock yet). + * The rspin_until_writer_unlock() function returns immediately + * in this case. Otherwise, they will spin (with ACQUIRE + * semantics) until the lock is available without waiting in + * the queue. */ - cnts = smp_load_acquire((u32 *)&lock->cnts); rspin_until_writer_unlock(lock, cnts); return; } @@ -87,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock) arch_spin_lock(&lock->lock); /* - * At the head of the wait queue now, wait until the writer state - * goes to 0 and then try to increment the reader count and get - * the lock. It is possible that an incoming writer may steal the - * lock in the interim, so it is necessary to check the writer byte - * to make sure that the write lock isn't taken. + * The ACQUIRE semantics of the following spinning code ensure + * that accesses can't leak upwards out of our subsequent critical + * section in the case that the lock is currently held for write. */ - while (atomic_read(&lock->cnts) & _QW_WMASK) - cpu_relax_lowlatency(); - - cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); /* @@ -104,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock) */ arch_spin_unlock(&lock->lock); } -EXPORT_SYMBOL(queue_read_lock_slowpath); +EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * queued_write_lock_slowpath - acquire write lock of a queue rwlock * @lock : Pointer to queue rwlock structure */ -void queue_write_lock_slowpath(struct qrwlock *lock) +void queued_write_lock_slowpath(struct qrwlock *lock) { u32 cnts; @@ -119,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) /* Try to acquire the lock directly if no reader is present */ if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) goto unlock; /* @@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) struct __qrwlock *l = (struct __qrwlock *)lock; if (!READ_ONCE(l->wmode) && - (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) + (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax_lowlatency(); @@ -140,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock) for (;;) { cnts = atomic_read(&lock->cnts); if ((cnts == _QW_WAITING) && - (atomic_cmpxchg(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) + (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) break; cpu_relax_lowlatency(); @@ -149,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock) unlock: arch_spin_unlock(&lock->lock); } -EXPORT_SYMBOL(queue_write_lock_slowpath); +EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38c49202d532..337c8818541d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock) static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } - +static __always_inline void __pv_kick_node(struct qspinlock *lock, + struct mcs_spinlock *node) { } static __always_inline void __pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) { } @@ -440,7 +440,7 @@ queue: cpu_relax(); arch_mcs_spin_unlock_contended(&next->locked); - pv_kick_node(next); + pv_kick_node(lock, next); release: /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index df19ae4debd0..c8e6e9a596f5 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -22,9 +22,14 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) +/* + * Queue node uses: vcpu_running & vcpu_halted. + * Queue head uses: vcpu_running & vcpu_hashed. + */ enum vcpu_state { vcpu_running = 0, - vcpu_halted, + vcpu_halted, /* Used only in pv_wait_node */ + vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ }; struct pv_node { @@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node) /* * Wait for node->locked to become true, halt the vcpu after a short spin. - * pv_kick_node() is used to wake the vcpu again. + * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its + * behalf. */ static void pv_wait_node(struct mcs_spinlock *node) { @@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node) * * [S] pn->state = vcpu_halted [S] next->locked = 1 * MB MB - * [L] pn->locked [RmW] pn->state = vcpu_running + * [L] pn->locked [RmW] pn->state = vcpu_hashed * - * Matches the xchg() from pv_kick_node(). + * Matches the cmpxchg() from pv_kick_node(). */ smp_store_mb(pn->state, vcpu_halted); @@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node) pv_wait(&pn->state, vcpu_halted); /* - * Reset the vCPU state to avoid unncessary CPU kicking + * If pv_kick_node() changed us to vcpu_hashed, retain that value + * so that pv_wait_head() knows to not also try to hash this lock. */ - WRITE_ONCE(pn->state, vcpu_running); + cmpxchg(&pn->state, vcpu_halted, vcpu_running); /* * If the locked flag is still not set after wakeup, it is a @@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * MCS lock will be released soon. */ } + /* * By now our node->locked should be 1 and our caller will not actually * spin-wait for it. We do however rely on our caller to do a @@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node) } /* - * Called after setting next->locked = 1, used to wake those stuck in - * pv_wait_node(). + * Called after setting next->locked = 1 when we're the lock owner. + * + * Instead of waking the waiters stuck in pv_wait_node() advance their state such + * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. */ -static void pv_kick_node(struct mcs_spinlock *node) +static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; /* - * Note that because node->locked is already set, this actual - * mcs_spinlock entry could be re-used already. + * If the vCPU is indeed halted, advance its state to match that of + * pv_wait_node(). If OTOH this fails, the vCPU was running and will + * observe its next->locked value and advance itself. * - * This should be fine however, kicking people for no reason is - * harmless. + * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + */ + if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + return; + + /* + * Put the lock into the hash table and set the _Q_SLOW_VAL. * - * See the comment in pv_wait_node(). + * As this is the same vCPU that will check the _Q_SLOW_VAL value and + * the hash table later on at unlock time, no atomic instruction is + * needed. */ - if (xchg(&pn->state, vcpu_running) == vcpu_halted) - pv_kick(pn->cpu); + WRITE_ONCE(l->locked, _Q_SLOW_VAL); + (void)pv_hash(lock, pn); } /* @@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) struct qspinlock **lp = NULL; int loop; + /* + * If pv_kick_node() already advanced our state, we don't need to + * insert ourselves into the hash table anymore. + */ + if (READ_ONCE(pn->state) == vcpu_hashed) + lp = (struct qspinlock **)1; + for (;;) { for (loop = SPIN_THRESHOLD; loop; loop--) { if (!READ_ONCE(l->locked)) @@ -240,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) cpu_relax(); } - WRITE_ONCE(pn->state, vcpu_halted); if (!lp) { /* ONCE */ + WRITE_ONCE(pn->state, vcpu_hashed); lp = pv_hash(lock, pn); + /* - * lp must be set before setting _Q_SLOW_VAL + * We must hash before setting _Q_SLOW_VAL, such that + * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() + * we'll be sure to be able to observe our hash entry. * - * [S] lp = lock [RmW] l = l->locked = 0 - * MB MB - * [S] l->locked = _Q_SLOW_VAL [L] lp + * [S] pn->state + * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL + * MB RMB + * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> + * [L] pn->state * - * Matches the cmpxchg() in __pv_queued_spin_unlock(). + * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { /* @@ -287,24 +318,34 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + u8 locked; /* * We must not unlock if SLOW, because in that case we must first * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - if (likely(lockval == _Q_LOCKED_VAL)) + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) return; - if (unlikely(lockval != _Q_SLOW_VAL)) { - if (debug_locks_silent) - return; - WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val)); + if (unlikely(locked != _Q_SLOW_VAL)) { + WARN(!debug_locks_silent, + "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n", + (unsigned long)lock, atomic_read(&lock->val)); return; } /* + * A failed cmpxchg doesn't provide any memory-ordering guarantees, + * so we need a barrier to order the read of the node data in + * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. + * + * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + */ + smp_rmb(); + + /* * Since the above failed to release, this must be the SLOW path. * Therefore start by looking up the blocked node and unhashing it. */ @@ -319,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) /* * At this point the memory pointed at by lock can be freed/reused, * however we can still use the pv_node to kick the CPU. + * The other vCPU may not really be halted, but kicking an active + * vCPU is harmless other than the additional latency in completing + * the unlock. */ - if (READ_ONCE(node->state) == vcpu_halted) + if (READ_ONCE(node->state) == vcpu_hashed) pv_kick(node->cpu); } /* diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c deleted file mode 100644 index 1d96dd0d93c1..000000000000 --- a/kernel/locking/rtmutex-tester.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - */ -#include <linux/device.h> -#include <linux/kthread.h> -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/spinlock.h> -#include <linux/timer.h> -#include <linux/freezer.h> -#include <linux/stat.h> - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int event; - struct device dev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - /* 9, 10 - reserved for BKL commemoration */ - RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? -EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. - * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, dev); - tid = td->dev.id; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, dev); - tsk = threads[td->dev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->dev.id].owner); - - return curr - buf; -} - -static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); -static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); - -static struct bus_type rttest_subsys = { - .name = "rttest", - .dev_name = "rttest", -}; - -static int init_test_thread(int id) -{ - thread_data[id].dev.bus = &rttest_subsys; - thread_data[id].dev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return device_register(&thread_data[id].dev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = subsys_system_register(&rttest_subsys, NULL); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_status); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 5674b073473c..7781d801212f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, debug_rt_mutex_print_deadlock(waiter); - schedule_rt_mutex(lock); + schedule(); raw_spin_lock(&lock->wait_lock); set_current_state(state); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7844f8f0e639..4f5f83c7d2d3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -15,28 +15,6 @@ #include <linux/rtmutex.h> /* - * The rtmutex in kernel tester is independent of rtmutex debugging. We - * call schedule_rt_mutex_test() instead of schedule() for the tasks which - * belong to the tester. That way we can delay the wakeup path of those - * threads to provoke lock stealing and testing of complex boosting scenarios. - */ -#ifdef CONFIG_RT_MUTEX_TESTER - -extern void schedule_rt_mutex_test(struct rt_mutex *lock); - -#define schedule_rt_mutex(_lock) \ - do { \ - if (!(current->flags & PF_MUTEX_TESTER)) \ - schedule(); \ - else \ - schedule_rt_mutex_test(_lock); \ - } while (0) - -#else -# define schedule_rt_mutex(_lock) schedule() -#endif - -/* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8420c233ff7..3595403921bd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (static_key_enabled(&sched_feat_keys[i])) - static_key_slow_dec(&sched_feat_keys[i]); + static_key_disable(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!static_key_enabled(&sched_feat_keys[i])) - static_key_slow_inc(&sched_feat_keys[i]); + static_key_enable(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e02672d12..272d9322bc5d 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, + void *key) { - __wake_up_common(q, mode, 1, 0, key); + __wake_up_common(q, mode, nr, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); + __wake_up_locked_key(q, mode, 1, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 7c434c39f02a..a818cbc73e14 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data) if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); preempt_enable(); - if (ht->cleanup) + /* cleanup must mirror setup */ + if (ht->cleanup && td->status != HP_THREAD_NONE) ht->cleanup(td->cpu, cpu_online(td->cpu)); kfree(td); return 0; @@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) { unsigned int cpu; - /* Unpark any threads that were voluntarily parked. */ - for_each_cpu_not(cpu, ht->cpumask) { - if (cpu_online(cpu)) { - struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (tsk) - kthread_unpark(tsk); - } - } - /* We need to destroy also the parked threads of offline cpus */ for_each_possible_cpu(cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); @@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * to hotplug * @plug_thread: Hotplug thread descriptor + * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. */ -int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, + const struct cpumask *cpumask) { unsigned int cpu; int ret = 0; if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) return -ENOMEM; - cpumask_copy(plug_thread->cpumask, cpu_possible_mask); + cpumask_copy(plug_thread->cpumask, cpumask); get_online_cpus(); mutex_lock(&smpboot_threads_lock); @@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); + free_cpumask_var(plug_thread->cpumask); goto out; } - smpboot_unpark_thread(plug_thread, cpu); + if (cpumask_test_cpu(cpu, cpumask)) + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -311,7 +308,7 @@ out: put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ca7d84f438f1..03c3875d9958 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); cond_syscall(sys_memfd_create); +cond_syscall(sys_userfaultfd); /* performance counters: */ cond_syscall(sys_perf_event_open); diff --git a/kernel/task_work.c b/kernel/task_work.c index 8727032e3a6f..53fa971d000d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * This is like the signal handler which runs in kernel mode, but it doesn't * try to wake up the @task. * + * Note: there is no ordering guarantee on works queued here. + * * RETURNS: * 0 if succeeds or -ESRCH. */ @@ -108,16 +110,6 @@ void task_work_run(void) raw_spin_unlock_wait(&task->pi_lock); smp_mb(); - /* Reverse the list to run the works in fifo order */ - head = NULL; - do { - next = work->next; - work->next = head; - head = work; - work = next; - } while (work); - - work = head; do { next = work->next; work->func(work); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a041adee90..0fe96c7c8803 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = { /* * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) { char *fmt = (char *) (long) r1; + bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; + u64 unsafe_addr; + char buf[64]; int i; /* @@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) if (fmt[i] == 'l') { mod[fmt_cnt]++; i++; - } else if (fmt[i] == 'p') { + } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; i++; if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) return -EINVAL; fmt_cnt++; + if (fmt[i - 1] == 's') { + if (str_seen) + /* allow only one '%s' per fmt string */ + return -EINVAL; + str_seen = true; + + switch (fmt_cnt) { + case 1: + unsafe_addr = r3; + r3 = (long) buf; + break; + case 2: + unsafe_addr = r4; + r4 = (long) buf; + break; + case 3: + unsafe_addr = r5; + r5 = (long) buf; + break; + } + buf[0] = 0; + strncpy_from_unsafe(buf, + (void *) (long) unsafe_addr, + sizeof(buf)); + } continue; } @@ -158,6 +186,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (!event) + return -ENOENT; + + /* + * we don't know if the function is run successfully by the + * return value. It can be judged in other places, such as + * eBPF programs. + */ + return perf_event_read_local(event); +} + +const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -183,6 +240,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; default: return NULL; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b7d0cdd9906c..c9956440d0e6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory) static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, void *addr, void *dest) { - long ret; int maxlen = get_rloc_len(*(u32 *)dest); u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); + long ret; if (!maxlen) return; @@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, * Try to get string again, since the string can be changed while * probing. */ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); + ret = strncpy_from_unsafe(dst, addr, maxlen); if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; + dst[0] = '\0'; *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); } } NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f65a0a06a8c0..88fefa68c516 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; + cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a6ffa43f2993..64ed1c37bd1f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include <asm/irq_regs.h> #include <linux/kvm_para.h> #include <linux/perf_event.h> +#include <linux/kthread.h> /* * The run state of the lockup detectors is controlled by the content of the @@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) +/* + * The 'watchdog_running' variable is set to 1 when the watchdog threads + * are registered/started and is set to 0 when the watchdog threads are + * unregistered/stopped, so it is an indicator whether the threads exist. + */ static int __read_mostly watchdog_running; +/* + * If a subsystem has a need to deactivate the watchdog temporarily, it + * can use the suspend/resume interface to achieve this. The content of + * the 'watchdog_suspended' variable reflects this state. Existing threads + * are parked/unparked by the lockup_detector_{suspend|resume} functions + * (see comment blocks pertaining to those functions for further details). + * + * 'watchdog_suspended' also prevents threads from being registered/started + * or unregistered/stopped via parameters in /proc/sys/kernel, so the state + * of 'watchdog_running' cannot change while the watchdog is deactivated + * temporarily (see related code in 'proc' handlers). + */ +static int __read_mostly watchdog_suspended; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu) } } -void watchdog_nmi_enable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_enable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} - -void watchdog_nmi_disable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!watchdog_running) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_disable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -void watchdog_nmi_enable_all(void) {} -void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { @@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -static void restart_watchdog_hrtimer(void *info) +/* + * park all watchdog threads that are specified in 'watchdog_cpumask' + */ +static int watchdog_park_threads(void) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); - int ret; + int cpu, ret = 0; + get_online_cpus(); + for_each_watchdog_cpu(cpu) { + ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); + if (ret) + break; + } + if (ret) { + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + } + put_online_cpus(); + + return ret; +} + +/* + * unpark all watchdog threads that are specified in 'watchdog_cpumask' + */ +static void watchdog_unpark_threads(void) +{ + int cpu; + + get_online_cpus(); + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + put_online_cpus(); +} + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + mutex_lock(&watchdog_proc_mutex); /* - * No need to cancel and restart hrtimer if it is currently executing - * because it will reprogram itself with the new period now. - * We should never see it unqueued here because we are running per-cpu - * with interrupts disabled. + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). */ - ret = hrtimer_try_to_cancel(hrtimer); - if (ret == 1) - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + + mutex_unlock(&watchdog_proc_mutex); + + return ret; } -static void update_watchdog(int cpu) +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) { + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; /* - * Make sure that perf event counter will adopt to a new - * sampling period. Updating the sampling period directly would - * be much nicer but we do not have an API for that now so - * let's use a big hammer. - * Hrtimer will adopt the new period on the next tick but this - * might be late already so we have to restart the timer as well. + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. */ - watchdog_nmi_disable(cpu); - smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); - watchdog_nmi_enable(cpu); + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); } static void update_watchdog_all_cpus(void) { - int cpu; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - update_watchdog(cpu); - put_online_cpus(); + watchdog_park_threads(); + watchdog_unpark_threads(); } static int watchdog_enable_all_cpus(void) @@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void) int err = 0; if (!watchdog_running) { - err = smpboot_register_percpu_thread(&watchdog_threads); + err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_cpumask); if (err) pr_err("Failed to create watchdog threads, disabled\n"); - else { - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask)) - pr_err("Failed to set cpumask for watchdog threads\n"); + else watchdog_running = 1; - } } else { /* * Enable/disable the lockup detectors or @@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + /* * If the parameter is being read return the state of the corresponding * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the @@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, int err; mutex_lock(&watchdog_proc_mutex); + + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { /* Remove impossible cpus to keep sysctl output cleaner. */ @@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, pr_err("cpumask update failed\n"); } } +out: mutex_unlock(&watchdog_proc_mutex); return err; } @@ -932,10 +975,8 @@ void __init lockup_detector_init(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { - if (!cpumask_empty(tick_nohz_full_mask)) - pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, - tick_nohz_full_mask); + pr_info("Disabling watchdog on nohz_full cores by default\n"); + cpumask_copy(&watchdog_cpumask, housekeeping_mask); } else cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #else |