diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/btf.c | 2 | ||||
-rw-r--r-- | kernel/bpf/local_storage.c | 5 | ||||
-rw-r--r-- | kernel/bpf/sockmap.c | 155 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 12 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 25 | ||||
-rw-r--r-- | kernel/cpu.c | 11 | ||||
-rw-r--r-- | kernel/dma/Kconfig | 3 | ||||
-rw-r--r-- | kernel/dma/direct.c | 4 | ||||
-rw-r--r-- | kernel/events/core.c | 32 | ||||
-rw-r--r-- | kernel/events/hw_breakpoint.c | 13 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/jump_label.c | 2 | ||||
-rw-r--r-- | kernel/locking/lockdep.c | 1 | ||||
-rw-r--r-- | kernel/locking/mutex.c | 3 | ||||
-rw-r--r-- | kernel/locking/test-ww_mutex.c | 12 | ||||
-rw-r--r-- | kernel/pid.c | 2 | ||||
-rw-r--r-- | kernel/power/suspend.c | 6 | ||||
-rw-r--r-- | kernel/printk/printk.c | 12 | ||||
-rw-r--r-- | kernel/printk/printk_safe.c | 4 | ||||
-rw-r--r-- | kernel/sched/core.c | 2 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 2 | ||||
-rw-r--r-- | kernel/sched/debug.c | 6 | ||||
-rw-r--r-- | kernel/sched/fair.c | 130 | ||||
-rw-r--r-- | kernel/sched/sched.h | 3 | ||||
-rw-r--r-- | kernel/sched/topology.c | 5 | ||||
-rw-r--r-- | kernel/sys.c | 3 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 40 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 2 |
28 files changed, 345 insertions, 155 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2590700237c1..138f0302692e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->type_len; + end = cur + hdr->type_len; env->log_type_id = 1; while (cur < end) { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 22ad967d1e5f..830d7f095748 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -129,7 +129,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags & BPF_NOEXIST) + if (flags != BPF_ANY && flags != BPF_EXIST) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -195,6 +195,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); + if (attr->value_size == 0) + return ERR_PTR(-EINVAL); + if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf5195c7c331..0a0f2ec75370 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -132,6 +132,7 @@ struct smap_psock { struct work_struct gc_work; struct proto *sk_proto; + void (*save_unhash)(struct sock *sk); void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); @@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_unhash(struct sock *sk); static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -184,6 +186,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], struct proto *base) { prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; prot[SOCKMAP_BASE].close = bpf_tcp_close; prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; @@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk) return -EBUSY; } + psock->save_unhash = sk->sk_prot->unhash; psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; @@ -236,7 +240,7 @@ static int bpf_tcp_init(struct sock *sk) } static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); static void bpf_tcp_release(struct sock *sk) { @@ -248,7 +252,7 @@ static void bpf_tcp_release(struct sock *sk) goto out; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } @@ -305,39 +309,21 @@ static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, return e; } -static void bpf_tcp_close(struct sock *sk, long timeout) +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) { - void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; struct sock *osk; - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - /* The psock may be destroyed anytime after exiting the RCU critial - * section so by the time we use close_fun the psock may no longer - * be valid. However, bpf_tcp_close is called with the sock lock - * held so the close hook and sk are still valid. - */ - close_fun = psock->save_close; - if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -369,7 +355,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) /* If another thread deleted this object skip deletion. * The refcnt on psock may or may not be zero. */ - if (l) { + if (l && l == link) { hlist_del_rcu(&link->hash_node); smap_release_sock(psock, link->sk); free_htab_elem(htab, link); @@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(e); e = psock_map_pop(sk, psock); } +} + +static void bpf_tcp_unhash(struct sock *sk) +{ + void (*unhash_fun)(struct sock *sk); + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + unhash_fun = psock->save_unhash; + bpf_tcp_remove(sk, psock); + rcu_read_unlock(); + unhash_fun(sk); +} + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + close_fun = psock->save_close; + bpf_tcp_remove(sk, psock); rcu_read_unlock(); release_sock(sk); close_fun(sk, timeout); @@ -570,14 +592,16 @@ static void free_bytes_sg(struct sock *sk, int bytes, md->sg_start = i; } -static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +static int free_sg(struct sock *sk, int start, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = start, free = 0; while (sg[i].length) { free += sg[i].length; - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); if (!md->skb) put_page(sg_page(&sg[i])); sg[i].length = 0; @@ -594,9 +618,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) return free; } -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) { - int free = free_sg(sk, md->sg_start, md); + int free = free_sg(sk, md->sg_start, md, charge); md->sg_start = md->sg_end; return free; @@ -604,7 +628,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) { - return free_sg(sk, md->sg_curr, md); + return free_sg(sk, md->sg_curr, md, true); } static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) @@ -718,7 +742,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, list_add_tail(&r->list, &psock->ingress); sk->sk_data_ready(sk); } else { - free_start_sg(sk, r); + free_start_sg(sk, r, true); kfree(r); } @@ -752,14 +776,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, release_sock(sk); } smap_release_sock(psock, sk); - if (unlikely(err)) - goto out; - return 0; + return err; out_rcu: rcu_read_unlock(); -out: - free_bytes_sg(NULL, send, md, false); - return err; + return 0; } static inline void bpf_md_init(struct smap_psock *psock) @@ -822,7 +842,7 @@ more_data: case __SK_PASS: err = bpf_tcp_push(sk, send, m, flags, true); if (unlikely(err)) { - *copied -= free_start_sg(sk, m); + *copied -= free_start_sg(sk, m, true); break; } @@ -845,16 +865,17 @@ more_data: lock_sock(sk); if (unlikely(err < 0)) { - free_start_sg(sk, m); + int free = free_start_sg(sk, m, false); + psock->sg_size = 0; if (!cork) - *copied -= send; + *copied -= free; } else { psock->sg_size -= send; } if (cork) { - free_start_sg(sk, m); + free_start_sg(sk, m, true); psock->sg_size = 0; kfree(m); m = NULL; @@ -912,6 +933,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); rcu_read_lock(); psock = smap_psock_sk(sk); @@ -922,9 +945,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; rcu_read_unlock(); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); bytes_ready: while (copied != len) { @@ -1122,7 +1142,7 @@ wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); if (err) { if (m && m != psock->cork) - free_start_sg(sk, m); + free_start_sg(sk, m, true); goto out_err; } } @@ -1464,10 +1484,16 @@ static void smap_destroy_psock(struct rcu_head *rcu) schedule_work(&psock->gc_work); } +static bool psock_is_smap_sk(struct sock *sk) +{ + return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; +} + static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { - tcp_cleanup_ulp(sock); + if (psock_is_smap_sk(sock)) + tcp_cleanup_ulp(sock); write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); write_unlock_bh(&sock->sk_callback_lock); @@ -1581,13 +1607,13 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_tx_msg); if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -1894,6 +1920,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, * doesn't update user data. */ if (psock) { + if (!psock_is_smap_sk(sock)) { + err = -EBUSY; + goto out_progs; + } if (READ_ONCE(psock->bpf_parse) && parse) { err = -EBUSY; goto out_progs; @@ -2089,8 +2119,12 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP) { + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { fput(socket->file); return -EOPNOTSUPP; } @@ -2445,6 +2479,16 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { + fput(socket->file); + return -EOPNOTSUPP; + } + lock_sock(skops.sk); preempt_disable(); rcu_read_lock(); @@ -2535,10 +2579,22 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, }; +static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. This checks that the sock ops triggering the update is + * one indicating we are (or will be soon) in an ESTABLISHED state. + */ + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } @@ -2557,6 +2613,9 @@ BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92246117d2b0..465952a8e465 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2896,6 +2896,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + if (insn_bitness == 32) { + /* Relevant for 32-bit RSH: Information can propagate towards + * LSB, so it isn't sufficient to only truncate the output to + * 32 bits. + */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -3131,7 +3140,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->32 */ coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); } __reg_deduce_bounds(dst_reg); @@ -3163,7 +3171,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. Disallow all math except * pointer subtraction */ - if (opcode == BPF_SUB){ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { mark_reg_unknown(env, regs, insn->dst_reg); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index aae10baf1902..4a3dae2a8283 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2836,11 +2836,12 @@ restart: } /** - * cgroup_save_control - save control masks of a subtree + * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * - * Save ->subtree_control and ->subtree_ss_mask to the respective old_ - * prefixed fields for @cgrp's subtree including @cgrp itself. + * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the + * respective old_ prefixed fields for @cgrp's subtree including @cgrp + * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { @@ -2850,6 +2851,7 @@ static void cgroup_save_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; + dsct->old_dom_cgrp = dsct->dom_cgrp; } } @@ -2875,11 +2877,12 @@ static void cgroup_propagate_control(struct cgroup *cgrp) } /** - * cgroup_restore_control - restore control masks of a subtree + * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * - * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ - * prefixed fields for @cgrp's subtree including @cgrp itself. + * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the + * respective old_ prefixed fields for @cgrp's subtree including @cgrp + * itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { @@ -2889,6 +2892,7 @@ static void cgroup_restore_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; + dsct->dom_cgrp = dsct->old_dom_cgrp; } } @@ -3196,6 +3200,8 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); @@ -3225,12 +3231,13 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) */ cgroup_save_control(cgrp); - cgrp->dom_cgrp = dom_cgrp; + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) + if (dsct == cgrp || cgroup_is_threaded(dsct)) + dsct->dom_cgrp = dom_cgrp; + ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; - else - cgrp->dom_cgrp = cgrp; cgroup_finalize_control(cgrp, ret); return ret; diff --git a/kernel/cpu.c b/kernel/cpu.c index aa7fe85ad62e..0097acec1c71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -607,15 +607,15 @@ static void cpuhp_thread_fun(unsigned int cpu) bool bringup = st->bringup; enum cpuhp_state state; + if (WARN_ON_ONCE(!st->should_run)) + return; + /* * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (WARN_ON_ONCE(!st->should_run)) - return; - cpuhp_lock_acquire(bringup); if (st->single) { @@ -916,7 +916,8 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; - undo_cpu_down(cpu, st); + if (st->state < prev_state) + undo_cpu_down(cpu, st); break; } } @@ -969,7 +970,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, target); - if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { + if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) { cpuhp_reset_state(st, prev_state); __cpuhp_kick_ap(st); } diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 9bd54304446f..1b1d63b3634b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -23,6 +23,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU bool select NEED_DMA_MAP_STATE +config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL + bool + config DMA_DIRECT_OPS bool depends on HAS_DMA diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1c35b7b945d0..de87b0282e74 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -168,7 +168,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, int dma_direct_supported(struct device *dev, u64 mask) { #ifdef CONFIG_ZONE_DMA - if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + if (mask < phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))) return 0; #else /* @@ -177,7 +177,7 @@ int dma_direct_supported(struct device *dev, u64 mask) * memory, or by providing a ZONE_DMA32. If neither is the case, the * architecture needs to use an IOMMU instead of the direct mapping. */ - if (mask < DMA_BIT_MASK(32)) + if (mask < phys_to_dma(dev, DMA_BIT_MASK(32))) return 0; #endif /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 2a62b96600ad..5a97f34bc14c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2867,16 +2867,11 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); - if (err) { - if (!bp->attr.disabled) - _perf_event_enable(bp); - return err; - } - - if (!attr->disabled) + if (!bp->attr.disabled) _perf_event_enable(bp); - return 0; + + return err; } static int perf_event_modify_attr(struct perf_event *event, @@ -3940,6 +3935,12 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } + /* If this is a pinned event it must be running on this CPU */ + if (event->attr.pinned && event->oncpu != smp_processor_id()) { + ret = -EBUSY; + goto out; + } + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise @@ -5948,6 +5949,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; + mm_segment_t fs; /* * We dump: @@ -5965,7 +5967,10 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); + fs = get_fs(); + set_fs(USER_DS); rem = __output_copy_user(handle, (void *) sp, dump_size); + set_fs(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); @@ -8309,6 +8314,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->cpu != smp_processor_id()) + continue; if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) @@ -9426,9 +9433,7 @@ static void free_pmu_context(struct pmu *pmu) if (pmu->task_ctx_nr > perf_invalid_context) return; - mutex_lock(&pmus_lock); free_percpu(pmu->pmu_cpu_context); - mutex_unlock(&pmus_lock); } /* @@ -9684,12 +9689,8 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - int remove_device; - mutex_lock(&pmus_lock); - remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -9701,13 +9702,14 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (remove_device) { + if (pmu_bus_running) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); + mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b3814fce5ecb..d6b56180827c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -509,6 +509,8 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { + int err; + /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. @@ -520,15 +522,12 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att else perf_event_disable(bp); - if (!attr->disabled) { - int err = modify_user_hw_breakpoint_check(bp, attr, false); + err = modify_user_hw_breakpoint_check(bp, attr, false); - if (err) - return err; + if (!bp->attr.disabled) perf_event_enable(bp); - bp->attr.disabled = 0; - } - return 0; + + return err; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); diff --git a/kernel/fork.c b/kernel/fork.c index d896e9ca38b0..f0b58479534f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -550,8 +550,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; } /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; + retval = arch_dup_mmap(oldmm, mm); out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01ebdf1f9f40..2e62503bea0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -678,7 +678,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { - WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); + WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e406c5fdb41e..dd13f865ad40 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,7 +55,6 @@ #include "lockdep_internals.h" -#include <trace/events/preemptirq.h> #define CREATE_TRACE_POINTS #include <trace/events/lock.h> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 1a81a1257b3f..3f8a35104285 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -389,7 +389,7 @@ static bool __ww_mutex_wound(struct mutex *lock, /* * wake_up_process() paired with set_current_state() * inserts sufficient barriers to make sure @owner either sees - * it's wounded in __ww_mutex_lock_check_stamp() or has a + * it's wounded in __ww_mutex_check_kill() or has a * wakeup pending to re-read the wounded state. */ if (owner != current) @@ -946,7 +946,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, current); lock_contended(&lock->dep_map, ip); diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 5b915b370d5a..65a3b7e55b9f 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -260,7 +260,7 @@ static void test_cycle_work(struct work_struct *work) { struct test_cycle *cycle = container_of(work, typeof(*cycle), work); struct ww_acquire_ctx ctx; - int err; + int err, erra = 0; ww_acquire_init(&ctx, &ww_class); ww_mutex_lock(&cycle->a_mutex, &ctx); @@ -270,17 +270,19 @@ static void test_cycle_work(struct work_struct *work) err = ww_mutex_lock(cycle->b_mutex, &ctx); if (err == -EDEADLK) { + err = 0; ww_mutex_unlock(&cycle->a_mutex); ww_mutex_lock_slow(cycle->b_mutex, &ctx); - err = ww_mutex_lock(&cycle->a_mutex, &ctx); + erra = ww_mutex_lock(&cycle->a_mutex, &ctx); } if (!err) ww_mutex_unlock(cycle->b_mutex); - ww_mutex_unlock(&cycle->a_mutex); + if (!erra) + ww_mutex_unlock(&cycle->a_mutex); ww_acquire_fini(&ctx); - cycle->result = err; + cycle->result = err ?: erra; } static int __test_cycle(unsigned int nthreads) @@ -324,7 +326,7 @@ static int __test_cycle(unsigned int nthreads) if (!cycle->result) continue; - pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n", + pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n", n, nthreads, cycle->result); ret = -EINVAL; break; diff --git a/kernel/pid.c b/kernel/pid.c index de1cfc4f75a2..cdf63e53a014 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -195,7 +195,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) idr_preload_end(); if (nr < 0) { - retval = nr; + retval = (nr == -ENOSPC) ? -EAGAIN : nr; goto out_free; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5342f6fc022e..0bd595a0b610 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -63,6 +63,12 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); +bool pm_suspend_via_s2idle(void) +{ + return mem_sleep_current == PM_SUSPEND_TO_IDLE; +} +EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle); + void s2idle_set_ops(const struct platform_s2idle_ops *ops) { lock_system_sleep(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fd6f8ed28e01..9bf5404397e0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -351,7 +351,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; */ enum log_flags { - LOG_NOCONS = 1, /* suppress print, do not print to console */ LOG_NEWLINE = 2, /* text ended with a newline */ LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ @@ -1881,9 +1880,6 @@ int vprintk_store(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - if (suppress_message_printing(level)) - lflags |= LOG_NOCONS; - return log_output(facility, level, lflags, dict, dictlen, text, text_len); } @@ -2032,6 +2028,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } +static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2368,10 +2365,11 @@ skip: break; msg = log_from_idx(console_idx); - if (msg->flags & LOG_NOCONS) { + if (suppress_message_printing(msg->level)) { /* - * Skip record if !ignore_loglevel, and - * record has level above the console loglevel. + * Skip record we have buffered and already printed + * directly to the console when we received it, and + * record that has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index a0a74c533e4b..0913b4d385de 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -306,12 +306,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) return printk_safe_log_store(s, fmt, args); } -void printk_nmi_enter(void) +void notrace printk_nmi_enter(void) { this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); } -void printk_nmi_exit(void) +void notrace printk_nmi_exit(void) { this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 625bc9897f62..ad97f3ba5ec5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1167,7 +1167,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p); + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 997ea7b839fa..91e4202b0634 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1607,7 +1607,7 @@ out: return cpu; } -static void migrate_task_rq_dl(struct task_struct *p) +static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { struct rq *rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 60caf1fb94e0..6383aa6a60ca 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -89,12 +89,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - static_key_disable(&sched_feat_keys[i]); + static_key_disable_cpuslocked(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - static_key_enable(&sched_feat_keys[i]); + static_key_enable_cpuslocked(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -146,9 +146,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); + cpus_read_lock(); inode_lock(inode); ret = sched_feat_set(cmp); inode_unlock(inode); + cpus_read_unlock(); if (ret < 0) return ret; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b39fb596f6c1..7fc4a371bdd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1392,6 +1392,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int last_cpupid, this_cpupid; this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + + /* + * Allow first faults or private faults to migrate immediately early in + * the lifetime of a task. The magic number 4 is based on waiting for + * two full passes of the "multi-stage node selection" test that is + * executed below. + */ + if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) + return true; /* * Multi-stage node selection is used in conjunction with a periodic @@ -1410,7 +1421,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation. */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != dst_nid) return false; @@ -1514,6 +1524,21 @@ struct task_numa_env { static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { + struct rq *rq = cpu_rq(env->dst_cpu); + + /* Bail out if run-queue part of active NUMA balance. */ + if (xchg(&rq->numa_migrate_on, 1)) + return; + + /* + * Clear previous best_cpu/rq numa-migrate flag, since task now + * found a better CPU to move/swap. + */ + if (env->best_cpu != -1) { + rq = cpu_rq(env->best_cpu); + WRITE_ONCE(rq->numa_migrate_on, 0); + } + if (env->best_task) put_task_struct(env->best_task); if (p) @@ -1553,6 +1578,13 @@ static bool load_too_imbalanced(long src_load, long dst_load, } /* + * Maximum NUMA importance can be 1998 (2*999); + * SMALLIMP @ 30 would be close to 1998/64. + * Used to deter task migration. + */ +#define SMALLIMP 30 + +/* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking * into account that it might be best if task running on the dst_cpu should @@ -1569,6 +1601,9 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; int dist = env->dist; + if (READ_ONCE(dst_rq->numa_migrate_on)) + return; + rcu_read_lock(); cur = task_rcu_dereference(&dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) @@ -1582,7 +1617,7 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; if (!cur) { - if (maymove || imp > env->best_imp) + if (maymove && moveimp >= env->best_imp) goto assign; else goto unlock; @@ -1625,16 +1660,22 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp) - goto unlock; - if (maymove && moveimp > imp && moveimp > env->best_imp) { - imp = moveimp - 1; + imp = moveimp; cur = NULL; goto assign; } /* + * If the NUMA importance is less than SMALLIMP, + * task migration might only result in ping pong + * of tasks and also hurt performance due to cache + * misses. + */ + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) + goto unlock; + + /* * In the overloaded case, try and keep the load balanced. */ load = task_h_load(env->p) - task_h_load(cur); @@ -1710,6 +1751,7 @@ static int task_numa_migrate(struct task_struct *p) .best_cpu = -1, }; struct sched_domain *sd; + struct rq *best_rq; unsigned long taskweight, groupweight; int nid, ret, dist; long taskimp, groupimp; @@ -1805,20 +1847,17 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - /* - * Reset the scan period if the task is being rescheduled on an - * alternative node to recheck if the tasks is now properly placed. - */ - p->numa_scan_period = task_scan_start(p); - + best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); @@ -2596,6 +2635,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } +static void update_scan_period(struct task_struct *p, int new_cpu) +{ + int src_nid = cpu_to_node(task_cpu(p)); + int dst_nid = cpu_to_node(new_cpu); + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) + return; + + if (src_nid == dst_nid) + return; + + /* + * Allow resets if faults have been trapped before one scan + * has completed. This is most likely due to a new task that + * is pulled cross-node due to wakeups or load balancing. + */ + if (p->numa_scan_seq) { + /* + * Avoid scan adjustments if moving to the preferred + * node or if the task was not previously running on + * the preferred node. + */ + if (dst_nid == p->numa_preferred_nid || + (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + return; + } + + p->numa_scan_period = task_scan_start(p); +} + #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2609,6 +2681,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void update_scan_period(struct task_struct *p, int new_cpu) +{ +} + #endif /* CONFIG_NUMA_BALANCING */ static void @@ -3362,6 +3438,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach + * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. @@ -6274,7 +6351,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ -static void migrate_task_rq_fair(struct task_struct *p) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { /* * As blocked tasks retain absolute vruntime the migration needs to @@ -6327,6 +6404,8 @@ static void migrate_task_rq_fair(struct task_struct *p) /* We have migrated, no longer consider this task hot */ p->se.exec_start = 0; + + update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) @@ -7263,6 +7342,7 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; + const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7299,8 +7379,10 @@ static void update_blocked_averages(int cpu) if (cfs_rq_has_blocked(cfs_rq)) done = false; } - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ if (others_have_blocked(rq)) @@ -7365,13 +7447,16 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; + const struct sched_class *curr_class; struct rq_flags rf; rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -7482,10 +7567,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long scale_rt_capacity(int cpu) +static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long max = arch_scale_cpu_capacity(sd, cpu); unsigned long used, free; unsigned long irq; @@ -7507,7 +7592,7 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(cpu); + unsigned long capacity = scale_rt_capacity(sd, cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); @@ -8269,7 +8354,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); - return sds.busiest; + return env->imbalance ? sds.busiest : NULL; out_balanced: env->imbalance = 0; @@ -9638,7 +9723,8 @@ static inline bool vruntime_normalized(struct task_struct *p) * - A task which has been woken up by try_to_wake_up() and * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!se->sum_exec_runtime || p->state == TASK_WAKING) + if (!se->sum_exec_runtime || + (p->state == TASK_WAKING && p->sched_remote_wakeup)) return true; return false; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a2e8cae63c4..455fa330de04 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -783,6 +783,7 @@ struct rq { #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; + unsigned int numa_migrate_on; #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; @@ -1523,7 +1524,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 56a0fed30c0a..505a41c42b96 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1295,7 +1295,7 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (sched_domains_numa_levels <= 1) { + if (sched_domains_numa_levels <= 2) { sched_numa_topology_type = NUMA_DIRECT; return; } @@ -1380,9 +1380,6 @@ void sched_init_numa(void) break; } - if (!level) - return; - /* * 'level' contains the number of unique distances * diff --git a/kernel/sys.c b/kernel/sys.c index cf5c67533ff1..123bd73046ec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -71,9 +71,6 @@ #include <asm/io.h> #include <asm/unistd.h> -/* Hardening for Spectre-v1 */ -#include <linux/nospec.h> - #include "uid16.h" #ifndef SET_UNALIGN_CTL diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f74fb00d8064..0e6e97a01942 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags) spin_unlock_irqrestore(&watchdog_lock, *flags); } +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + /* * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* + * We cannot directly run clocksource_watchdog_kthread() here, because + * clocksource_select() calls timekeeping_notify() which uses + * stop_machine(). One cannot use stop_machine() from a workqueue() due + * lock inversions wrt CPU hotplug. + * + * Also, we only ever run this work once or twice during the lifetime + * of the kernel, so there is no point in creating a more permanent + * kthread for this. + * + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; /* - * If the clocksource is registered clocksource_watchdog_work() will + * If the clocksource is registered clocksource_watchdog_kthread() will * re-rate and re-select. */ if (list_empty(&cs->list)) { @@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs) if (cs->mark_unstable) cs->mark_unstable(cs); - /* kick clocksource_watchdog_work() */ + /* kick clocksource_watchdog_kthread() */ if (finished_booting) schedule_work(&watchdog_work); } @@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs) * @cs: clocksource to be marked unstable * * This function is called by the x86 TSC code to mark clocksources as unstable; - * it defers demotion and re-selection to a work. + * it defers demotion and re-selection to a kthread. */ void clocksource_mark_unstable(struct clocksource *cs) { @@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) } } -static void __clocksource_change_rating(struct clocksource *cs, int rating); - -static int __clocksource_watchdog_work(void) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; @@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void) return select; } -static void clocksource_watchdog_work(struct work_struct *work) +static int clocksource_watchdog_kthread(void *data) { mutex_lock(&clocksource_mutex); - if (__clocksource_watchdog_work()) + if (__clocksource_watchdog_kthread()) clocksource_select(); mutex_unlock(&clocksource_mutex); + return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) @@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int __clocksource_watchdog_work(void) { return 0; } +static inline int __clocksource_watchdog_kthread(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } void clocksource_mark_unstable(struct clocksource *cs) { } @@ -810,7 +830,7 @@ static int __init clocksource_done_booting(void) /* * Run the watchdog first to eliminate unstable clock sources */ - __clocksource_watchdog_work(); + __clocksource_watchdog_kthread(); clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d92d4a982fd..65bd4616220d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1546,6 +1546,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) tmp_iter_page = first_page; do { + cond_resched(); + to_remove_page = tmp_iter_page; rb_inc_page(cpu_buffer, &tmp_iter_page); |