Diffstat (limited to 'kernel')
39 files changed, 469 insertions, 264 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b1f66480135b..14750e7c5ee4 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -26,8 +26,10 @@ static void bpf_array_free_percpu(struct bpf_array *array)  {  	int i; -	for (i = 0; i < array->map.max_entries; i++) +	for (i = 0; i < array->map.max_entries; i++) {  		free_percpu(array->pptrs[i]); +		cond_resched(); +	}  }  static int bpf_array_alloc_percpu(struct bpf_array *array) @@ -43,6 +45,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)  			return -ENOMEM;  		}  		array->pptrs[i] = ptr; +		cond_resched();  	}  	return 0; @@ -73,11 +76,11 @@ static int array_map_alloc_check(union bpf_attr *attr)  static struct bpf_map *array_map_alloc(union bpf_attr *attr)  {  	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; -	int numa_node = bpf_map_attr_numa_node(attr); +	int ret, numa_node = bpf_map_attr_numa_node(attr);  	u32 elem_size, index_mask, max_entries;  	bool unpriv = !capable(CAP_SYS_ADMIN); +	u64 cost, array_size, mask64;  	struct bpf_array *array; -	u64 array_size, mask64;  	elem_size = round_up(attr->value_size, 8); @@ -109,8 +112,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)  		array_size += (u64) max_entries * elem_size;  	/* make sure there is no u32 overflow later in round_up() */ -	if (array_size >= U32_MAX - PAGE_SIZE) +	cost = array_size; +	if (cost >= U32_MAX - PAGE_SIZE)  		return ERR_PTR(-ENOMEM); +	if (percpu) { +		cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); +		if (cost >= U32_MAX - PAGE_SIZE) +			return ERR_PTR(-ENOMEM); +	} +	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + +	ret = bpf_map_precharge_memlock(cost); +	if (ret < 0) +		return ERR_PTR(ret);  	/* allocate all map elements and zero-initialize them */  	array = bpf_map_area_alloc(array_size, numa_node); @@ -121,20 +135,13 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)  	/* copy mandatory map attributes */  	bpf_map_init_from_attr(&array->map, attr); +	array->map.pages = cost;  	array->elem_size = elem_size; -	if (!percpu) -		goto out; - -	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); - -	if (array_size >= U32_MAX - PAGE_SIZE || -	    bpf_array_alloc_percpu(array)) { +	if (percpu && bpf_array_alloc_percpu(array)) {  		bpf_map_area_free(array);  		return ERR_PTR(-ENOMEM);  	} -out: -	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;  	return &array->map;  } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29ca9208dcfa..d315b393abdd 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1590,7 +1590,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,  	 * so always copy 'cnt' prog_ids to the user.  	 
* In a rare race the user will see zero prog_ids  	 */ -	ids = kcalloc(cnt, sizeof(u32), GFP_USER); +	ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);  	if (!ids)  		return -ENOMEM;  	rcu_read_lock(); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index fbfdada6caee..a4bb0b34375a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -334,7 +334,7 @@ static int cpu_map_kthread_run(void *data)  static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,  						       int map_id)  { -	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; +	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;  	struct bpf_cpu_map_entry *rcpu;  	int numa, err; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 7b469d10d0e9..b4b5b81e7251 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -555,7 +555,10 @@ static void trie_free(struct bpf_map *map)  	struct lpm_trie_node __rcu **slot;  	struct lpm_trie_node *node; -	raw_spin_lock(&trie->lock); +	/* Wait for outstanding programs to complete +	 * update/lookup/delete/get_next_key and free the trie. +	 */ +	synchronize_rcu();  	/* Always start at the root and walk down to a node that has no  	 * children. Then free that node, nullify its reference in the parent @@ -566,10 +569,9 @@ static void trie_free(struct bpf_map *map)  		slot = &trie->root;  		for (;;) { -			node = rcu_dereference_protected(*slot, -					lockdep_is_held(&trie->lock)); +			node = rcu_dereference_protected(*slot, 1);  			if (!node) -				goto unlock; +				goto out;  			if (rcu_access_pointer(node->child[0])) {  				slot = &node->child[0]; @@ -587,8 +589,8 @@ static void trie_free(struct bpf_map *map)  		}  	} -unlock: -	raw_spin_unlock(&trie->lock); +out: +	kfree(trie);  }  static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 48c33417d13c..a927e89dad6e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -521,8 +521,8 @@ static struct smap_psock *smap_init_psock(struct sock *sock,  static struct bpf_map *sock_map_alloc(union bpf_attr *attr)  {  	struct bpf_stab *stab; -	int err = -EINVAL;  	u64 cost; +	int err;  	if (!capable(CAP_NET_ADMIN))  		return ERR_PTR(-EPERM); @@ -547,6 +547,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)  	/* make sure page count doesn't overflow */  	cost = (u64) stab->map.max_entries * sizeof(struct sock *); +	err = -EINVAL;  	if (cost >= U32_MAX - PAGE_SIZE)  		goto free_stab; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..43f95d190eea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1845,7 +1845,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz  	union bpf_attr attr = {};  	int err; -	if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) +	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))  		return -EPERM;  	err = check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5fb69a85d967..c6eff108aa99 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1356,6 +1356,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)  	return reg->type == PTR_TO_CTX;  } +static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) +{ +	const struct bpf_reg_state *reg = cur_regs(env) + regno; + +	return type_is_pkt_pointer(reg->type); +} +  static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,  				   const struct bpf_reg_state 
*reg,  				   int off, int size, bool strict) @@ -1416,10 +1423,10 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env,  }  static int check_ptr_alignment(struct bpf_verifier_env *env, -			       const struct bpf_reg_state *reg, -			       int off, int size) +			       const struct bpf_reg_state *reg, int off, +			       int size, bool strict_alignment_once)  { -	bool strict = env->strict_alignment; +	bool strict = env->strict_alignment || strict_alignment_once;  	const char *pointer_desc = "";  	switch (reg->type) { @@ -1576,9 +1583,9 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)   * if t==write && value_regno==-1, some unknown value is stored into memory   * if t==read && value_regno==-1, don't care what we read from memory   */ -static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, -			    int bpf_size, enum bpf_access_type t, -			    int value_regno) +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +			    int off, int bpf_size, enum bpf_access_type t, +			    int value_regno, bool strict_alignment_once)  {  	struct bpf_reg_state *regs = cur_regs(env);  	struct bpf_reg_state *reg = regs + regno; @@ -1590,7 +1597,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  		return size;  	/* alignment checks will add in reg->off themselves */ -	err = check_ptr_alignment(env, reg, off, size); +	err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);  	if (err)  		return err; @@ -1735,21 +1742,23 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins  		return -EACCES;  	} -	if (is_ctx_reg(env, insn->dst_reg)) { -		verbose(env, "BPF_XADD stores into R%d context is not allowed\n", -			insn->dst_reg); +	if (is_ctx_reg(env, insn->dst_reg) || +	    is_pkt_reg(env, insn->dst_reg)) { +		verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", +			insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? +			"context" : "packet");  		return -EACCES;  	}  	/* check whether atomic_add can read the memory */  	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, -			       BPF_SIZE(insn->code), BPF_READ, -1); +			       BPF_SIZE(insn->code), BPF_READ, -1, true);  	if (err)  		return err;  	/* check whether atomic_add can write into the same memory */  	return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, -				BPF_SIZE(insn->code), BPF_WRITE, -1); +				BPF_SIZE(insn->code), BPF_WRITE, -1, true);  }  /* when register 'regno' is passed into function that will read 'access_size' @@ -2388,7 +2397,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  	 * is inferred from register state.  	 
*/  	for (i = 0; i < meta.access_size; i++) { -		err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); +		err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, +				       BPF_WRITE, -1, false);  		if (err)  			return err;  	} @@ -4632,7 +4642,7 @@ static int do_check(struct bpf_verifier_env *env)  			 */  			err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,  					       BPF_SIZE(insn->code), BPF_READ, -					       insn->dst_reg); +					       insn->dst_reg, false);  			if (err)  				return err; @@ -4684,7 +4694,7 @@ static int do_check(struct bpf_verifier_env *env)  			/* check that memory (dst_reg + off) is writeable */  			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,  					       BPF_SIZE(insn->code), BPF_WRITE, -					       insn->src_reg); +					       insn->src_reg, false);  			if (err)  				return err; @@ -4719,7 +4729,7 @@ static int do_check(struct bpf_verifier_env *env)  			/* check that memory (dst_reg + off) is writeable */  			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,  					       BPF_SIZE(insn->code), BPF_WRITE, -					       -1); +					       -1, false);  			if (err)  				return err; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8cda3bc3ae22..4bfb2908ec15 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3183,6 +3183,16 @@ static int cgroup_enable_threaded(struct cgroup *cgrp)  	if (cgroup_is_threaded(cgrp))  		return 0; +	/* +	 * If @cgroup is populated or has domain controllers enabled, it +	 * can't be switched.  While the below cgroup_can_be_thread_root() +	 * test can catch the same conditions, that's only when @parent is +	 * not mixable, so let's check it explicitly. +	 */ +	if (cgroup_is_populated(cgrp) || +	    cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) +		return -EOPNOTSUPP; +  	/* we're joining the parent's domain, ensure its validity */  	if (!cgroup_is_valid_domain(dom_cgrp) ||  	    !cgroup_can_be_thread_root(dom_cgrp)) diff --git a/kernel/compat.c b/kernel/compat.c index 3247fe761f60..3f5fa8902e7d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -488,25 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)  }  EXPORT_SYMBOL_GPL(get_compat_sigset); -int -put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, -		  unsigned int size) -{ -	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ -#ifdef __BIG_ENDIAN -	compat_sigset_t v; -	switch (_NSIG_WORDS) { -	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; -	case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; -	case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; -	case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; -	} -	return copy_to_user(compat, &v, size) ? -EFAULT : 0; -#else -	return copy_to_user(compat, set, size) ? 
-EFAULT : 0; -#endif -} -  #ifdef CONFIG_NUMA  COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,  		       compat_uptr_t __user *, pages32, diff --git a/kernel/events/core.c b/kernel/events/core.c index 96db9ae5d5af..709a55b9ad97 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -724,9 +724,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp)  static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)  { -	struct perf_cgroup *cgrp_out = cpuctx->cgrp; -	if (cgrp_out) -		__update_cgrp_time(cgrp_out); +	struct perf_cgroup *cgrp = cpuctx->cgrp; +	struct cgroup_subsys_state *css; + +	if (cgrp) { +		for (css = &cgrp->css; css; css = css->parent) { +			cgrp = container_of(css, struct perf_cgroup, css); +			__update_cgrp_time(cgrp); +		} +	}  }  static inline void update_cgrp_time_from_event(struct perf_event *event) @@ -754,6 +760,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,  {  	struct perf_cgroup *cgrp;  	struct perf_cgroup_info *info; +	struct cgroup_subsys_state *css;  	/*  	 * ctx->lock held by caller @@ -764,8 +771,12 @@ perf_cgroup_set_timestamp(struct task_struct *task,  		return;  	cgrp = perf_cgroup_from_task(task, ctx); -	info = this_cpu_ptr(cgrp->info); -	info->timestamp = ctx->timestamp; + +	for (css = &cgrp->css; css; css = css->parent) { +		cgrp = container_of(css, struct perf_cgroup, css); +		info = this_cpu_ptr(cgrp->info); +		info->timestamp = ctx->timestamp; +	}  }  static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); @@ -2246,7 +2257,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,  			struct perf_event_context *task_ctx,  			enum event_type_t event_type)  { -	enum event_type_t ctx_event_type = event_type & EVENT_ALL; +	enum event_type_t ctx_event_type;  	bool cpu_event = !!(event_type & EVENT_CPU);  	/* @@ -2256,6 +2267,8 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,  	if (event_type & EVENT_PINNED)  		event_type |= EVENT_FLEXIBLE; +	ctx_event_type = event_type & EVENT_ALL; +  	perf_pmu_disable(cpuctx->ctx.pmu);  	if (task_ctx)  		task_ctx_sched_out(cpuctx, task_ctx, event_type); diff --git a/kernel/extable.c b/kernel/extable.c index a17fdb63dc3e..6a5b61ebc66c 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -64,7 +64,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)  	return e;  } -static inline int init_kernel_text(unsigned long addr) +int init_kernel_text(unsigned long addr)  {  	if (addr >= (unsigned long)_sinittext &&  	    addr < (unsigned long)_einittext) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 21b0122cb39c..1d5632d8bbcc 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -14,6 +14,15 @@  static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); +static void fei_post_handler(struct kprobe *kp, struct pt_regs *regs, +			     unsigned long flags) +{ +	/* +	 * A dummy post handler is required to prohibit optimizing, because +	 * jump optimization does not support execution path overriding. 
+	 */ +} +  struct fei_attr {  	struct list_head list;  	struct kprobe kp; @@ -56,6 +65,7 @@ static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)  			return NULL;  		}  		attr->kp.pre_handler = fei_kprobe_handler; +		attr->kp.post_handler = fei_post_handler;  		attr->retval = adjust_error_retval(addr, 0);  		INIT_LIST_HEAD(&attr->list);  	} diff --git a/kernel/fork.c b/kernel/fork.c index be8aa5b98666..e5d9d405ae4e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -592,7 +592,7 @@ static void check_mm(struct mm_struct *mm)   * is dropped: either by a lazy thread or by   * mmput. Free the page directory and the mm.   */ -static void __mmdrop(struct mm_struct *mm) +void __mmdrop(struct mm_struct *mm)  {  	BUG_ON(mm == &init_mm);  	mm_free_pgd(mm); @@ -603,18 +603,7 @@ static void __mmdrop(struct mm_struct *mm)  	put_user_ns(mm->user_ns);  	free_mm(mm);  } - -void mmdrop(struct mm_struct *mm) -{ -	/* -	 * The implicit full barrier implied by atomic_dec_and_test() is -	 * required by the membarrier system call before returning to -	 * user-space, after storing to rq->curr. -	 */ -	if (unlikely(atomic_dec_and_test(&mm->mm_count))) -		__mmdrop(mm); -} -EXPORT_SYMBOL_GPL(mmdrop); +EXPORT_SYMBOL_GPL(__mmdrop);  static void mmdrop_async_fn(struct work_struct *work)  { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e6a9c36470ee..82b8b18ee1eb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1726,25 +1726,14 @@ static int irq_domain_debug_show(struct seq_file *m, void *p)  	irq_domain_debug_show_one(m, d, 0);  	return 0;  } - -static int irq_domain_debug_open(struct inode *inode, struct file *file) -{ -	return single_open(file, irq_domain_debug_show, inode->i_private); -} - -static const struct file_operations dfs_domain_ops = { -	.open		= irq_domain_debug_open, -	.read		= seq_read, -	.llseek		= seq_lseek, -	.release	= single_release, -}; +DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);  static void debugfs_add_domain_dir(struct irq_domain *d)  {  	if (!d->name || !domain_dir || d->debugfs_file)  		return;  	d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, -					      &dfs_domain_ops); +					      &irq_domain_debug_fops);  }  static void debugfs_remove_domain_dir(struct irq_domain *d) @@ -1760,7 +1749,8 @@ void __init irq_domain_debugfs_init(struct dentry *root)  	if (!domain_dir)  		return; -	debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops); +	debugfs_create_file("default", 0444, domain_dir, NULL, +			    &irq_domain_debug_fops);  	mutex_lock(&irq_domain_mutex);  	list_for_each_entry(d, &irq_domain_list, link)  		debugfs_add_domain_dir(d); diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 5187dfe809ac..4c5770407031 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -16,6 +16,7 @@ struct cpumap {  	unsigned int		available;  	unsigned int		allocated;  	unsigned int		managed; +	bool			initialized;  	bool			online;  	unsigned long		alloc_map[IRQ_MATRIX_SIZE];  	unsigned long		managed_map[IRQ_MATRIX_SIZE]; @@ -81,9 +82,11 @@ void irq_matrix_online(struct irq_matrix *m)  	BUG_ON(cm->online); -	bitmap_zero(cm->alloc_map, m->matrix_bits); -	cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc); -	cm->allocated = 0; +	if (!cm->initialized) { +		cm->available = m->alloc_size; +		cm->available -= cm->managed + m->systembits_inalloc; +		cm->initialized = true; +	}  	m->global_available += cm->available;  	cm->online = true;  	m->online_maps++; @@ -370,14 +373,16 @@ void 
irq_matrix_free(struct irq_matrix *m, unsigned int cpu,  	if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))  		return; -	if (cm->online) { -		clear_bit(bit, cm->alloc_map); -		cm->allocated--; +	clear_bit(bit, cm->alloc_map); +	cm->allocated--; + +	if (cm->online)  		m->total_allocated--; -		if (!managed) { -			cm->available++; + +	if (!managed) { +		cm->available++; +		if (cm->online)  			m->global_available++; -		}  	}  	trace_irq_matrix_free(bit, cpu, m, cm);  } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index b4517095db6a..01ebdf1f9f40 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -16,6 +16,7 @@  #include <linux/jump_label_ratelimit.h>  #include <linux/bug.h>  #include <linux/cpu.h> +#include <asm/sections.h>  #ifdef HAVE_JUMP_LABEL @@ -366,12 +367,16 @@ static void __jump_label_update(struct static_key *key,  {  	for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {  		/* -		 * entry->code set to 0 invalidates module init text sections -		 * kernel_text_address() verifies we are not in core kernel -		 * init code, see jump_label_invalidate_module_init(). +		 * An entry->code of 0 indicates an entry which has been +		 * disabled because it was in an init text area.  		 */ -		if (entry->code && kernel_text_address(entry->code)) -			arch_jump_label_transform(entry, jump_label_type(entry)); +		if (entry->code) { +			if (kernel_text_address(entry->code)) +				arch_jump_label_transform(entry, jump_label_type(entry)); +			else +				WARN_ONCE(1, "can't patch jump_label at %pS", +					  (void *)(unsigned long)entry->code); +		}  	}  } @@ -417,6 +422,19 @@ void __init jump_label_init(void)  	cpus_read_unlock();  } +/* Disable any jump label entries in __init/__exit code */ +void __init jump_label_invalidate_initmem(void) +{ +	struct jump_entry *iter_start = __start___jump_table; +	struct jump_entry *iter_stop = __stop___jump_table; +	struct jump_entry *iter; + +	for (iter = iter_start; iter < iter_stop; iter++) { +		if (init_section_contains((void *)(unsigned long)iter->code, 1)) +			iter->code = 0; +	} +} +  #ifdef CONFIG_MODULES  static enum jump_label_type jump_label_init_type(struct jump_entry *entry) @@ -633,6 +651,7 @@ static void jump_label_del_module(struct module *mod)  	}  } +/* Disable any jump label entries in module init code */  static void jump_label_invalidate_module_init(struct module *mod)  {  	struct jump_entry *iter_start = mod->jump_entries; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index da2ccf142358..102160ff5c66 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -978,67 +978,90 @@ static int prepare_kprobe(struct kprobe *p)  }  /* Caller must lock kprobe_mutex */ -static void arm_kprobe_ftrace(struct kprobe *p) +static int arm_kprobe_ftrace(struct kprobe *p)  { -	int ret; +	int ret = 0;  	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,  				   (unsigned long)p->addr, 0, 0); -	WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); -	kprobe_ftrace_enabled++; -	if (kprobe_ftrace_enabled == 1) { +	if (ret) { +		pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); +		return ret; +	} + +	if (kprobe_ftrace_enabled == 0) {  		ret = register_ftrace_function(&kprobe_ftrace_ops); -		WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); +		if (ret) { +			pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); +			goto err_ftrace; +		}  	} + +	kprobe_ftrace_enabled++; +	return ret; + +err_ftrace: +	/* +	 * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a +	 * 
non-empty filter_hash for IPMODIFY ops, we're safe from an accidental +	 * empty filter_hash which would undesirably trace all functions. +	 */ +	ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); +	return ret;  }  /* Caller must lock kprobe_mutex */ -static void disarm_kprobe_ftrace(struct kprobe *p) +static int disarm_kprobe_ftrace(struct kprobe *p)  { -	int ret; +	int ret = 0; -	kprobe_ftrace_enabled--; -	if (kprobe_ftrace_enabled == 0) { +	if (kprobe_ftrace_enabled == 1) {  		ret = unregister_ftrace_function(&kprobe_ftrace_ops); -		WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); +		if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) +			return ret;  	} + +	kprobe_ftrace_enabled--; +  	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,  			   (unsigned long)p->addr, 1, 0);  	WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); +	return ret;  }  #else	/* !CONFIG_KPROBES_ON_FTRACE */  #define prepare_kprobe(p)	arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p)	do {} while (0) -#define disarm_kprobe_ftrace(p)	do {} while (0) +#define arm_kprobe_ftrace(p)	(-ENODEV) +#define disarm_kprobe_ftrace(p)	(-ENODEV)  #endif  /* Arm a kprobe with text_mutex */ -static void arm_kprobe(struct kprobe *kp) +static int arm_kprobe(struct kprobe *kp)  { -	if (unlikely(kprobe_ftrace(kp))) { -		arm_kprobe_ftrace(kp); -		return; -	} +	if (unlikely(kprobe_ftrace(kp))) +		return arm_kprobe_ftrace(kp); +  	cpus_read_lock();  	mutex_lock(&text_mutex);  	__arm_kprobe(kp);  	mutex_unlock(&text_mutex);  	cpus_read_unlock(); + +	return 0;  }  /* Disarm a kprobe with text_mutex */ -static void disarm_kprobe(struct kprobe *kp, bool reopt) +static int disarm_kprobe(struct kprobe *kp, bool reopt)  { -	if (unlikely(kprobe_ftrace(kp))) { -		disarm_kprobe_ftrace(kp); -		return; -	} +	if (unlikely(kprobe_ftrace(kp))) +		return disarm_kprobe_ftrace(kp);  	cpus_read_lock();  	mutex_lock(&text_mutex);  	__disarm_kprobe(kp, reopt);  	mutex_unlock(&text_mutex);  	cpus_read_unlock(); + +	return 0;  }  /* @@ -1362,9 +1385,15 @@ out:  	if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {  		ap->flags &= ~KPROBE_FLAG_DISABLED; -		if (!kprobes_all_disarmed) +		if (!kprobes_all_disarmed) {  			/* Arm the breakpoint again. */ -			arm_kprobe(ap); +			ret = arm_kprobe(ap); +			if (ret) { +				ap->flags |= KPROBE_FLAG_DISABLED; +				list_del_rcu(&p->list); +				synchronize_sched(); +			} +		}  	}  	return ret;  } @@ -1573,8 +1602,14 @@ int register_kprobe(struct kprobe *p)  	hlist_add_head_rcu(&p->hlist,  		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); -	if (!kprobes_all_disarmed && !kprobe_disabled(p)) -		arm_kprobe(p); +	if (!kprobes_all_disarmed && !kprobe_disabled(p)) { +		ret = arm_kprobe(p); +		if (ret) { +			hlist_del_rcu(&p->hlist); +			synchronize_sched(); +			goto out; +		} +	}  	/* Try to optimize kprobe */  	try_to_optimize_kprobe(p); @@ -1608,11 +1643,12 @@ static int aggr_kprobe_disabled(struct kprobe *ap)  static struct kprobe *__disable_kprobe(struct kprobe *p)  {  	struct kprobe *orig_p; +	int ret;  	/* Get an original kprobe for return */  	orig_p = __get_valid_kprobe(p);  	if (unlikely(orig_p == NULL)) -		return NULL; +		return ERR_PTR(-EINVAL);  	if (!kprobe_disabled(p)) {  		/* Disable probe if it is a child probe */ @@ -1626,8 +1662,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)  			 * should have already been disarmed, so  			 * skip unneed disarming process.  			 
*/ -			if (!kprobes_all_disarmed) -				disarm_kprobe(orig_p, true); +			if (!kprobes_all_disarmed) { +				ret = disarm_kprobe(orig_p, true); +				if (ret) { +					p->flags &= ~KPROBE_FLAG_DISABLED; +					return ERR_PTR(ret); +				} +			}  			orig_p->flags |= KPROBE_FLAG_DISABLED;  		}  	} @@ -1644,8 +1685,8 @@ static int __unregister_kprobe_top(struct kprobe *p)  	/* Disable kprobe. This will disarm it if needed. */  	ap = __disable_kprobe(p); -	if (ap == NULL) -		return -EINVAL; +	if (IS_ERR(ap)) +		return PTR_ERR(ap);  	if (ap == p)  		/* @@ -2078,12 +2119,14 @@ static void kill_kprobe(struct kprobe *p)  int disable_kprobe(struct kprobe *kp)  {  	int ret = 0; +	struct kprobe *p;  	mutex_lock(&kprobe_mutex);  	/* Disable this kprobe */ -	if (__disable_kprobe(kp) == NULL) -		ret = -EINVAL; +	p = __disable_kprobe(kp); +	if (IS_ERR(p)) +		ret = PTR_ERR(p);  	mutex_unlock(&kprobe_mutex);  	return ret; @@ -2116,7 +2159,9 @@ int enable_kprobe(struct kprobe *kp)  	if (!kprobes_all_disarmed && kprobe_disabled(p)) {  		p->flags &= ~KPROBE_FLAG_DISABLED; -		arm_kprobe(p); +		ret = arm_kprobe(p); +		if (ret) +			p->flags |= KPROBE_FLAG_DISABLED;  	}  out:  	mutex_unlock(&kprobe_mutex); @@ -2407,11 +2452,12 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = {  	.release        = seq_release,  }; -static void arm_all_kprobes(void) +static int arm_all_kprobes(void)  {  	struct hlist_head *head;  	struct kprobe *p; -	unsigned int i; +	unsigned int i, total = 0, errors = 0; +	int err, ret = 0;  	mutex_lock(&kprobe_mutex); @@ -2428,46 +2474,74 @@ static void arm_all_kprobes(void)  	/* Arming kprobes doesn't optimize kprobe itself */  	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {  		head = &kprobe_table[i]; -		hlist_for_each_entry_rcu(p, head, hlist) -			if (!kprobe_disabled(p)) -				arm_kprobe(p); +		/* Arm all kprobes on a best-effort basis */ +		hlist_for_each_entry_rcu(p, head, hlist) { +			if (!kprobe_disabled(p)) { +				err = arm_kprobe(p); +				if (err)  { +					errors++; +					ret = err; +				} +				total++; +			} +		}  	} -	printk(KERN_INFO "Kprobes globally enabled\n"); +	if (errors) +		pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n", +			errors, total); +	else +		pr_info("Kprobes globally enabled\n");  already_enabled:  	mutex_unlock(&kprobe_mutex); -	return; +	return ret;  } -static void disarm_all_kprobes(void) +static int disarm_all_kprobes(void)  {  	struct hlist_head *head;  	struct kprobe *p; -	unsigned int i; +	unsigned int i, total = 0, errors = 0; +	int err, ret = 0;  	mutex_lock(&kprobe_mutex);  	/* If kprobes are already disarmed, just return */  	if (kprobes_all_disarmed) {  		mutex_unlock(&kprobe_mutex); -		return; +		return 0;  	}  	kprobes_all_disarmed = true; -	printk(KERN_INFO "Kprobes globally disabled\n");  	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {  		head = &kprobe_table[i]; +		/* Disarm all kprobes on a best-effort basis */  		hlist_for_each_entry_rcu(p, head, hlist) { -			if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) -				disarm_kprobe(p, false); +			if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) { +				err = disarm_kprobe(p, false); +				if (err) { +					errors++; +					ret = err; +				} +				total++; +			}  		}  	} + +	if (errors) +		pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n", +			errors, total); +	else +		pr_info("Kprobes globally disabled\n"); +  	mutex_unlock(&kprobe_mutex);  	/* Wait for disarming all kprobes by optimizer */  	wait_for_kprobe_optimizer(); + +	return ret; 
 }  /* @@ -2494,6 +2568,7 @@ static ssize_t write_enabled_file_bool(struct file *file,  {  	char buf[32];  	size_t buf_size; +	int ret = 0;  	buf_size = min(count, (sizeof(buf)-1));  	if (copy_from_user(buf, user_buf, buf_size)) @@ -2504,17 +2579,20 @@ static ssize_t write_enabled_file_bool(struct file *file,  	case 'y':  	case 'Y':  	case '1': -		arm_all_kprobes(); +		ret = arm_all_kprobes();  		break;  	case 'n':  	case 'N':  	case '0': -		disarm_all_kprobes(); +		ret = disarm_all_kprobes();  		break;  	default:  		return -EINVAL;  	} +	if (ret) +		return ret; +  	return count;  } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 858a07590e39..2048359f33d2 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1082,15 +1082,16 @@ static noinline int __sched  __mutex_lock_interruptible_slowpath(struct mutex *lock);  /** - * mutex_lock_interruptible - acquire the mutex, interruptible - * @lock: the mutex to be acquired + * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals. + * @lock: The mutex to be acquired.   * - * Lock the mutex like mutex_lock(), and return 0 if the mutex has - * been acquired or sleep until the mutex becomes available. If a - * signal arrives while waiting for the lock then this function - * returns -EINTR. + * Lock the mutex like mutex_lock().  If a signal is delivered while the + * process is sleeping, this function will return without acquiring the + * mutex.   * - * This function is similar to (but not equivalent to) down_interruptible(). + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * signal arrived.   */  int __sched mutex_lock_interruptible(struct mutex *lock)  { @@ -1104,6 +1105,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock)  EXPORT_SYMBOL(mutex_lock_interruptible); +/** + * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals. + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock().  If a signal which will be fatal to + * the current process is delivered while the process is sleeping, this + * function will return without acquiring the mutex. + * + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * fatal signal arrived. + */  int __sched mutex_lock_killable(struct mutex *lock)  {  	might_sleep(); @@ -1115,6 +1128,16 @@ int __sched mutex_lock_killable(struct mutex *lock)  }  EXPORT_SYMBOL(mutex_lock_killable); +/** + * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock().  While the task is waiting for this + * mutex, it will be accounted as being in the IO wait state by the + * scheduler. + * + * Context: Process context. + */  void __sched mutex_lock_io(struct mutex *lock)  {  	int token; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38ece035039e..d880296245c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -379,6 +379,14 @@ queue:  	tail = encode_tail(smp_processor_id(), idx);  	node += idx; + +	/* +	 * Ensure that we increment the head node->count before initialising +	 * the actual node. If the compiler is kind enough to reorder these +	 * stores, then an IRQ could overwrite our assignments. 
+	 */ +	barrier(); +  	node->locked = 0;  	node->next = NULL;  	pv_init_node(node); @@ -408,14 +416,15 @@ queue:  	 */  	if (old & _Q_TAIL_MASK) {  		prev = decode_tail(old); +  		/* -		 * The above xchg_tail() is also a load of @lock which -		 * generates, through decode_tail(), a pointer.  The address -		 * dependency matches the RELEASE of xchg_tail() such that -		 * the subsequent access to @prev happens after. +		 * We must ensure that the stores to @node are observed before +		 * the write to prev->next. The address dependency from +		 * xchg_tail is not sufficient to ensure this because the read +		 * component of xchg_tail is unordered with respect to the +		 * initialisation of @node.  		 */ - -		WRITE_ONCE(prev->next, node); +		smp_store_release(&prev->next, node);  		pv_wait_node(node, prev);  		arch_mcs_spin_lock_contended(&node->locked); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 65cc0cb984e6..940633c63254 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1616,11 +1616,12 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,  void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)  {  	DEFINE_WAKE_Q(wake_q); +	unsigned long flags;  	bool postunlock; -	raw_spin_lock_irq(&lock->wait_lock); +	raw_spin_lock_irqsave(&lock->wait_lock, flags);  	postunlock = __rt_mutex_futex_unlock(lock, &wake_q); -	raw_spin_unlock_irq(&lock->wait_lock); +	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);  	if (postunlock)  		rt_mutex_postunlock(&wake_q); diff --git a/kernel/memremap.c b/kernel/memremap.c index 4849be5f9b3c..895e6b76b25e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -275,8 +275,15 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap)  	return (res->start + resource_size(res)) >> PAGE_SHIFT;  } +static unsigned long pfn_next(unsigned long pfn) +{ +	if (pfn % 1024 == 0) +		cond_resched(); +	return pfn + 1; +} +  #define for_each_device_pfn(pfn, map) \ -	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) +	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))  static void devm_memremap_pages_release(void *data)  { @@ -337,10 +344,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)  	resource_size_t align_start, align_size, align_end;  	struct vmem_altmap *altmap = pgmap->altmap_valid ?  			&pgmap->altmap : NULL; +	struct resource *res = &pgmap->res;  	unsigned long pfn, pgoff, order;  	pgprot_t pgprot = PAGE_KERNEL; -	int error, nid, is_ram, i = 0; -	struct resource *res = &pgmap->res; +	int error, nid, is_ram;  	align_start = res->start & ~(SECTION_SIZE - 1);  	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -409,8 +416,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)  		list_del(&page->lru);  		page->pgmap = pgmap;  		percpu_ref_get(pgmap->ref); -		if (!(++i % 1024)) -			cond_resched();  	}  	devm_add_action(dev, devm_memremap_pages_release, pgmap); @@ -422,7 +427,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)   err_pfn_remap:   err_radix:  	pgmap_radix_release(res, pgoff); -	devres_free(pgmap);  	return ERR_PTR(error);  }  EXPORT_SYMBOL(devm_memremap_pages); diff --git a/kernel/module.c b/kernel/module.c index ad2d420024f6..e42764acedb4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4228,7 +4228,7 @@ static int modules_open(struct inode *inode, struct file *file)  		m->private = kallsyms_show_value() ? 
NULL : (void *)8ul;  	} -	return 0; +	return err;  }  static const struct file_operations proc_modules_operations = { diff --git a/kernel/panic.c b/kernel/panic.c index 2cfef408fec9..4b794f1d8561 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -640,7 +640,7 @@ device_initcall(register_warn_debugfs);   */  __visible void __stack_chk_fail(void)  { -	panic("stack-protector: Kernel stack is corrupted in: %p\n", +	panic("stack-protector: Kernel stack is corrupted in: %pB\n",  		__builtin_return_address(0));  }  EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fc1123583fa6..f274fbef821d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2397,7 +2397,7 @@ skip:  		if (console_lock_spinning_disable_and_check()) {  			printk_safe_exit_irqrestore(flags); -			return; +			goto out;  		}  		printk_safe_exit_irqrestore(flags); @@ -2430,6 +2430,7 @@ skip:  	if (retry && console_trylock())  		goto again; +out:  	if (wake_klogd)  		wake_up_klogd();  } diff --git a/kernel/relay.c b/kernel/relay.c index c3029402f15c..c955b10c973c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -163,7 +163,7 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan)  {  	struct rchan_buf *buf; -	if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) +	if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *))  		return NULL;  	buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bf724c1952ea..c94895bc5a2c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2601,19 +2601,31 @@ static inline void finish_task(struct task_struct *prev)  #endif  } -static inline void finish_lock_switch(struct rq *rq) +static inline void +prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)  { +	/* +	 * Since the runqueue lock will be released by the next +	 * task (which is an invalid locking op but in the case +	 * of the scheduler it's an obvious special-case), so we +	 * do an early lockdep release here: +	 */ +	rq_unpin_lock(rq, rf); +	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);  #ifdef CONFIG_DEBUG_SPINLOCK  	/* this is a valid case when another task releases the spinlock */ -	rq->lock.owner = current; +	rq->lock.owner = next;  #endif +} + +static inline void finish_lock_switch(struct rq *rq) +{  	/*  	 * If we are tracking spinlock dependencies then we have to  	 * fix up the runqueue lock - which gets 'carried over' from  	 * prev into current:  	 */  	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -  	raw_spin_unlock_irq(&rq->lock);  } @@ -2844,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev,  	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); -	/* -	 * Since the runqueue lock will be released by the next -	 * task (which is an invalid locking op but in the case -	 * of the scheduler it's an obvious special-case), so we -	 * do an early lockdep release here: -	 */ -	rq_unpin_lock(rq, rf); -	spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +	prepare_lock_switch(rq, next, rf);  	/* Here we just switch the register state and the stack. */  	switch_to(prev, next, prev); @@ -6678,13 +6683,18 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)  		parent_quota = parent_b->hierarchical_quota;  		/* -		 * Ensure max(child_quota) <= parent_quota, inherit when no +		 * Ensure max(child_quota) <= parent_quota.  On cgroup2, +		 * always take the min.  
On cgroup1, only inherit when no  		 * limit is set:  		 */ -		if (quota == RUNTIME_INF) -			quota = parent_quota; -		else if (parent_quota != RUNTIME_INF && quota > parent_quota) -			return -EINVAL; +		if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { +			quota = min(quota, parent_quota); +		} else { +			if (quota == RUNTIME_INF) +				quota = parent_quota; +			else if (parent_quota != RUNTIME_INF && quota > parent_quota) +				return -EINVAL; +		}  	}  	cfs_b->hierarchical_quota = quota; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index dd062a1c8cf0..7936f548e071 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -19,8 +19,6 @@  #include "sched.h" -#define SUGOV_KTHREAD_PRIORITY	50 -  struct sugov_tunables {  	struct gov_attr_set attr_set;  	unsigned int rate_limit_us; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9bb0e0c412ec..9df09782025c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1153,6 +1153,7 @@ static void update_curr_dl(struct rq *rq)  	struct sched_dl_entity *dl_se = &curr->dl;  	u64 delta_exec, scaled_delta_exec;  	int cpu = cpu_of(rq); +	u64 now;  	if (!dl_task(curr) || !on_dl_rq(dl_se))  		return; @@ -1165,7 +1166,8 @@ static void update_curr_dl(struct rq *rq)  	 * natural solution, but the full ramifications of this  	 * approach need further study.  	 */ -	delta_exec = rq_clock_task(rq) - curr->se.exec_start; +	now = rq_clock_task(rq); +	delta_exec = now - curr->se.exec_start;  	if (unlikely((s64)delta_exec <= 0)) {  		if (unlikely(dl_se->dl_yielded))  			goto throttle; @@ -1178,7 +1180,7 @@ static void update_curr_dl(struct rq *rq)  	curr->se.sum_exec_runtime += delta_exec;  	account_group_exec_runtime(curr, delta_exec); -	curr->se.exec_start = rq_clock_task(rq); +	curr->se.exec_start = now;  	cgroup_account_cputime(curr, delta_exec);  	sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1ca0130ed4f9..72c401b3b15c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -32,7 +32,7 @@ static DEFINE_SPINLOCK(sched_debug_lock);  	if (m)					\  		seq_printf(m, x);		\  	else					\ -		printk(x);			\ +		pr_cont(x);			\   } while (0)  /* @@ -501,12 +501,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)  {  	struct task_struct *g, *p; -	SEQ_printf(m, -	"\nrunnable tasks:\n" -	" S           task   PID         tree-key  switches  prio" -	"     wait-time             sum-exec        sum-sleep\n" -	"-------------------------------------------------------" -	"----------------------------------------------------\n"); +	SEQ_printf(m, "\n"); +	SEQ_printf(m, "runnable tasks:\n"); +	SEQ_printf(m, " S           task   PID         tree-key  switches  prio" +		   "     wait-time             sum-exec        sum-sleep\n"); +	SEQ_printf(m, "-------------------------------------------------------" +		   "----------------------------------------------------\n");  	rcu_read_lock();  	for_each_process_thread(g, p) { @@ -527,9 +527,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  	unsigned long flags;  #ifdef CONFIG_FAIR_GROUP_SCHED -	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); +	SEQ_printf(m, "\n"); +	SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));  #else -	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +	SEQ_printf(m, "\n"); +	SEQ_printf(m, "cfs_rq[%d]:\n", cpu);  #endif  	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",  			
SPLIT_NS(cfs_rq->exec_clock)); @@ -595,9 +597,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)  {  #ifdef CONFIG_RT_GROUP_SCHED -	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +	SEQ_printf(m, "\n"); +	SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));  #else -	SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +	SEQ_printf(m, "\n"); +	SEQ_printf(m, "rt_rq[%d]:\n", cpu);  #endif  #define P(x) \ @@ -624,7 +628,8 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)  {  	struct dl_bw *dl_bw; -	SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); +	SEQ_printf(m, "\n"); +	SEQ_printf(m, "dl_rq[%d]:\n", cpu);  #define PU(x) \  	SEQ_printf(m, "  .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 663b2355a3aa..aad49451584e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq)  {  	struct task_struct *curr = rq->curr;  	struct sched_rt_entity *rt_se = &curr->rt; -	u64 now = rq_clock_task(rq);  	u64 delta_exec; +	u64 now;  	if (curr->sched_class != &rt_sched_class)  		return; +	now = rq_clock_task(rq);  	delta_exec = now - curr->se.exec_start;  	if (unlikely((s64)delta_exec <= 0))  		return; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 940fa408a288..dc77548167ef 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1076,14 +1076,16 @@ long seccomp_get_metadata(struct task_struct *task,  	size = min_t(unsigned long, size, sizeof(kmd)); -	if (copy_from_user(&kmd, data, size)) +	if (size < sizeof(kmd.filter_off)) +		return -EINVAL; + +	if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off)))  		return -EFAULT;  	filter = get_nth_filter(task, kmd.filter_off);  	if (IS_ERR(filter))  		return PTR_ERR(filter); -	memset(&kmd, 0, sizeof(kmd));  	if (filter->log)  		kmd.flags |= SECCOMP_FILTER_FLAG_LOG; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 75043046914e..10b7186d0638 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -50,6 +50,7 @@  #include <linux/export.h>  #include <linux/hashtable.h>  #include <linux/compat.h> +#include <linux/nospec.h>  #include "timekeeping.h"  #include "posix-timers.h" @@ -1346,11 +1347,15 @@ static const struct k_clock * const posix_clocks[] = {  static const struct k_clock *clockid_to_kclock(const clockid_t id)  { -	if (id < 0) +	clockid_t idx = id; + +	if (id < 0) {  		return (id & CLOCKFD_MASK) == CLOCKFD ?  			&clock_posix_dynamic : &clock_posix_cpu; +	} -	if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) +	if (id >= ARRAY_SIZE(posix_clocks))  		return NULL; -	return posix_clocks[id]; + +	return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];  } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 48150ab42de9..4a4fd567fb26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1894,6 +1894,12 @@ int timers_dead_cpu(unsigned int cpu)  		raw_spin_lock_irq(&new_base->lock);  		raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); +		/* +		 * The current CPUs base clock might be stale. Update it +		 * before moving the timers over. 
+		 */ +		forward_timer_base(new_base); +  		BUG_ON(old_base->running_timer);  		for (i = 0; i < WHEEL_SIZE; i++) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fc2838ac8b78..01e6b3a38871 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -661,7 +661,41 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {  	.arg3_type	= ARG_ANYTHING,  }; -BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, +static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +{ +	switch (func_id) { +	case BPF_FUNC_perf_event_output: +		return &bpf_perf_event_output_proto_tp; +	case BPF_FUNC_get_stackid: +		return &bpf_get_stackid_proto_tp; +	default: +		return tracing_func_proto(func_id); +	} +} + +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, +				    struct bpf_insn_access_aux *info) +{ +	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) +		return false; +	if (type != BPF_READ) +		return false; +	if (off % size != 0) +		return false; + +	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); +	return true; +} + +const struct bpf_verifier_ops tracepoint_verifier_ops = { +	.get_func_proto  = tp_prog_func_proto, +	.is_valid_access = tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + +BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,  	   struct bpf_perf_event_value *, buf, u32, size)  {  	int err = -EINVAL; @@ -678,8 +712,8 @@ clear:  	return err;  } -static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { -         .func           = bpf_perf_prog_read_value_tp, +static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { +         .func           = bpf_perf_prog_read_value,           .gpl_only       = true,           .ret_type       = RET_INTEGER,           .arg1_type      = ARG_PTR_TO_CTX, @@ -687,7 +721,7 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {           .arg3_type      = ARG_CONST_SIZE,  }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id)  {  	switch (func_id) {  	case BPF_FUNC_perf_event_output: @@ -695,34 +729,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)  	case BPF_FUNC_get_stackid:  		return &bpf_get_stackid_proto_tp;  	case BPF_FUNC_perf_prog_read_value: -		return &bpf_perf_prog_read_value_proto_tp; +		return &bpf_perf_prog_read_value_proto;  	default:  		return tracing_func_proto(func_id);  	}  } -static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, -				    struct bpf_insn_access_aux *info) -{ -	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) -		return false; -	if (type != BPF_READ) -		return false; -	if (off % size != 0) -		return false; - -	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); -	return true; -} - -const struct bpf_verifier_ops tracepoint_verifier_ops = { -	.get_func_proto  = tp_prog_func_proto, -	.is_valid_access = tp_prog_is_valid_access, -}; - -const struct bpf_prog_ops tracepoint_prog_ops = { -}; -  static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,  				    struct bpf_insn_access_aux *info)  { @@ -779,7 +791,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,  }  const struct bpf_verifier_ops perf_event_verifier_ops = { -	.get_func_proto		= tp_prog_func_proto, +	.get_func_proto		= 
pe_prog_func_proto,  	.is_valid_access	= pe_prog_is_valid_access,  	.convert_ctx_access	= pe_prog_convert_ctx_access,  }; @@ -872,6 +884,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)  		return -EINVAL;  	if (copy_from_user(&query, uquery, sizeof(query)))  		return -EFAULT; +	if (query.ids_len > BPF_TRACE_MAX_PROGS) +		return -E2BIG;  	mutex_lock(&bpf_event_mutex);  	ret = bpf_prog_array_copy_info(event->tp_event->prog_array, diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1fad24acd444..ae4147eaebd4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -659,7 +659,7 @@ static int create_trace_kprobe(int argc, char **argv)  	char *symbol = NULL, *event = NULL, *group = NULL;  	int maxactive = 0;  	char *arg; -	unsigned long offset = 0; +	long offset = 0;  	void *addr = NULL;  	char buf[MAX_EVENT_NAME_LEN]; @@ -747,7 +747,7 @@ static int create_trace_kprobe(int argc, char **argv)  		symbol = argv[1];  		/* TODO: support .init module functions */  		ret = traceprobe_split_symbol_offset(symbol, &offset); -		if (ret) { +		if (ret || offset < 0 || offset > UINT_MAX) {  			pr_info("Failed to parse either an address or a symbol.\n");  			return ret;  		} diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d59357308677..daf54bda4dc8 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -320,7 +320,7 @@ static fetch_func_t get_fetch_size_function(const struct fetch_type *type,  }  /* Split symbol and offset. */ -int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) +int traceprobe_split_symbol_offset(char *symbol, long *offset)  {  	char *tmp;  	int ret; @@ -328,13 +328,11 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)  	if (!offset)  		return -EINVAL; -	tmp = strchr(symbol, '+'); +	tmp = strpbrk(symbol, "+-");  	if (tmp) { -		/* skip sign because kstrtoul doesn't accept '+' */ -		ret = kstrtoul(tmp + 1, 0, offset); +		ret = kstrtol(tmp, 0, offset);  		if (ret)  			return ret; -  		*tmp = '\0';  	} else  		*offset = 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index e101c5bb9eda..6a4d3fa94042 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -365,7 +365,7 @@ extern int traceprobe_conflict_field_name(const char *name,  extern void traceprobe_update_arg(struct probe_arg *arg);  extern void traceprobe_free_probe_arg(struct probe_arg *arg); -extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); +extern int traceprobe_split_symbol_offset(char *symbol, long *offset);  /* Sum up total data length for dynamic arraies (strings) */  static nokprobe_inline int diff --git a/kernel/user.c b/kernel/user.c index 9a20acce460d..36288d840675 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,6 +101,7 @@ struct user_struct root_user = {  	.sigpending	= ATOMIC_INIT(0),  	.locked_shm     = 0,  	.uid		= GLOBAL_ROOT_UID, +	.ratelimit	= RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),  };  /* @@ -191,6 +192,8 @@ struct user_struct *alloc_uid(kuid_t uid)  		new->uid = uid;  		atomic_set(&new->__count, 1); +		ratelimit_state_init(&new->ratelimit, HZ, 100); +		ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);  		/*  		 * Before adding this, check whether we raced diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 017044c26233..6ec6ba65127b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3018,14 +3018,6 @@ static bool 
__cancel_work(struct work_struct *work, bool is_dwork)  	return ret;  } -/* - * See cancel_delayed_work() - */ -bool cancel_work(struct work_struct *work) -{ -	return __cancel_work(work, false); -} -  /**   * cancel_delayed_work - cancel a delayed work   * @dwork: delayed_work to cancel @@ -4180,6 +4172,22 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)  EXPORT_SYMBOL_GPL(workqueue_set_max_active);  /** + * current_work - retrieve %current task's work struct + * + * Determine if %current task is a workqueue worker and what it's working on. + * Useful to find out the context that the %current task is running in. + * + * Return: work struct if %current task is a workqueue worker, %NULL otherwise. + */ +struct work_struct *current_work(void) +{ +	struct worker *worker = current_wq_worker(); + +	return worker ? worker->current_work : NULL; +} +EXPORT_SYMBOL(current_work); + +/**   * current_is_workqueue_rescuer - is %current workqueue rescuer?   *   * Determine whether %current is a workqueue rescuer.  Can be used from @@ -5321,7 +5329,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)  	ret = device_register(&wq_dev->dev);  	if (ret) { -		kfree(wq_dev); +		put_device(&wq_dev->dev);  		wq->wq_dev = NULL;  		return ret;  	}  | 
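As an aside, here is a minimal usage sketch for the current_work() helper added in the kernel/workqueue.c hunk above; the foo_dev structure and foo_request_reset() function are hypothetical, invented only to illustrate the pattern of detecting whether the caller is already running from its own work item (current_work() and schedule_work() are the real APIs, the rest is assumption):

/*
 * Illustrative sketch only, not part of the diff above.
 */
#include <linux/workqueue.h>

struct foo_dev {
	struct work_struct reset_work;
};

static void foo_request_reset(struct foo_dev *foo)
{
	/*
	 * If we are already executing foo->reset_work, synchronously
	 * flushing or cancelling it from here would deadlock, and
	 * re-queueing it is pointless; just return.
	 */
	if (current_work() == &foo->reset_work)
		return;

	schedule_work(&foo->reset_work);
}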