| author | Mark Brown <broonie@kernel.org> | 2017-12-04 17:59:52 +0000 | 
|---|---|---|
| committer | Mark Brown <broonie@kernel.org> | 2017-12-04 17:59:52 +0000 | 
| commit | d218439feccafaa3e852bbaecc0d9f6110096b65 (patch) | |
| tree | c4ebfeaebd965735cea5d90332d0f8013b79e628 /kernel/bpf | |
| parent | a76d7f5454c688b52dc849e832cc4c6dd0975723 (diff) | |
| parent | fdaa451107ce543d345a339b4d5e20e8e4bac396 (diff) | |
Merge branch 'fix/amd' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into asoc-amd
Diffstat (limited to 'kernel/bpf')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | kernel/bpf/Makefile | 4 |
| -rw-r--r-- | kernel/bpf/arraymap.c | 10 |
| -rw-r--r-- | kernel/bpf/cgroup.c | 570 |
| -rw-r--r-- | kernel/bpf/core.c | 186 |
| -rw-r--r-- | kernel/bpf/cpumap.c | 706 |
| -rw-r--r-- | kernel/bpf/devmap.c | 15 |
| -rw-r--r-- | kernel/bpf/disasm.c | 214 |
| -rw-r--r-- | kernel/bpf/disasm.h | 32 |
| -rw-r--r-- | kernel/bpf/hashtab.c | 9 |
| -rw-r--r-- | kernel/bpf/inode.c | 16 |
| -rw-r--r-- | kernel/bpf/lpm_trie.c | 98 |
| -rw-r--r-- | kernel/bpf/offload.c | 191 |
| -rw-r--r-- | kernel/bpf/percpu_freelist.c | 8 |
| -rw-r--r-- | kernel/bpf/sockmap.c | 62 |
| -rw-r--r-- | kernel/bpf/stackmap.c | 5 |
| -rw-r--r-- | kernel/bpf/syscall.c | 315 |
| -rw-r--r-- | kernel/bpf/verifier.c | 1523 |
17 files changed, 3079 insertions, 885 deletions
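
The bulk of the functional change below is in kernel/bpf/cgroup.c: the single per-cgroup program pointer becomes a list of programs, attach gains the BPF_F_ALLOW_MULTI flag, and a query operation reports the attached program IDs. As a rough, illustrative sketch only (not taken from this patch), a userspace loader could drive the new interface through the raw bpf(2) syscall as shown below; `cgroup_fd` and `prog_fd` are assumed to be descriptors the caller already holds.

```c
/* Illustrative userspace sketch, not part of the patch: attach programs to
 * one cgroup with BPF_F_ALLOW_MULTI, then read back the attached prog IDs.
 * cgroup_fd and prog_fd are assumed to be already-opened descriptors.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

static int attach_multi(int cgroup_fd, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd     = cgroup_fd;
	attr.attach_bpf_fd = prog_fd;
	attr.attach_type   = BPF_CGROUP_INET_EGRESS;
	attr.attach_flags  = BPF_F_ALLOW_MULTI;  /* keep existing programs attached */

	return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
}

static int query_attached(int cgroup_fd, __u32 *prog_ids, __u32 *prog_cnt)
{
	union bpf_attr attr;
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.query.target_fd   = cgroup_fd;
	attr.query.attach_type = BPF_CGROUP_INET_EGRESS;
	attr.query.prog_ids    = (__u64)(unsigned long)prog_ids;
	attr.query.prog_cnt    = *prog_cnt;      /* capacity in; count out */

	err = sys_bpf(BPF_PROG_QUERY, &attr, sizeof(attr));
	if (!err)
		*prog_cnt = attr.query.prog_cnt;
	return err;
}
```

With BPF_F_ALLOW_MULTI the previously attached programs remain in place, and the per-packet execution order is the effective array that compute_effective_progs() below builds by walking the cgroup hierarchy.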
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 897daa005b23..e691da0b3bab 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,9 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0  obj-y := core.o  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += disasm.o  ifeq ($(CONFIG_NET),y)  obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += offload.o  ifeq ($(CONFIG_STREAM_PARSER),y)  obj-$(CONFIG_BPF_SYSCALL) += sockmap.o  endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..7c25426d3cf5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -19,6 +19,9 @@  #include "map_in_map.h" +#define ARRAY_CREATE_FLAG_MASK \ +	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +  static void bpf_array_free_percpu(struct bpf_array *array)  {  	int i; @@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)  	/* check sanity of attributes */  	if (attr->max_entries == 0 || attr->key_size != 4 || -	    attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE || +	    attr->value_size == 0 || +	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||  	    (percpu && numa_node != NUMA_NO_NODE))  		return ERR_PTR(-EINVAL); @@ -98,7 +102,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)  	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();  	if (array_size >= U32_MAX - PAGE_SIZE || -	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { +	    bpf_array_alloc_percpu(array)) {  		bpf_map_area_free(array);  		return ERR_PTR(-ENOMEM);  	} @@ -492,7 +496,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,  	ee = ERR_PTR(-EOPNOTSUPP);  	event = perf_file->private_data; -	if (perf_event_read_local(event, &value) == -EOPNOTSUPP) +	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)  		goto err_out;  	ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 546113430049..b789ab78d28f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,129 +27,405 @@ void cgroup_bpf_put(struct cgroup *cgrp)  {  	unsigned int type; -	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { -		struct bpf_prog *prog = cgrp->bpf.prog[type]; - -		if (prog) { -			bpf_prog_put(prog); +	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { +		struct list_head *progs = &cgrp->bpf.progs[type]; +		struct bpf_prog_list *pl, *tmp; + +		list_for_each_entry_safe(pl, tmp, progs, node) { +			list_del(&pl->node); +			bpf_prog_put(pl->prog); +			kfree(pl);  			static_branch_dec(&cgroup_bpf_enabled_key);  		} +		bpf_prog_array_free(cgrp->bpf.effective[type]); +	} +} + +/* count number of elements in the list. + * it's slow but the list cannot be long + */ +static u32 prog_list_length(struct list_head *head) +{ +	struct bpf_prog_list *pl; +	u32 cnt = 0; + +	list_for_each_entry(pl, head, node) { +		if (!pl->prog) +			continue; +		cnt++;  	} +	return cnt; +} + +/* if parent has non-overridable prog attached, + * disallow attaching new programs to the descendent cgroup. 
+ * if parent has overridable or multi-prog, allow attaching + */ +static bool hierarchy_allows_attach(struct cgroup *cgrp, +				    enum bpf_attach_type type, +				    u32 new_flags) +{ +	struct cgroup *p; + +	p = cgroup_parent(cgrp); +	if (!p) +		return true; +	do { +		u32 flags = p->bpf.flags[type]; +		u32 cnt; + +		if (flags & BPF_F_ALLOW_MULTI) +			return true; +		cnt = prog_list_length(&p->bpf.progs[type]); +		WARN_ON_ONCE(cnt > 1); +		if (cnt == 1) +			return !!(flags & BPF_F_ALLOW_OVERRIDE); +		p = cgroup_parent(p); +	} while (p); +	return true; +} + +/* compute a chain of effective programs for a given cgroup: + * start from the list of programs in this cgroup and add + * all parent programs. + * Note that parent's F_ALLOW_OVERRIDE-type program is yielding + * to programs in this cgroup + */ +static int compute_effective_progs(struct cgroup *cgrp, +				   enum bpf_attach_type type, +				   struct bpf_prog_array __rcu **array) +{ +	struct bpf_prog_array __rcu *progs; +	struct bpf_prog_list *pl; +	struct cgroup *p = cgrp; +	int cnt = 0; + +	/* count number of effective programs by walking parents */ +	do { +		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) +			cnt += prog_list_length(&p->bpf.progs[type]); +		p = cgroup_parent(p); +	} while (p); + +	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); +	if (!progs) +		return -ENOMEM; + +	/* populate the array with effective progs */ +	cnt = 0; +	p = cgrp; +	do { +		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) +			list_for_each_entry(pl, +					    &p->bpf.progs[type], node) { +				if (!pl->prog) +					continue; +				rcu_dereference_protected(progs, 1)-> +					progs[cnt++] = pl->prog; +			} +		p = cgroup_parent(p); +	} while (p); + +	*array = progs; +	return 0; +} + +static void activate_effective_progs(struct cgroup *cgrp, +				     enum bpf_attach_type type, +				     struct bpf_prog_array __rcu *array) +{ +	struct bpf_prog_array __rcu *old_array; + +	old_array = xchg(&cgrp->bpf.effective[type], array); +	/* free prog array after grace period, since __cgroup_bpf_run_*() +	 * might be still walking the array +	 */ +	bpf_prog_array_free(old_array);  }  /**   * cgroup_bpf_inherit() - inherit effective programs from parent   * @cgrp: the cgroup to modify - * @parent: the parent to inherit from   */ -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +int cgroup_bpf_inherit(struct cgroup *cgrp)  { -	unsigned int type; +/* has to use marco instead of const int, since compiler thinks + * that array below is variable length + */ +#define	NR ARRAY_SIZE(cgrp->bpf.effective) +	struct bpf_prog_array __rcu *arrays[NR] = {}; +	int i; -	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { -		struct bpf_prog *e; +	for (i = 0; i < NR; i++) +		INIT_LIST_HEAD(&cgrp->bpf.progs[i]); -		e = rcu_dereference_protected(parent->bpf.effective[type], -					      lockdep_is_held(&cgroup_mutex)); -		rcu_assign_pointer(cgrp->bpf.effective[type], e); -		cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; -	} +	for (i = 0; i < NR; i++) +		if (compute_effective_progs(cgrp, i, &arrays[i])) +			goto cleanup; + +	for (i = 0; i < NR; i++) +		activate_effective_progs(cgrp, i, arrays[i]); + +	return 0; +cleanup: +	for (i = 0; i < NR; i++) +		bpf_prog_array_free(arrays[i]); +	return -ENOMEM;  } +#define BPF_CGROUP_MAX_PROGS 64 +  /** - * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * __cgroup_bpf_attach() - Attach the program to a cgroup, and   *                         propagate the 
change to descendants   * @cgrp: The cgroup which descendants to traverse - * @parent: The parent of @cgrp, or %NULL if @cgrp is the root - * @prog: A new program to pin - * @type: Type of pinning operation (ingress/egress) - * - * Each cgroup has a set of two pointers for bpf programs; one for eBPF - * programs it owns, and which is effective for execution. - * - * If @prog is not %NULL, this function attaches a new program to the cgroup - * and releases the one that is currently attached, if any. @prog is then made - * the effective program of type @type in that cgroup. - * - * If @prog is %NULL, the currently attached program of type @type is released, - * and the effective program of the parent cgroup (if any) is inherited to - * @cgrp. - * - * Then, the descendants of @cgrp are walked and the effective program for - * each of them is set to the effective program of @cgrp unless the - * descendant has its own program attached, in which case the subbranch is - * skipped. This ensures that delegated subcgroups with own programs are left - * untouched. + * @prog: A program to attach + * @type: Type of attach operation   *   * Must be called with cgroup_mutex held.   */ -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, -			struct bpf_prog *prog, enum bpf_attach_type type, -			bool new_overridable) +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, +			enum bpf_attach_type type, u32 flags)  { -	struct bpf_prog *old_prog, *effective = NULL; -	struct cgroup_subsys_state *pos; -	bool overridable = true; - -	if (parent) { -		overridable = !parent->bpf.disallow_override[type]; -		effective = rcu_dereference_protected(parent->bpf.effective[type], -						      lockdep_is_held(&cgroup_mutex)); -	} - -	if (prog && effective && !overridable) -		/* if parent has non-overridable prog attached, disallow -		 * attaching new programs to descendent cgroup -		 */ +	struct list_head *progs = &cgrp->bpf.progs[type]; +	struct bpf_prog *old_prog = NULL; +	struct cgroup_subsys_state *css; +	struct bpf_prog_list *pl; +	bool pl_was_allocated; +	int err; + +	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) +		/* invalid combination */ +		return -EINVAL; + +	if (!hierarchy_allows_attach(cgrp, type, flags))  		return -EPERM; -	if (prog && effective && overridable != new_overridable) -		/* if parent has overridable prog attached, only -		 * allow overridable programs in descendent cgroup +	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) +		/* Disallow attaching non-overridable on top +		 * of existing overridable in this cgroup. 
+		 * Disallow attaching multi-prog if overridable or none  		 */  		return -EPERM; -	old_prog = cgrp->bpf.prog[type]; - -	if (prog) { -		overridable = new_overridable; -		effective = prog; -		if (old_prog && -		    cgrp->bpf.disallow_override[type] == new_overridable) -			/* disallow attaching non-overridable on top -			 * of existing overridable in this cgroup -			 * and vice versa -			 */ -			return -EPERM; +	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) +		return -E2BIG; + +	if (flags & BPF_F_ALLOW_MULTI) { +		list_for_each_entry(pl, progs, node) +			if (pl->prog == prog) +				/* disallow attaching the same prog twice */ +				return -EINVAL; + +		pl = kmalloc(sizeof(*pl), GFP_KERNEL); +		if (!pl) +			return -ENOMEM; +		pl_was_allocated = true; +		pl->prog = prog; +		list_add_tail(&pl->node, progs); +	} else { +		if (list_empty(progs)) { +			pl = kmalloc(sizeof(*pl), GFP_KERNEL); +			if (!pl) +				return -ENOMEM; +			pl_was_allocated = true; +			list_add_tail(&pl->node, progs); +		} else { +			pl = list_first_entry(progs, typeof(*pl), node); +			old_prog = pl->prog; +			pl_was_allocated = false; +		} +		pl->prog = prog;  	} -	if (!prog && !old_prog) -		/* report error when trying to detach and nothing is attached */ -		return -ENOENT; +	cgrp->bpf.flags[type] = flags; -	cgrp->bpf.prog[type] = prog; +	/* allocate and recompute effective prog arrays */ +	css_for_each_descendant_pre(css, &cgrp->self) { +		struct cgroup *desc = container_of(css, struct cgroup, self); -	css_for_each_descendant_pre(pos, &cgrp->self) { -		struct cgroup *desc = container_of(pos, struct cgroup, self); - -		/* skip the subtree if the descendant has its own program */ -		if (desc->bpf.prog[type] && desc != cgrp) { -			pos = css_rightmost_descendant(pos); -		} else { -			rcu_assign_pointer(desc->bpf.effective[type], -					   effective); -			desc->bpf.disallow_override[type] = !overridable; -		} +		err = compute_effective_progs(desc, type, &desc->bpf.inactive); +		if (err) +			goto cleanup;  	} -	if (prog) -		static_branch_inc(&cgroup_bpf_enabled_key); +	/* all allocations were successful. Activate all prog arrays */ +	css_for_each_descendant_pre(css, &cgrp->self) { +		struct cgroup *desc = container_of(css, struct cgroup, self); +		activate_effective_progs(desc, type, desc->bpf.inactive); +		desc->bpf.inactive = NULL; +	} + +	static_branch_inc(&cgroup_bpf_enabled_key);  	if (old_prog) {  		bpf_prog_put(old_prog);  		static_branch_dec(&cgroup_bpf_enabled_key);  	}  	return 0; + +cleanup: +	/* oom while computing effective. Free all computed effective arrays +	 * since they were not activated +	 */ +	css_for_each_descendant_pre(css, &cgrp->self) { +		struct cgroup *desc = container_of(css, struct cgroup, self); + +		bpf_prog_array_free(desc->bpf.inactive); +		desc->bpf.inactive = NULL; +	} + +	/* and cleanup the prog list */ +	pl->prog = old_prog; +	if (pl_was_allocated) { +		list_del(&pl->node); +		kfree(pl); +	} +	return err; +} + +/** + * __cgroup_bpf_detach() - Detach the program from a cgroup, and + *                         propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @prog: A program to detach or NULL + * @type: Type of detach operation + * + * Must be called with cgroup_mutex held. 
+ */ +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, +			enum bpf_attach_type type, u32 unused_flags) +{ +	struct list_head *progs = &cgrp->bpf.progs[type]; +	u32 flags = cgrp->bpf.flags[type]; +	struct bpf_prog *old_prog = NULL; +	struct cgroup_subsys_state *css; +	struct bpf_prog_list *pl; +	int err; + +	if (flags & BPF_F_ALLOW_MULTI) { +		if (!prog) +			/* to detach MULTI prog the user has to specify valid FD +			 * of the program to be detached +			 */ +			return -EINVAL; +	} else { +		if (list_empty(progs)) +			/* report error when trying to detach and nothing is attached */ +			return -ENOENT; +	} + +	if (flags & BPF_F_ALLOW_MULTI) { +		/* find the prog and detach it */ +		list_for_each_entry(pl, progs, node) { +			if (pl->prog != prog) +				continue; +			old_prog = prog; +			/* mark it deleted, so it's ignored while +			 * recomputing effective +			 */ +			pl->prog = NULL; +			break; +		} +		if (!old_prog) +			return -ENOENT; +	} else { +		/* to maintain backward compatibility NONE and OVERRIDE cgroups +		 * allow detaching with invalid FD (prog==NULL) +		 */ +		pl = list_first_entry(progs, typeof(*pl), node); +		old_prog = pl->prog; +		pl->prog = NULL; +	} + +	/* allocate and recompute effective prog arrays */ +	css_for_each_descendant_pre(css, &cgrp->self) { +		struct cgroup *desc = container_of(css, struct cgroup, self); + +		err = compute_effective_progs(desc, type, &desc->bpf.inactive); +		if (err) +			goto cleanup; +	} + +	/* all allocations were successful. Activate all prog arrays */ +	css_for_each_descendant_pre(css, &cgrp->self) { +		struct cgroup *desc = container_of(css, struct cgroup, self); + +		activate_effective_progs(desc, type, desc->bpf.inactive); +		desc->bpf.inactive = NULL; +	} + +	/* now can actually delete it from this cgroup list */ +	list_del(&pl->node); +	kfree(pl); +	if (list_empty(progs)) +		/* last program was detached, reset flags to zero */ +		cgrp->bpf.flags[type] = 0; + +	bpf_prog_put(old_prog); +	static_branch_dec(&cgroup_bpf_enabled_key); +	return 0; + +cleanup: +	/* oom while computing effective. Free all computed effective arrays +	 * since they were not activated +	 */ +	css_for_each_descendant_pre(css, &cgrp->self) { +		struct cgroup *desc = container_of(css, struct cgroup, self); + +		bpf_prog_array_free(desc->bpf.inactive); +		desc->bpf.inactive = NULL; +	} + +	/* and restore back old_prog */ +	pl->prog = old_prog; +	return err; +} + +/* Must be called with cgroup_mutex held to avoid races. 
*/ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, +		       union bpf_attr __user *uattr) +{ +	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); +	enum bpf_attach_type type = attr->query.attach_type; +	struct list_head *progs = &cgrp->bpf.progs[type]; +	u32 flags = cgrp->bpf.flags[type]; +	int cnt, ret = 0, i; + +	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) +		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); +	else +		cnt = prog_list_length(progs); + +	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) +		return -EFAULT; +	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) +		return -EFAULT; +	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) +		/* return early if user requested only program count + flags */ +		return 0; +	if (attr->query.prog_cnt < cnt) { +		cnt = attr->query.prog_cnt; +		ret = -ENOSPC; +	} + +	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { +		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], +						   prog_ids, cnt); +	} else { +		struct bpf_prog_list *pl; +		u32 id; + +		i = 0; +		list_for_each_entry(pl, progs, node) { +			id = pl->prog->aux->id; +			if (copy_to_user(prog_ids + i, &id, sizeof(id))) +				return -EFAULT; +			if (++i == cnt) +				break; +		} +	} +	return ret;  }  /** @@ -171,36 +447,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,  				struct sk_buff *skb,  				enum bpf_attach_type type)  { -	struct bpf_prog *prog; +	unsigned int offset = skb->data - skb_network_header(skb); +	struct sock *save_sk;  	struct cgroup *cgrp; -	int ret = 0; +	int ret;  	if (!sk || !sk_fullsock(sk))  		return 0; -	if (sk->sk_family != AF_INET && -	    sk->sk_family != AF_INET6) +	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)  		return 0;  	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - -	rcu_read_lock(); - -	prog = rcu_dereference(cgrp->bpf.effective[type]); -	if (prog) { -		unsigned int offset = skb->data - skb_network_header(skb); -		struct sock *save_sk = skb->sk; - -		skb->sk = sk; -		__skb_push(skb, offset); -		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; -		__skb_pull(skb, offset); -		skb->sk = save_sk; -	} - -	rcu_read_unlock(); - -	return ret; +	save_sk = skb->sk; +	skb->sk = sk; +	__skb_push(skb, offset); +	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, +				 bpf_prog_run_save_cb); +	__skb_pull(skb, offset); +	skb->sk = save_sk; +	return ret == 1 ? 0 : -EPERM;  }  EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -221,19 +487,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,  			       enum bpf_attach_type type)  {  	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); -	struct bpf_prog *prog; -	int ret = 0; +	int ret; - -	rcu_read_lock(); - -	prog = rcu_dereference(cgrp->bpf.effective[type]); -	if (prog) -		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; - -	rcu_read_unlock(); - -	return ret; +	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); +	return ret == 1 ? 0 : -EPERM;  }  EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -258,18 +515,77 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,  				     enum bpf_attach_type type)  {  	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); -	struct bpf_prog *prog; -	int ret = 0; +	int ret; +	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, +				 BPF_PROG_RUN); +	return ret == 1 ? 
0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, +				      short access, enum bpf_attach_type type) +{ +	struct cgroup *cgrp; +	struct bpf_cgroup_dev_ctx ctx = { +		.access_type = (access << 16) | dev_type, +		.major = major, +		.minor = minor, +	}; +	int allow = 1;  	rcu_read_lock(); +	cgrp = task_dfl_cgroup(current); +	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, +				   BPF_PROG_RUN); +	rcu_read_unlock(); -	prog = rcu_dereference(cgrp->bpf.effective[type]); -	if (prog) -		ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; +	return !allow; +} +EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); -	rcu_read_unlock(); +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id) +{ +	switch (func_id) { +	case BPF_FUNC_map_lookup_elem: +		return &bpf_map_lookup_elem_proto; +	case BPF_FUNC_map_update_elem: +		return &bpf_map_update_elem_proto; +	case BPF_FUNC_map_delete_elem: +		return &bpf_map_delete_elem_proto; +	case BPF_FUNC_get_current_uid_gid: +		return &bpf_get_current_uid_gid_proto; +	case BPF_FUNC_trace_printk: +		if (capable(CAP_SYS_ADMIN)) +			return bpf_get_trace_printk_proto(); +	default: +		return NULL; +	} +} -	return ret; +static bool cgroup_dev_is_valid_access(int off, int size, +				       enum bpf_access_type type, +				       struct bpf_insn_access_aux *info) +{ +	if (type == BPF_WRITE) +		return false; + +	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) +		return false; +	/* The verifier guarantees that size > 0. */ +	if (off % size != 0) +		return false; +	if (size != sizeof(__u32)) +		return false; + +	return true;  } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +const struct bpf_prog_ops cg_dev_prog_ops = { +}; + +const struct bpf_verifier_ops cg_dev_verifier_ops = { +	.get_func_proto		= cgroup_dev_func_proto, +	.is_valid_access	= cgroup_dev_is_valid_access, +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7b62df86be1d..b9f8686a84cf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)  	if (fp == NULL)  		return NULL; -	kmemcheck_annotate_bitfield(fp, meta); -  	aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);  	if (aux == NULL) {  		vfree(fp); @@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,  	if (fp == NULL) {  		__bpf_prog_uncharge(fp_old->aux->user, delta);  	} else { -		kmemcheck_annotate_bitfield(fp, meta); -  		memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);  		fp->pages = pages;  		fp->aux->prog = fp; @@ -309,12 +305,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,  static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)  { +	const char *end = sym + KSYM_NAME_LEN; +  	BUILD_BUG_ON(sizeof("bpf_prog_") + -		     sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); +		     sizeof(prog->tag) * 2 + +		     /* name has been null terminated. +		      * We should need +1 for the '_' preceding +		      * the name.  However, the null character +		      * is double counted between the name and the +		      * sizeof("bpf_prog_") above, so we omit +		      * the +1 here. 
+		      */ +		     sizeof(prog->aux->name) > KSYM_NAME_LEN);  	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");  	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag)); -	*sym = 0; +	if (prog->aux->name[0]) +		snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); +	else +		*sym = 0;  }  static __always_inline unsigned long @@ -662,8 +671,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,  	fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);  	if (fp != NULL) { -		kmemcheck_annotate_bitfield(fp, meta); -  		/* aux->prog still points to the fp_other one, so  		 * when promoting the clone to the real program,  		 * this still needs to be adapted. @@ -1367,7 +1374,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)  	 * valid program, which in this case would simply not  	 * be JITed, but falls back to the interpreter.  	 */ -	fp = bpf_int_jit_compile(fp); +	if (!bpf_prog_is_dev_bound(fp->aux)) { +		fp = bpf_int_jit_compile(fp); +	} else { +		*err = bpf_prog_offload_compile(fp); +		if (*err) +			return fp; +	}  	bpf_prog_lock_ro(fp);  	/* The tail call compatibility check can only be done at @@ -1381,11 +1394,163 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)  }  EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +static unsigned int __bpf_prog_ret1(const void *ctx, +				    const struct bpf_insn *insn) +{ +	return 1; +} + +static struct bpf_prog_dummy { +	struct bpf_prog prog; +} dummy_bpf_prog = { +	.prog = { +		.bpf_func = __bpf_prog_ret1, +	}, +}; + +/* to avoid allocating empty bpf_prog_array for cgroups that + * don't have bpf program attached use one global 'empty_prog_array' + * It will not be modified the caller of bpf_prog_array_alloc() + * (since caller requested prog_cnt == 0) + * that pointer should be 'freed' by bpf_prog_array_free() + */ +static struct { +	struct bpf_prog_array hdr; +	struct bpf_prog *null_prog; +} empty_prog_array = { +	.null_prog = NULL, +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +{ +	if (prog_cnt) +		return kzalloc(sizeof(struct bpf_prog_array) + +			       sizeof(struct bpf_prog *) * (prog_cnt + 1), +			       flags); + +	return &empty_prog_array.hdr; +} + +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +{ +	if (!progs || +	    progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) +		return; +	kfree_rcu(progs, rcu); +} + +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ +	struct bpf_prog **prog; +	u32 cnt = 0; + +	rcu_read_lock(); +	prog = rcu_dereference(progs)->progs; +	for (; *prog; prog++) +		cnt++; +	rcu_read_unlock(); +	return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +				__u32 __user *prog_ids, u32 cnt) +{ +	struct bpf_prog **prog; +	u32 i = 0, id; + +	rcu_read_lock(); +	prog = rcu_dereference(progs)->progs; +	for (; *prog; prog++) { +		id = (*prog)->aux->id; +		if (copy_to_user(prog_ids + i, &id, sizeof(id))) { +			rcu_read_unlock(); +			return -EFAULT; +		} +		if (++i == cnt) { +			prog++; +			break; +		} +	} +	rcu_read_unlock(); +	if (*prog) +		return -ENOSPC; +	return 0; +} + +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +				struct bpf_prog *old_prog) +{ +	struct bpf_prog **prog = progs->progs; + +	for (; *prog; prog++) +		if (*prog == old_prog) { +			WRITE_ONCE(*prog, &dummy_bpf_prog.prog); +			break; +		} +} + +int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, +			struct bpf_prog *exclude_prog, +			
struct bpf_prog *include_prog, +			struct bpf_prog_array **new_array) +{ +	int new_prog_cnt, carry_prog_cnt = 0; +	struct bpf_prog **existing_prog; +	struct bpf_prog_array *array; +	int new_prog_idx = 0; + +	/* Figure out how many existing progs we need to carry over to +	 * the new array. +	 */ +	if (old_array) { +		existing_prog = old_array->progs; +		for (; *existing_prog; existing_prog++) { +			if (*existing_prog != exclude_prog && +			    *existing_prog != &dummy_bpf_prog.prog) +				carry_prog_cnt++; +			if (*existing_prog == include_prog) +				return -EEXIST; +		} +	} + +	/* How many progs (not NULL) will be in the new array? */ +	new_prog_cnt = carry_prog_cnt; +	if (include_prog) +		new_prog_cnt += 1; + +	/* Do we have any prog (not NULL) in the new array? */ +	if (!new_prog_cnt) { +		*new_array = NULL; +		return 0; +	} + +	/* +1 as the end of prog_array is marked with NULL */ +	array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); +	if (!array) +		return -ENOMEM; + +	/* Fill in the new prog array */ +	if (carry_prog_cnt) { +		existing_prog = old_array->progs; +		for (; *existing_prog; existing_prog++) +			if (*existing_prog != exclude_prog && +			    *existing_prog != &dummy_bpf_prog.prog) +				array->progs[new_prog_idx++] = *existing_prog; +	} +	if (include_prog) +		array->progs[new_prog_idx++] = include_prog; +	array->progs[new_prog_idx] = NULL; +	*new_array = array; +	return 0; +} +  static void bpf_prog_free_deferred(struct work_struct *work)  {  	struct bpf_prog_aux *aux;  	aux = container_of(work, struct bpf_prog_aux, work); +	if (bpf_prog_is_dev_bound(aux)) +		bpf_prog_offload_destroy(aux->prog);  	bpf_jit_free(aux->prog);  } @@ -1498,5 +1663,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,  EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL  EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);  EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); +#endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c new file mode 100644 index 000000000000..ce5b669003b2 --- /dev/null +++ b/kernel/bpf/cpumap.c @@ -0,0 +1,706 @@ +/* bpf/cpumap.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2.  See COPYING. + */ + +/* The 'cpumap' is primarily used as a backend map for XDP BPF helper + * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. + * + * Unlike devmap which redirects XDP frames out another NIC device, + * this map type redirects raw XDP frames to another CPU.  The remote + * CPU will do SKB-allocation and call the normal network stack. + * + * This is a scalability and isolation mechanism, that allow + * separating the early driver network XDP layer, from the rest of the + * netstack, and assigning dedicated CPUs for this stage.  This + * basically allows for 10G wirespeed pre-filtering via bpf. + */ +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/ptr_ring.h> + +#include <linux/sched.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/capability.h> +#include <trace/events/xdp.h> + +#include <linux/netdevice.h>   /* netif_receive_skb_core */ +#include <linux/etherdevice.h> /* eth_type_trans */ + +/* General idea: XDP packets getting XDP redirected to another CPU, + * will maximum be stored/queued for one driver ->poll() call.  It is + * guaranteed that setting flush bit and flush operation happen on + * same CPU.  
Thus, cpu_map_flush operation can deduct via this_cpu_ptr() + * which queue in bpf_cpu_map_entry contains packets. + */ + +#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */ +struct xdp_bulk_queue { +	void *q[CPU_MAP_BULK_SIZE]; +	unsigned int count; +}; + +/* Struct for every remote "destination" CPU in map */ +struct bpf_cpu_map_entry { +	u32 cpu;    /* kthread CPU and map index */ +	int map_id; /* Back reference to map */ +	u32 qsize;  /* Queue size placeholder for map lookup */ + +	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */ +	struct xdp_bulk_queue __percpu *bulkq; + +	/* Queue with potential multi-producers, and single-consumer kthread */ +	struct ptr_ring *queue; +	struct task_struct *kthread; +	struct work_struct kthread_stop_wq; + +	atomic_t refcnt; /* Control when this struct can be free'ed */ +	struct rcu_head rcu; +}; + +struct bpf_cpu_map { +	struct bpf_map map; +	/* Below members specific for map type */ +	struct bpf_cpu_map_entry **cpu_map; +	unsigned long __percpu *flush_needed; +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, +			     struct xdp_bulk_queue *bq); + +static u64 cpu_map_bitmap_size(const union bpf_attr *attr) +{ +	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +} + +static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) +{ +	struct bpf_cpu_map *cmap; +	int err = -ENOMEM; +	u64 cost; +	int ret; + +	if (!capable(CAP_SYS_ADMIN)) +		return ERR_PTR(-EPERM); + +	/* check sanity of attributes */ +	if (attr->max_entries == 0 || attr->key_size != 4 || +	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) +		return ERR_PTR(-EINVAL); + +	cmap = kzalloc(sizeof(*cmap), GFP_USER); +	if (!cmap) +		return ERR_PTR(-ENOMEM); + +	/* mandatory map attributes */ +	cmap->map.map_type = attr->map_type; +	cmap->map.key_size = attr->key_size; +	cmap->map.value_size = attr->value_size; +	cmap->map.max_entries = attr->max_entries; +	cmap->map.map_flags = attr->map_flags; +	cmap->map.numa_node = bpf_map_attr_numa_node(attr); + +	/* Pre-limit array size based on NR_CPUS, not final CPU check */ +	if (cmap->map.max_entries > NR_CPUS) { +		err = -E2BIG; +		goto free_cmap; +	} + +	/* make sure page count doesn't overflow */ +	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); +	cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); +	if (cost >= U32_MAX - PAGE_SIZE) +		goto free_cmap; +	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + +	/* Notice returns -EPERM on if map size is larger than memlock limit */ +	ret = bpf_map_precharge_memlock(cmap->map.pages); +	if (ret) { +		err = ret; +		goto free_cmap; +	} + +	/* A per cpu bitfield with a bit per possible CPU in map  */ +	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), +					    __alignof__(unsigned long)); +	if (!cmap->flush_needed) +		goto free_cmap; + +	/* Alloc array for possible remote "destination" CPUs */ +	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * +					   sizeof(struct bpf_cpu_map_entry *), +					   cmap->map.numa_node); +	if (!cmap->cpu_map) +		goto free_percpu; + +	return &cmap->map; +free_percpu: +	free_percpu(cmap->flush_needed); +free_cmap: +	kfree(cmap); +	return ERR_PTR(err); +} + +void __cpu_map_queue_destructor(void *ptr) +{ +	/* The tear-down procedure should have made sure that queue is +	 * empty.  See __cpu_map_entry_replace() and work-queue +	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour +	 * gracefully and warn once. 
+	 */ +	if (WARN_ON_ONCE(ptr)) +		page_frag_free(ptr); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ +	if (atomic_dec_and_test(&rcpu->refcnt)) { +		/* The queue should be empty at this point */ +		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); +		kfree(rcpu->queue); +		kfree(rcpu); +	} +} + +static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ +	atomic_inc(&rcpu->refcnt); +} + +/* called from workqueue, to workaround syscall using preempt_disable */ +static void cpu_map_kthread_stop(struct work_struct *work) +{ +	struct bpf_cpu_map_entry *rcpu; + +	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); + +	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, +	 * as it waits until all in-flight call_rcu() callbacks complete. +	 */ +	rcu_barrier(); + +	/* kthread_stop will wake_up_process and wait for it to complete */ +	kthread_stop(rcpu->kthread); +} + +/* For now, xdp_pkt is a cpumap internal data structure, with info + * carried between enqueue to dequeue. It is mapped into the top + * headroom of the packet, to avoid allocating separate mem. + */ +struct xdp_pkt { +	void *data; +	u16 len; +	u16 headroom; +	u16 metasize; +	struct net_device *dev_rx; +}; + +/* Convert xdp_buff to xdp_pkt */ +static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) +{ +	struct xdp_pkt *xdp_pkt; +	int metasize; +	int headroom; + +	/* Assure headroom is available for storing info */ +	headroom = xdp->data - xdp->data_hard_start; +	metasize = xdp->data - xdp->data_meta; +	metasize = metasize > 0 ? metasize : 0; +	if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) +		return NULL; + +	/* Store info in top of packet */ +	xdp_pkt = xdp->data_hard_start; + +	xdp_pkt->data = xdp->data; +	xdp_pkt->len  = xdp->data_end - xdp->data; +	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); +	xdp_pkt->metasize = metasize; + +	return xdp_pkt; +} + +struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, +				  struct xdp_pkt *xdp_pkt) +{ +	unsigned int frame_size; +	void *pkt_data_start; +	struct sk_buff *skb; + +	/* build_skb need to place skb_shared_info after SKB end, and +	 * also want to know the memory "truesize".  Thus, need to +	 * know the memory frame size backing xdp_buff. +	 * +	 * XDP was designed to have PAGE_SIZE frames, but this +	 * assumption is not longer true with ixgbe and i40e.  It +	 * would be preferred to set frame_size to 2048 or 4096 +	 * depending on the driver. +	 *   frame_size = 2048; +	 *   frame_len  = frame_size - sizeof(*xdp_pkt); +	 * +	 * Instead, with info avail, skb_shared_info in placed after +	 * packet len.  This, unfortunately fakes the truesize. +	 * Another disadvantage of this approach, the skb_shared_info +	 * is not at a fixed memory location, with mixed length +	 * packets, which is bad for cache-line hotness. 
+	 */ +	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + +		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + +	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; +	skb = build_skb(pkt_data_start, frame_size); +	if (!skb) +		return NULL; + +	skb_reserve(skb, xdp_pkt->headroom); +	__skb_put(skb, xdp_pkt->len); +	if (xdp_pkt->metasize) +		skb_metadata_set(skb, xdp_pkt->metasize); + +	/* Essential SKB info: protocol and skb->dev */ +	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + +	/* Optional SKB info, currently missing: +	 * - HW checksum info		(skb->ip_summed) +	 * - HW RX hash			(skb_set_hash) +	 * - RX ring dev queue index	(skb_record_rx_queue) +	 */ + +	return skb; +} + +static int cpu_map_kthread_run(void *data) +{ +	struct bpf_cpu_map_entry *rcpu = data; + +	set_current_state(TASK_INTERRUPTIBLE); + +	/* When kthread gives stop order, then rcpu have been disconnected +	 * from map, thus no new packets can enter. Remaining in-flight +	 * per CPU stored packets are flushed to this queue.  Wait honoring +	 * kthread_stop signal until queue is empty. +	 */ +	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { +		unsigned int processed = 0, drops = 0, sched = 0; +		struct xdp_pkt *xdp_pkt; + +		/* Release CPU reschedule checks */ +		if (__ptr_ring_empty(rcpu->queue)) { +			set_current_state(TASK_INTERRUPTIBLE); +			/* Recheck to avoid lost wake-up */ +			if (__ptr_ring_empty(rcpu->queue)) { +				schedule(); +				sched = 1; +			} else { +				__set_current_state(TASK_RUNNING); +			} +		} else { +			sched = cond_resched(); +		} + +		/* Process packets in rcpu->queue */ +		local_bh_disable(); +		/* +		 * The bpf_cpu_map_entry is single consumer, with this +		 * kthread CPU pinned. Lockless access to ptr_ring +		 * consume side valid as no-resize allowed of queue. 
+		 */ +		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { +			struct sk_buff *skb; +			int ret; + +			skb = cpu_map_build_skb(rcpu, xdp_pkt); +			if (!skb) { +				page_frag_free(xdp_pkt); +				continue; +			} + +			/* Inject into network stack */ +			ret = netif_receive_skb_core(skb); +			if (ret == NET_RX_DROP) +				drops++; + +			/* Limit BH-disable period */ +			if (++processed == 8) +				break; +		} +		/* Feedback loop via tracepoint */ +		trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + +		local_bh_enable(); /* resched point, may call do_softirq() */ +	} +	__set_current_state(TASK_RUNNING); + +	put_cpu_map_entry(rcpu); +	return 0; +} + +struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +{ +	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; +	struct bpf_cpu_map_entry *rcpu; +	int numa, err; + +	/* Have map->numa_node, but choose node of redirect target CPU */ +	numa = cpu_to_node(cpu); + +	rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); +	if (!rcpu) +		return NULL; + +	/* Alloc percpu bulkq */ +	rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), +					 sizeof(void *), gfp); +	if (!rcpu->bulkq) +		goto free_rcu; + +	/* Alloc queue */ +	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); +	if (!rcpu->queue) +		goto free_bulkq; + +	err = ptr_ring_init(rcpu->queue, qsize, gfp); +	if (err) +		goto free_queue; + +	rcpu->cpu    = cpu; +	rcpu->map_id = map_id; +	rcpu->qsize  = qsize; + +	/* Setup kthread */ +	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, +					       "cpumap/%d/map:%d", cpu, map_id); +	if (IS_ERR(rcpu->kthread)) +		goto free_ptr_ring; + +	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ +	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + +	/* Make sure kthread runs on a single CPU */ +	kthread_bind(rcpu->kthread, cpu); +	wake_up_process(rcpu->kthread); + +	return rcpu; + +free_ptr_ring: +	ptr_ring_cleanup(rcpu->queue, NULL); +free_queue: +	kfree(rcpu->queue); +free_bulkq: +	free_percpu(rcpu->bulkq); +free_rcu: +	kfree(rcpu); +	return NULL; +} + +void __cpu_map_entry_free(struct rcu_head *rcu) +{ +	struct bpf_cpu_map_entry *rcpu; +	int cpu; + +	/* This cpu_map_entry have been disconnected from map and one +	 * RCU graze-period have elapsed.  Thus, XDP cannot queue any +	 * new packets and cannot change/set flush_needed that can +	 * find this entry. +	 */ +	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); + +	/* Flush remaining packets in percpu bulkq */ +	for_each_online_cpu(cpu) { +		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); + +		/* No concurrent bq_enqueue can run at this point */ +		bq_flush_to_queue(rcpu, bq); +	} +	free_percpu(rcpu->bulkq); +	/* Cannot kthread_stop() here, last put free rcpu resources */ +	put_cpu_map_entry(rcpu); +} + +/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to + * ensure any driver rcu critical sections have completed, but this + * does not guarantee a flush has happened yet. Because driver side + * rcu_read_lock/unlock only protects the running XDP program.  The + * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a + * pending flush op doesn't fail. + * + * The bpf_cpu_map_entry is still used by the kthread, and there can + * still be pending packets (in queue and percpu bulkq).  A refcnt + * makes sure to last user (kthread_stop vs. call_rcu) free memory + * resources. + * + * The rcu callback __cpu_map_entry_free flush remaining packets in + * percpu bulkq to queue.  
Due to caller map_delete_elem() disable + * preemption, cannot call kthread_stop() to make sure queue is empty. + * Instead a work_queue is started for stopping kthread, + * cpu_map_kthread_stop, which waits for an RCU graze period before + * stopping kthread, emptying the queue. + */ +void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, +			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +{ +	struct bpf_cpu_map_entry *old_rcpu; + +	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); +	if (old_rcpu) { +		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); +		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); +		schedule_work(&old_rcpu->kthread_stop_wq); +	} +} + +int cpu_map_delete_elem(struct bpf_map *map, void *key) +{ +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); +	u32 key_cpu = *(u32 *)key; + +	if (key_cpu >= map->max_entries) +		return -EINVAL; + +	/* notice caller map_delete_elem() use preempt_disable() */ +	__cpu_map_entry_replace(cmap, key_cpu, NULL); +	return 0; +} + +int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, +				u64 map_flags) +{ +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); +	struct bpf_cpu_map_entry *rcpu; + +	/* Array index key correspond to CPU number */ +	u32 key_cpu = *(u32 *)key; +	/* Value is the queue size */ +	u32 qsize = *(u32 *)value; + +	if (unlikely(map_flags > BPF_EXIST)) +		return -EINVAL; +	if (unlikely(key_cpu >= cmap->map.max_entries)) +		return -E2BIG; +	if (unlikely(map_flags == BPF_NOEXIST)) +		return -EEXIST; +	if (unlikely(qsize > 16384)) /* sanity limit on qsize */ +		return -EOVERFLOW; + +	/* Make sure CPU is a valid possible cpu */ +	if (!cpu_possible(key_cpu)) +		return -ENODEV; + +	if (qsize == 0) { +		rcpu = NULL; /* Same as deleting */ +	} else { +		/* Updating qsize cause re-allocation of bpf_cpu_map_entry */ +		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); +		if (!rcpu) +			return -ENOMEM; +	} +	rcu_read_lock(); +	__cpu_map_entry_replace(cmap, key_cpu, rcpu); +	rcu_read_unlock(); +	return 0; +} + +void cpu_map_free(struct bpf_map *map) +{ +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); +	int cpu; +	u32 i; + +	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, +	 * so the bpf programs (can be more than one that used this map) were +	 * disconnected from events. Wait for outstanding critical sections in +	 * these programs to complete. The rcu critical section only guarantees +	 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. +	 * It does __not__ ensure pending flush operations (if any) are +	 * complete. +	 */ +	synchronize_rcu(); + +	/* To ensure all pending flush operations have completed wait for flush +	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. +	 * Because the above synchronize_rcu() ensures the map is disconnected +	 * from the program we can assume no new bits will be set. +	 */ +	for_each_online_cpu(cpu) { +		unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + +		while (!bitmap_empty(bitmap, cmap->map.max_entries)) +			cond_resched(); +	} + +	/* For cpu_map the remote CPUs can still be using the entries +	 * (struct bpf_cpu_map_entry). 
+	 */ +	for (i = 0; i < cmap->map.max_entries; i++) { +		struct bpf_cpu_map_entry *rcpu; + +		rcpu = READ_ONCE(cmap->cpu_map[i]); +		if (!rcpu) +			continue; + +		/* bq flush and cleanup happens after RCU graze-period */ +		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ +	} +	free_percpu(cmap->flush_needed); +	bpf_map_area_free(cmap->cpu_map); +	kfree(cmap); +} + +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); +	struct bpf_cpu_map_entry *rcpu; + +	if (key >= map->max_entries) +		return NULL; + +	rcpu = READ_ONCE(cmap->cpu_map[key]); +	return rcpu; +} + +static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) +{ +	struct bpf_cpu_map_entry *rcpu = +		__cpu_map_lookup_elem(map, *(u32 *)key); + +	return rcpu ? &rcpu->qsize : NULL; +} + +static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); +	u32 index = key ? *(u32 *)key : U32_MAX; +	u32 *next = next_key; + +	if (index >= cmap->map.max_entries) { +		*next = 0; +		return 0; +	} + +	if (index == cmap->map.max_entries - 1) +		return -ENOENT; +	*next = index + 1; +	return 0; +} + +const struct bpf_map_ops cpu_map_ops = { +	.map_alloc		= cpu_map_alloc, +	.map_free		= cpu_map_free, +	.map_delete_elem	= cpu_map_delete_elem, +	.map_update_elem	= cpu_map_update_elem, +	.map_lookup_elem	= cpu_map_lookup_elem, +	.map_get_next_key	= cpu_map_get_next_key, +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, +			     struct xdp_bulk_queue *bq) +{ +	unsigned int processed = 0, drops = 0; +	const int to_cpu = rcpu->cpu; +	struct ptr_ring *q; +	int i; + +	if (unlikely(!bq->count)) +		return 0; + +	q = rcpu->queue; +	spin_lock(&q->producer_lock); + +	for (i = 0; i < bq->count; i++) { +		void *xdp_pkt = bq->q[i]; +		int err; + +		err = __ptr_ring_produce(q, xdp_pkt); +		if (err) { +			drops++; +			page_frag_free(xdp_pkt); /* Free xdp_pkt */ +		} +		processed++; +	} +	bq->count = 0; +	spin_unlock(&q->producer_lock); + +	/* Feedback loop via tracepoints */ +	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); +	return 0; +} + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +{ +	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + +	if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) +		bq_flush_to_queue(rcpu, bq); + +	/* Notice, xdp_buff/page MUST be queued here, long enough for +	 * driver to code invoking us to finished, due to driver +	 * (e.g. ixgbe) recycle tricks based on page-refcnt. +	 * +	 * Thus, incoming xdp_pkt is always queued here (else we race +	 * with another CPU on page-refcnt and remaining driver code). +	 * Queue time is very short, as driver will invoke flush +	 * operation, when completing napi->poll call. 
+	 */ +	bq->q[bq->count++] = xdp_pkt; +	return 0; +} + +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +		    struct net_device *dev_rx) +{ +	struct xdp_pkt *xdp_pkt; + +	xdp_pkt = convert_to_xdp_pkt(xdp); +	if (unlikely(!xdp_pkt)) +		return -EOVERFLOW; + +	/* Info needed when constructing SKB on remote CPU */ +	xdp_pkt->dev_rx = dev_rx; + +	bq_enqueue(rcpu, xdp_pkt); +	return 0; +} + +void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) +{ +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); +	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + +	__set_bit(bit, bitmap); +} + +void __cpu_map_flush(struct bpf_map *map) +{ +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); +	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); +	u32 bit; + +	/* The napi->poll softirq makes sure __cpu_map_insert_ctx() +	 * and __cpu_map_flush() happen on same CPU. Thus, the percpu +	 * bitmap indicate which percpu bulkq have packets. +	 */ +	for_each_set_bit(bit, bitmap, map->max_entries) { +		struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); +		struct xdp_bulk_queue *bq; + +		/* This is possible if entry is removed by user space +		 * between xdp redirect and flush op. +		 */ +		if (unlikely(!rcpu)) +			continue; + +		__clear_bit(bit, bitmap); + +		/* Flush all frames in bulkq to real queue */ +		bq = this_cpu_ptr(rcpu->bulkq); +		bq_flush_to_queue(rcpu, bq); + +		/* If already running, costs spin_lock_irqsave + smb_mb */ +		wake_up_process(rcpu->kthread); +	} +} diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e093d9a2c4dd..ebdef54bf7df 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -50,6 +50,9 @@  #include <linux/bpf.h>  #include <linux/filter.h> +#define DEV_CREATE_FLAG_MASK \ +	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +  struct bpf_dtab_netdev {  	struct net_device *dev;  	struct bpf_dtab *dtab; @@ -69,7 +72,7 @@ static LIST_HEAD(dev_map_list);  static u64 dev_map_bitmap_size(const union bpf_attr *attr)  { -	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);  }  static struct bpf_map *dev_map_alloc(union bpf_attr *attr) @@ -78,9 +81,12 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)  	int err = -EINVAL;  	u64 cost; +	if (!capable(CAP_NET_ADMIN)) +		return ERR_PTR(-EPERM); +  	/* check sanity of attributes */  	if (attr->max_entries == 0 || attr->key_size != 4 || -	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) +	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)  		return ERR_PTR(-EINVAL);  	dtab = kzalloc(sizeof(*dtab), GFP_USER); @@ -111,8 +117,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)  	err = -ENOMEM;  	/* A per cpu bitfield with a bit per possible net device */ -	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), -					    __alignof__(unsigned long)); +	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), +						__alignof__(unsigned long), +						GFP_KERNEL | __GFP_NOWARN);  	if (!dtab->flush_needed)  		goto free_dtab; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c new file mode 100644 index 000000000000..e682850c9715 --- /dev/null +++ b/kernel/bpf/disasm.c @@ -0,0 +1,214 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of 
the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/bpf.h> + +#include "disasm.h" + +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { +	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +const char *func_id_name(int id) +{ +	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + +	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) +		return func_id_str[id]; +	else +		return "unknown"; +} + +const char *const bpf_class_string[8] = { +	[BPF_LD]    = "ld", +	[BPF_LDX]   = "ldx", +	[BPF_ST]    = "st", +	[BPF_STX]   = "stx", +	[BPF_ALU]   = "alu", +	[BPF_JMP]   = "jmp", +	[BPF_RET]   = "BUG", +	[BPF_ALU64] = "alu64", +}; + +const char *const bpf_alu_string[16] = { +	[BPF_ADD >> 4]  = "+=", +	[BPF_SUB >> 4]  = "-=", +	[BPF_MUL >> 4]  = "*=", +	[BPF_DIV >> 4]  = "/=", +	[BPF_OR  >> 4]  = "|=", +	[BPF_AND >> 4]  = "&=", +	[BPF_LSH >> 4]  = "<<=", +	[BPF_RSH >> 4]  = ">>=", +	[BPF_NEG >> 4]  = "neg", +	[BPF_MOD >> 4]  = "%=", +	[BPF_XOR >> 4]  = "^=", +	[BPF_MOV >> 4]  = "=", +	[BPF_ARSH >> 4] = "s>>=", +	[BPF_END >> 4]  = "endian", +}; + +static const char *const bpf_ldst_string[] = { +	[BPF_W >> 3]  = "u32", +	[BPF_H >> 3]  = "u16", +	[BPF_B >> 3]  = "u8", +	[BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[16] = { +	[BPF_JA >> 4]   = "jmp", +	[BPF_JEQ >> 4]  = "==", +	[BPF_JGT >> 4]  = ">", +	[BPF_JLT >> 4]  = "<", +	[BPF_JGE >> 4]  = ">=", +	[BPF_JLE >> 4]  = "<=", +	[BPF_JSET >> 4] = "&", +	[BPF_JNE >> 4]  = "!=", +	[BPF_JSGT >> 4] = "s>", +	[BPF_JSLT >> 4] = "s<", +	[BPF_JSGE >> 4] = "s>=", +	[BPF_JSLE >> 4] = "s<=", +	[BPF_CALL >> 4] = "call", +	[BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_end_insn(bpf_insn_print_cb verbose, +			       struct bpf_verifier_env *env, +			       const struct bpf_insn *insn) +{ +	verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, +		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", +		insn->imm, insn->dst_reg); +} + +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, +		    const struct bpf_insn *insn, bool allow_ptr_leaks) +{ +	u8 class = BPF_CLASS(insn->code); + +	if (class == BPF_ALU || class == BPF_ALU64) { +		if (BPF_OP(insn->code) == BPF_END) { +			if (class == BPF_ALU64) +				verbose(env, "BUG_alu64_%02x\n", insn->code); +			else +				print_bpf_end_insn(verbose, env, insn); +		} else if (BPF_OP(insn->code) == BPF_NEG) { +			verbose(env, "(%02x) r%d = %s-r%d\n", +				insn->code, insn->dst_reg, +				class == BPF_ALU ? "(u32) " : "", +				insn->dst_reg); +		} else if (BPF_SRC(insn->code) == BPF_X) { +			verbose(env, "(%02x) %sr%d %s %sr%d\n", +				insn->code, class == BPF_ALU ? "(u32) " : "", +				insn->dst_reg, +				bpf_alu_string[BPF_OP(insn->code) >> 4], +				class == BPF_ALU ? "(u32) " : "", +				insn->src_reg); +		} else { +			verbose(env, "(%02x) %sr%d %s %s%d\n", +				insn->code, class == BPF_ALU ? "(u32) " : "", +				insn->dst_reg, +				bpf_alu_string[BPF_OP(insn->code) >> 4], +				class == BPF_ALU ? 
"(u32) " : "", +				insn->imm); +		} +	} else if (class == BPF_STX) { +		if (BPF_MODE(insn->code) == BPF_MEM) +			verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", +				insn->code, +				bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +				insn->dst_reg, +				insn->off, insn->src_reg); +		else if (BPF_MODE(insn->code) == BPF_XADD) +			verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", +				insn->code, +				bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +				insn->dst_reg, insn->off, +				insn->src_reg); +		else +			verbose(env, "BUG_%02x\n", insn->code); +	} else if (class == BPF_ST) { +		if (BPF_MODE(insn->code) != BPF_MEM) { +			verbose(env, "BUG_st_%02x\n", insn->code); +			return; +		} +		verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", +			insn->code, +			bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +			insn->dst_reg, +			insn->off, insn->imm); +	} else if (class == BPF_LDX) { +		if (BPF_MODE(insn->code) != BPF_MEM) { +			verbose(env, "BUG_ldx_%02x\n", insn->code); +			return; +		} +		verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", +			insn->code, insn->dst_reg, +			bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +			insn->src_reg, insn->off); +	} else if (class == BPF_LD) { +		if (BPF_MODE(insn->code) == BPF_ABS) { +			verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", +				insn->code, +				bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +				insn->imm); +		} else if (BPF_MODE(insn->code) == BPF_IND) { +			verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", +				insn->code, +				bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +				insn->src_reg, insn->imm); +		} else if (BPF_MODE(insn->code) == BPF_IMM && +			   BPF_SIZE(insn->code) == BPF_DW) { +			/* At this point, we already made sure that the second +			 * part of the ldimm64 insn is accessible. +			 */ +			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; +			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + +			if (map_ptr && !allow_ptr_leaks) +				imm = 0; + +			verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, +				insn->dst_reg, (unsigned long long)imm); +		} else { +			verbose(env, "BUG_ld_%02x\n", insn->code); +			return; +		} +	} else if (class == BPF_JMP) { +		u8 opcode = BPF_OP(insn->code); + +		if (opcode == BPF_CALL) { +			verbose(env, "(%02x) call %s#%d\n", insn->code, +				func_id_name(insn->imm), insn->imm); +		} else if (insn->code == (BPF_JMP | BPF_JA)) { +			verbose(env, "(%02x) goto pc%+d\n", +				insn->code, insn->off); +		} else if (insn->code == (BPF_JMP | BPF_EXIT)) { +			verbose(env, "(%02x) exit\n", insn->code); +		} else if (BPF_SRC(insn->code) == BPF_X) { +			verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", +				insn->code, insn->dst_reg, +				bpf_jmp_string[BPF_OP(insn->code) >> 4], +				insn->src_reg, insn->off); +		} else { +			verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", +				insn->code, insn->dst_reg, +				bpf_jmp_string[BPF_OP(insn->code) >> 4], +				insn->imm, insn->off); +		} +	} else { +		verbose(env, "(%02x) %s\n", +			insn->code, bpf_class_string[class]); +	} +} diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h new file mode 100644 index 000000000000..8de977e420b6 --- /dev/null +++ b/kernel/bpf/disasm.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef __BPF_DISASM_H__ +#define __BPF_DISASM_H__ + +#include <linux/bpf.h> +#include <linux/kernel.h> +#include <linux/stringify.h> + +extern const char *const bpf_alu_string[16]; +extern const char *const bpf_class_string[8]; + +const char *func_id_name(int id); + +struct bpf_verifier_env; +typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, +				  const char *, ...); +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, +		    const struct bpf_insn *insn, bool allow_ptr_leaks); + +#endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 431126f31ea3..e469e05c8e83 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,8 +18,9 @@  #include "bpf_lru_list.h"  #include "map_in_map.h" -#define HTAB_CREATE_FLAG_MASK \ -	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE) +#define HTAB_CREATE_FLAG_MASK						\ +	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\ +	 BPF_F_RDONLY | BPF_F_WRONLY)  struct bucket {  	struct hlist_nulls_head head; @@ -317,10 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)  		 */  		goto free_htab; -	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) -		/* make sure the size for pcpu_alloc() is reasonable */ -		goto free_htab; -  	htab->elem_size = sizeof(struct htab_elem) +  			  round_up(htab->map.key_size, 8);  	if (percpu) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e833ed914358..01aaef1a77c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -295,7 +295,7 @@ out:  }  static void *bpf_obj_do_get(const struct filename *pathname, -			    enum bpf_type *type) +			    enum bpf_type *type, int flags)  {  	struct inode *inode;  	struct path path; @@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,  		return ERR_PTR(ret);  	inode = d_backing_inode(path.dentry); -	ret = inode_permission(inode, MAY_WRITE); +	ret = inode_permission(inode, ACC_MODE(flags));  	if (ret)  		goto out; @@ -326,18 +326,23 @@ out:  	return ERR_PTR(ret);  } -int bpf_obj_get_user(const char __user *pathname) +int bpf_obj_get_user(const char __user *pathname, int flags)  {  	enum bpf_type type = BPF_TYPE_UNSPEC;  	struct filename *pname;  	int ret = -ENOENT; +	int f_flags;  	void *raw; +	f_flags = bpf_get_file_flag(flags); +	if (f_flags < 0) +		return f_flags; +  	pname = getname(pathname);  	if (IS_ERR(pname))  		return PTR_ERR(pname); -	raw = bpf_obj_do_get(pname, &type); +	raw = bpf_obj_do_get(pname, &type, f_flags);  	if (IS_ERR(raw)) {  		ret = PTR_ERR(raw);  		goto out; @@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)  	if (type == BPF_TYPE_PROG)  		ret = bpf_prog_new_fd(raw);  	else if (type == BPF_TYPE_MAP) -		ret = bpf_map_new_fd(raw); +		ret = bpf_map_new_fd(raw, f_flags);  	else  		goto out; @@ -363,6 +368,7 @@ out:  	putname(pname);  	return ret;  } +EXPORT_SYMBOL_GPL(bpf_obj_get_user);  static void bpf_evict_inode(struct inode *inode)  { diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1b767844a76f..885e45479680 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -389,10 +389,99 @@ out:  	return ret;  } -static int trie_delete_elem(struct bpf_map *map, void *key) +/* Called from syscall or from eBPF program */ 
+static int trie_delete_elem(struct bpf_map *map, void *_key)  { -	/* TODO */ -	return -ENOSYS; +	struct lpm_trie *trie = container_of(map, struct lpm_trie, map); +	struct bpf_lpm_trie_key *key = _key; +	struct lpm_trie_node __rcu **trim, **trim2; +	struct lpm_trie_node *node, *parent; +	unsigned long irq_flags; +	unsigned int next_bit; +	size_t matchlen = 0; +	int ret = 0; + +	if (key->prefixlen > trie->max_prefixlen) +		return -EINVAL; + +	raw_spin_lock_irqsave(&trie->lock, irq_flags); + +	/* Walk the tree looking for an exact key/length match and keeping +	 * track of the path we traverse.  We will need to know the node +	 * we wish to delete, and the slot that points to the node we want +	 * to delete.  We may also need to know the nodes parent and the +	 * slot that contains it. +	 */ +	trim = &trie->root; +	trim2 = trim; +	parent = NULL; +	while ((node = rcu_dereference_protected( +		       *trim, lockdep_is_held(&trie->lock)))) { +		matchlen = longest_prefix_match(trie, node, key); + +		if (node->prefixlen != matchlen || +		    node->prefixlen == key->prefixlen) +			break; + +		parent = node; +		trim2 = trim; +		next_bit = extract_bit(key->data, node->prefixlen); +		trim = &node->child[next_bit]; +	} + +	if (!node || node->prefixlen != key->prefixlen || +	    (node->flags & LPM_TREE_NODE_FLAG_IM)) { +		ret = -ENOENT; +		goto out; +	} + +	trie->n_entries--; + +	/* If the node we are removing has two children, simply mark it +	 * as intermediate and we are done. +	 */ +	if (rcu_access_pointer(node->child[0]) && +	    rcu_access_pointer(node->child[1])) { +		node->flags |= LPM_TREE_NODE_FLAG_IM; +		goto out; +	} + +	/* If the parent of the node we are about to delete is an intermediate +	 * node, and the deleted node doesn't have any children, we can delete +	 * the intermediate parent as well and promote its other child +	 * up the tree.  Doing this maintains the invariant that all +	 * intermediate nodes have exactly 2 children and that there are no +	 * unnecessary intermediate nodes in the tree. +	 */ +	if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) && +	    !node->child[0] && !node->child[1]) { +		if (node == rcu_access_pointer(parent->child[0])) +			rcu_assign_pointer( +				*trim2, rcu_access_pointer(parent->child[1])); +		else +			rcu_assign_pointer( +				*trim2, rcu_access_pointer(parent->child[0])); +		kfree_rcu(parent, rcu); +		kfree_rcu(node, rcu); +		goto out; +	} + +	/* The node we are removing has either zero or one child. If there +	 * is a child, move it into the removed node's slot then delete +	 * the node.  Otherwise just clear the slot and delete the node. 
+	 */ +	if (node->child[0]) +		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0])); +	else if (node->child[1]) +		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); +	else +		RCU_INIT_POINTER(*trim, NULL); +	kfree_rcu(node, rcu); + +out: +	raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + +	return ret;  }  #define LPM_DATA_SIZE_MAX	256 @@ -406,7 +495,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)  #define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)  #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) -#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) +#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\ +				 BPF_F_RDONLY | BPF_F_WRONLY)  static struct bpf_map *trie_alloc(union bpf_attr *attr)  { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c new file mode 100644 index 000000000000..68ec884440b7 --- /dev/null +++ b/kernel/bpf/offload.c @@ -0,0 +1,191 @@ +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/bug.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/printk.h> +#include <linux/rtnetlink.h> + +/* protected by RTNL */ +static LIST_HEAD(bpf_prog_offload_devs); + +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +{ +	struct net *net = current->nsproxy->net_ns; +	struct bpf_dev_offload *offload; + +	if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && +	    attr->prog_type != BPF_PROG_TYPE_XDP) +		return -EINVAL; + +	if (attr->prog_flags) +		return -EINVAL; + +	offload = kzalloc(sizeof(*offload), GFP_USER); +	if (!offload) +		return -ENOMEM; + +	offload->prog = prog; +	init_waitqueue_head(&offload->verifier_done); + +	rtnl_lock(); +	offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); +	if (!offload->netdev) { +		rtnl_unlock(); +		kfree(offload); +		return -EINVAL; +	} + +	prog->aux->offload = offload; +	list_add_tail(&offload->offloads, &bpf_prog_offload_devs); +	rtnl_unlock(); + +	return 0; +} + +static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, +			     struct netdev_bpf *data) +{ +	struct net_device *netdev = prog->aux->offload->netdev; + +	ASSERT_RTNL(); + +	if (!netdev) +		return -ENODEV; +	if (!netdev->netdev_ops->ndo_bpf) +		return -EOPNOTSUPP; + +	data->command = cmd; + +	return netdev->netdev_ops->ndo_bpf(netdev, data); +} + +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ +	struct netdev_bpf data = {}; +	int err; + +	data.verifier.prog = env->prog; + +	rtnl_lock(); +	err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); +	if (err) +		goto exit_unlock; + +	env->dev_ops = data.verifier.ops; + +	env->prog->aux->offload->dev_state = true; +	env->prog->aux->offload->verifier_running = true; +exit_unlock: +	rtnl_unlock(); +	return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ +	struct bpf_dev_offload *offload = prog->aux->offload; +	struct netdev_bpf data = {}; + +	/* Caution - if netdev is destroyed before the program, this function +	 * will be called twice. 
+	 */ + +	data.offload.prog = prog; + +	if (offload->verifier_running) +		wait_event(offload->verifier_done, !offload->verifier_running); + +	if (offload->dev_state) +		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + +	offload->dev_state = false; +	list_del_init(&offload->offloads); +	offload->netdev = NULL; +} + +void bpf_prog_offload_destroy(struct bpf_prog *prog) +{ +	struct bpf_dev_offload *offload = prog->aux->offload; + +	offload->verifier_running = false; +	wake_up(&offload->verifier_done); + +	rtnl_lock(); +	__bpf_prog_offload_destroy(prog); +	rtnl_unlock(); + +	kfree(offload); +} + +static int bpf_prog_offload_translate(struct bpf_prog *prog) +{ +	struct bpf_dev_offload *offload = prog->aux->offload; +	struct netdev_bpf data = {}; +	int ret; + +	data.offload.prog = prog; + +	offload->verifier_running = false; +	wake_up(&offload->verifier_done); + +	rtnl_lock(); +	ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); +	rtnl_unlock(); + +	return ret; +} + +static unsigned int bpf_prog_warn_on_exec(const void *ctx, +					  const struct bpf_insn *insn) +{ +	WARN(1, "attempt to execute device eBPF program on the host!"); +	return 0; +} + +int bpf_prog_offload_compile(struct bpf_prog *prog) +{ +	prog->bpf_func = bpf_prog_warn_on_exec; + +	return bpf_prog_offload_translate(prog); +} + +const struct bpf_prog_ops bpf_offload_prog_ops = { +}; + +static int bpf_offload_notification(struct notifier_block *notifier, +				    ulong event, void *ptr) +{ +	struct net_device *netdev = netdev_notifier_info_to_dev(ptr); +	struct bpf_dev_offload *offload, *tmp; + +	ASSERT_RTNL(); + +	switch (event) { +	case NETDEV_UNREGISTER: +		/* ignore namespace changes */ +		if (netdev->reg_state != NETREG_UNREGISTERING) +			break; + +		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, +					 offloads) { +			if (offload->netdev == netdev) +				__bpf_prog_offload_destroy(offload->prog); +		} +		break; +	default: +		break; +	} +	return NOTIFY_OK; +} + +static struct notifier_block bpf_offload_notifier = { +	.notifier_call = bpf_offload_notification, +}; + +static int __init bpf_offload_init(void) +{ +	register_netdevice_notifier(&bpf_offload_notifier); +	return 0; +} + +subsys_initcall(bpf_offload_init); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 5c51d1985b51..673fa6fe2d73 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)  {  	struct pcpu_freelist_head *head;  	struct pcpu_freelist_node *node; +	unsigned long flags;  	int orig_cpu, cpu; +	local_irq_save(flags);  	orig_cpu = cpu = raw_smp_processor_id();  	while (1) {  		head = per_cpu_ptr(s->freelist, cpu); @@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)  		node = head->first;  		if (node) {  			head->first = node->next; -			raw_spin_unlock(&head->lock); +			raw_spin_unlock_irqrestore(&head->lock, flags);  			return node;  		}  		raw_spin_unlock(&head->lock);  		cpu = cpumask_next(cpu, cpu_possible_mask);  		if (cpu >= nr_cpu_ids)  			cpu = 0; -		if (cpu == orig_cpu) +		if (cpu == orig_cpu) { +			local_irq_restore(flags);  			return NULL; +		}  	}  } diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..5ee2e41893d9 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -39,6 +39,10 @@  #include <linux/workqueue.h>  #include <linux/list.h>  #include <net/strparser.h> +#include <net/tcp.h> + +#define 
SOCK_CREATE_FLAG_MASK \ +	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)  struct bpf_stab {  	struct bpf_map map; @@ -92,21 +96,45 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)  	return rcu_dereference_sk_user_data(sk);  } +/* compute the linear packet data range [data, data_end) for skb when + * sk_skb type programs are in use. + */ +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ +	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} + +enum __sk_action { +	__SK_DROP = 0, +	__SK_PASS, +	__SK_REDIRECT, +}; +  static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)  {  	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);  	int rc;  	if (unlikely(!prog)) -		return SK_DROP; +		return __SK_DROP;  	skb_orphan(skb); +	/* We need to ensure that BPF metadata for maps is also cleared +	 * when we orphan the skb so that we don't have the possibility +	 * to reference a stale map. +	 */ +	TCP_SKB_CB(skb)->bpf.map = NULL;  	skb->sk = psock->sock; -	bpf_compute_data_end(skb); +	bpf_compute_data_pointers(skb); +	preempt_disable();  	rc = (*prog->bpf_func)(skb, prog->insnsi); +	preempt_enable();  	skb->sk = NULL; -	return rc; +	/* Moving return codes from UAPI namespace into internal namespace */ +	return rc == SK_PASS ? +		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) : +		__SK_DROP;  }  static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) @@ -114,17 +142,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)  	struct sock *sk;  	int rc; -	/* Because we use per cpu values to feed input from sock redirect -	 * in BPF program to do_sk_redirect_map() call we need to ensure we -	 * are not preempted. RCU read lock is not sufficient in this case -	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here. -	 */ -	preempt_disable();  	rc = smap_verdict_func(psock, skb);  	switch (rc) { -	case SK_REDIRECT: -		sk = do_sk_redirect_map(); -		preempt_enable(); +	case __SK_REDIRECT: +		sk = do_sk_redirect_map(skb);  		if (likely(sk)) {  			struct smap_psock *peer = smap_psock_sk(sk); @@ -139,10 +160,8 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)  			}  		}  	/* Fall through and free skb otherwise */ -	case SK_DROP: +	case __SK_DROP:  	default: -		if (rc != SK_REDIRECT) -			preempt_enable();  		kfree_skb(skb);  	}  } @@ -369,7 +388,7 @@ static int smap_parse_func_strparser(struct strparser *strp,  	 * any socket yet.  	 
*/  	skb->sk = psock->sock; -	bpf_compute_data_end(skb); +	bpf_compute_data_pointers(skb);  	rc = (*prog->bpf_func)(skb, prog->insnsi);  	skb->sk = NULL;  	rcu_read_unlock(); @@ -487,9 +506,12 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)  	int err = -EINVAL;  	u64 cost; +	if (!capable(CAP_NET_ADMIN)) +		return ERR_PTR(-EPERM); +  	/* check sanity of attributes */  	if (attr->max_entries == 0 || attr->key_size != 4 || -	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) +	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)  		return ERR_PTR(-EINVAL);  	if (attr->value_size > KMALLOC_MAX_SIZE) @@ -840,6 +862,12 @@ static int sock_map_update_elem(struct bpf_map *map,  		return -EINVAL;  	} +	if (skops.sk->sk_type != SOCK_STREAM || +	    skops.sk->sk_protocol != IPPROTO_TCP) { +		fput(socket->file); +		return -EOPNOTSUPP; +	} +  	err = sock_map_ctx_update_elem(&skops, map, key, flags);  	fput(socket->file);  	return err; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 135be433e9a0..a15bc636cc98 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,9 @@  #include <linux/perf_event.h>  #include "percpu_freelist.h" +#define STACK_CREATE_FLAG_MASK \ +	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +  struct stack_map_bucket {  	struct pcpu_freelist_node fnode;  	u32 hash; @@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)  	if (!capable(CAP_SYS_ADMIN))  		return ERR_PTR(-EPERM); -	if (attr->map_flags & ~BPF_F_NUMA_NODE) +	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)  		return ERR_PTR(-EINVAL);  	/* check sanity of attributes */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 25d074920a00..2c4cfeaa8d5e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,9 @@  #include <linux/version.h>  #include <linux/kernel.h>  #include <linux/idr.h> +#include <linux/cred.h> +#include <linux/timekeeping.h> +#include <linux/ctype.h>  #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \  			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -31,6 +34,8 @@  #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)  #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY) +  DEFINE_PER_CPU(int, bpf_prog_active);  static DEFINE_IDR(prog_idr);  static DEFINE_SPINLOCK(prog_idr_lock); @@ -207,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work)  	struct bpf_map *map = container_of(work, struct bpf_map, work);  	bpf_map_uncharge_memlock(map); +	security_bpf_map_free(map);  	/* implementation dependent freeing */  	map->ops->map_free(map);  } @@ -291,17 +297,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)  }  #endif -static const struct file_operations bpf_map_fops = { +static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, +			      loff_t *ppos) +{ +	/* We need this handler such that alloc_file() enables +	 * f_mode with FMODE_CAN_READ. +	 */ +	return -EINVAL; +} + +static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, +			       size_t siz, loff_t *ppos) +{ +	/* We need this handler such that alloc_file() enables +	 * f_mode with FMODE_CAN_WRITE. 
+	 */ +	return -EINVAL; +} + +const struct file_operations bpf_map_fops = {  #ifdef CONFIG_PROC_FS  	.show_fdinfo	= bpf_map_show_fdinfo,  #endif  	.release	= bpf_map_release, +	.read		= bpf_dummy_read, +	.write		= bpf_dummy_write,  }; -int bpf_map_new_fd(struct bpf_map *map) +int bpf_map_new_fd(struct bpf_map *map, int flags)  { +	int ret; + +	ret = security_bpf_map(map, OPEN_FMODE(flags)); +	if (ret < 0) +		return ret; +  	return anon_inode_getfd("bpf-map", &bpf_map_fops, map, -				O_RDWR | O_CLOEXEC); +				flags | O_CLOEXEC); +} + +int bpf_get_file_flag(int flags) +{ +	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) +		return -EINVAL; +	if (flags & BPF_F_RDONLY) +		return O_RDONLY; +	if (flags & BPF_F_WRONLY) +		return O_WRONLY; +	return O_RDWR;  }  /* helper macro to check that unused fields 'union bpf_attr' are zero */ @@ -312,18 +355,46 @@ int bpf_map_new_fd(struct bpf_map *map)  		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \  		   sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD numa_node +/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. + * Return 0 on success and < 0 on error. + */ +static int bpf_obj_name_cpy(char *dst, const char *src) +{ +	const char *end = src + BPF_OBJ_NAME_LEN; + +	memset(dst, 0, BPF_OBJ_NAME_LEN); + +	/* Copy all isalnum() and '_' char */ +	while (src < end && *src) { +		if (!isalnum(*src) && *src != '_') +			return -EINVAL; +		*dst++ = *src++; +	} + +	/* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ +	if (src == end) +		return -EINVAL; + +	return 0; +} + +#define BPF_MAP_CREATE_LAST_FIELD map_name  /* called via syscall */  static int map_create(union bpf_attr *attr)  {  	int numa_node = bpf_map_attr_numa_node(attr);  	struct bpf_map *map; +	int f_flags;  	int err;  	err = CHECK_ATTR(BPF_MAP_CREATE);  	if (err)  		return -EINVAL; +	f_flags = bpf_get_file_flag(attr->map_flags); +	if (f_flags < 0) +		return f_flags; +  	if (numa_node != NUMA_NO_NODE &&  	    ((unsigned int)numa_node >= nr_node_ids ||  	     !node_online(numa_node))) @@ -334,18 +405,26 @@ static int map_create(union bpf_attr *attr)  	if (IS_ERR(map))  		return PTR_ERR(map); +	err = bpf_obj_name_cpy(map->name, attr->map_name); +	if (err) +		goto free_map_nouncharge; +  	atomic_set(&map->refcnt, 1);  	atomic_set(&map->usercnt, 1); -	err = bpf_map_charge_memlock(map); +	err = security_bpf_map_alloc(map);  	if (err)  		goto free_map_nouncharge; +	err = bpf_map_charge_memlock(map); +	if (err) +		goto free_map_sec; +  	err = bpf_map_alloc_id(map);  	if (err)  		goto free_map; -	err = bpf_map_new_fd(map); +	err = bpf_map_new_fd(map, f_flags);  	if (err < 0) {  		/* failed to allocate fd.  		 
* bpf_map_put() is needed because the above @@ -362,6 +441,8 @@ static int map_create(union bpf_attr *attr)  free_map:  	bpf_map_uncharge_memlock(map); +free_map_sec: +	security_bpf_map_free(map);  free_map_nouncharge:  	map->ops->map_free(map);  	return err; @@ -460,6 +541,11 @@ static int map_lookup_elem(union bpf_attr *attr)  	if (IS_ERR(map))  		return PTR_ERR(map); +	if (!(f.file->f_mode & FMODE_CAN_READ)) { +		err = -EPERM; +		goto err_put; +	} +  	key = memdup_user(ukey, map->key_size);  	if (IS_ERR(key)) {  		err = PTR_ERR(key); @@ -540,6 +626,11 @@ static int map_update_elem(union bpf_attr *attr)  	if (IS_ERR(map))  		return PTR_ERR(map); +	if (!(f.file->f_mode & FMODE_CAN_WRITE)) { +		err = -EPERM; +		goto err_put; +	} +  	key = memdup_user(ukey, map->key_size);  	if (IS_ERR(key)) {  		err = PTR_ERR(key); @@ -562,6 +653,12 @@ static int map_update_elem(union bpf_attr *attr)  	if (copy_from_user(value, uvalue, value_size) != 0)  		goto free_value; +	/* Need to create a kthread, thus must support schedule */ +	if (map->map_type == BPF_MAP_TYPE_CPUMAP) { +		err = map->ops->map_update_elem(map, key, value, attr->flags); +		goto out; +	} +  	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from  	 * inside bpf map update or delete otherwise deadlocks are possible  	 */ @@ -592,7 +689,7 @@ static int map_update_elem(union bpf_attr *attr)  	}  	__this_cpu_dec(bpf_prog_active);  	preempt_enable(); - +out:  	if (!err)  		trace_bpf_map_update_elem(map, ufd, key, value);  free_value: @@ -623,6 +720,11 @@ static int map_delete_elem(union bpf_attr *attr)  	if (IS_ERR(map))  		return PTR_ERR(map); +	if (!(f.file->f_mode & FMODE_CAN_WRITE)) { +		err = -EPERM; +		goto err_put; +	} +  	key = memdup_user(ukey, map->key_size);  	if (IS_ERR(key)) {  		err = PTR_ERR(key); @@ -666,6 +768,11 @@ static int map_get_next_key(union bpf_attr *attr)  	if (IS_ERR(map))  		return PTR_ERR(map); +	if (!(f.file->f_mode & FMODE_CAN_READ)) { +		err = -EPERM; +		goto err_put; +	} +  	if (ukey) {  		key = memdup_user(ukey, map->key_size);  		if (IS_ERR(key)) { @@ -703,9 +810,9 @@ err_put:  	return err;  } -static const struct bpf_verifier_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _ops) \ -	[_id] = &_ops, +static const struct bpf_prog_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _name) \ +	[_id] = & _name ## _prog_ops,  #define BPF_MAP_TYPE(_id, _ops)  #include <linux/bpf_types.h>  #undef BPF_PROG_TYPE @@ -717,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)  	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])  		return -EINVAL; -	prog->aux->ops = bpf_prog_types[type]; +	if (!bpf_prog_is_dev_bound(prog->aux)) +		prog->aux->ops = bpf_prog_types[type]; +	else +		prog->aux->ops = &bpf_offload_prog_ops;  	prog->type = type;  	return 0;  } @@ -820,6 +930,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)  	free_used_maps(aux);  	bpf_prog_uncharge_memlock(aux->prog); +	security_bpf_prog_free(aux);  	bpf_prog_free(aux->prog);  } @@ -867,15 +978,23 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)  }  #endif -static const struct file_operations bpf_prog_fops = { +const struct file_operations bpf_prog_fops = {  #ifdef CONFIG_PROC_FS  	.show_fdinfo	= bpf_prog_show_fdinfo,  #endif  	.release	= bpf_prog_release, +	.read		= bpf_dummy_read, +	.write		= bpf_dummy_write,  };  int bpf_prog_new_fd(struct bpf_prog *prog)  { +	int ret; + +	ret = security_bpf_prog(prog); +	if (ret < 0) +		return ret; +  	
return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,  				O_RDWR | O_CLOEXEC);  } @@ -938,7 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)  }  EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) +static bool bpf_prog_get_ok(struct bpf_prog *prog, +			    enum bpf_prog_type *attach_type, bool attach_drv) +{ +	/* not an attachment, just a refcount inc, always allow */ +	if (!attach_type) +		return true; + +	if (prog->type != *attach_type) +		return false; +	if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) +		return false; + +	return true; +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, +				       bool attach_drv)  {  	struct fd f = fdget(ufd);  	struct bpf_prog *prog; @@ -946,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)  	prog = ____bpf_prog_get(f);  	if (IS_ERR(prog))  		return prog; -	if (type && prog->type != *type) { +	if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {  		prog = ERR_PTR(-EINVAL);  		goto out;  	} @@ -959,21 +1094,22 @@ out:  struct bpf_prog *bpf_prog_get(u32 ufd)  { -	return __bpf_prog_get(ufd, NULL); +	return __bpf_prog_get(ufd, NULL, false);  } -struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, +				       bool attach_drv)  { -	struct bpf_prog *prog = __bpf_prog_get(ufd, &type); +	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);  	if (!IS_ERR(prog))  		trace_bpf_prog_get_type(prog);  	return prog;  } -EXPORT_SYMBOL_GPL(bpf_prog_get_type); +EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);  /* last field in 'union bpf_attr' used by this command */ -#define	BPF_PROG_LOAD_LAST_FIELD prog_flags +#define	BPF_PROG_LOAD_LAST_FIELD prog_ifindex  static int bpf_prog_load(union bpf_attr *attr)  { @@ -1015,10 +1151,14 @@ static int bpf_prog_load(union bpf_attr *attr)  	if (!prog)  		return -ENOMEM; -	err = bpf_prog_charge_memlock(prog); +	err = security_bpf_prog_alloc(prog->aux);  	if (err)  		goto free_prog_nouncharge; +	err = bpf_prog_charge_memlock(prog); +	if (err) +		goto free_prog_sec; +  	prog->len = attr->insn_cnt;  	err = -EFAULT; @@ -1032,11 +1172,22 @@ static int bpf_prog_load(union bpf_attr *attr)  	atomic_set(&prog->aux->refcnt, 1);  	prog->gpl_compatible = is_gpl ? 
1 : 0; +	if (attr->prog_ifindex) { +		err = bpf_prog_offload_init(prog, attr); +		if (err) +			goto free_prog; +	} +  	/* find program type: socket_filter vs tracing_filter */  	err = find_prog_type(type, prog);  	if (err < 0)  		goto free_prog; +	prog->aux->load_time = ktime_get_boot_ns(); +	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); +	if (err) +		goto free_prog; +  	/* run eBPF verifier */  	err = bpf_check(&prog, attr);  	if (err < 0) @@ -1071,16 +1222,18 @@ free_used_maps:  	free_used_maps(prog->aux);  free_prog:  	bpf_prog_uncharge_memlock(prog); +free_prog_sec: +	security_bpf_prog_free(prog->aux);  free_prog_nouncharge:  	bpf_prog_free(prog);  	return err;  } -#define BPF_OBJ_LAST_FIELD bpf_fd +#define BPF_OBJ_LAST_FIELD file_flags  static int bpf_obj_pin(const union bpf_attr *attr)  { -	if (CHECK_ATTR(BPF_OBJ)) +	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)  		return -EINVAL;  	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); @@ -1088,10 +1241,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)  static int bpf_obj_get(const union bpf_attr *attr)  { -	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) +	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || +	    attr->file_flags & ~BPF_OBJ_FLAG_MASK)  		return -EINVAL; -	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); +	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), +				attr->file_flags);  }  #ifdef CONFIG_CGROUP_BPF @@ -1132,6 +1287,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)  	return 0;  } +#define BPF_F_ATTACH_MASK \ +	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) +  static int bpf_prog_attach(const union bpf_attr *attr)  {  	enum bpf_prog_type ptype; @@ -1145,7 +1303,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)  	if (CHECK_ATTR(BPF_PROG_ATTACH))  		return -EINVAL; -	if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) +	if (attr->attach_flags & ~BPF_F_ATTACH_MASK)  		return -EINVAL;  	switch (attr->attach_type) { @@ -1159,6 +1317,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)  	case BPF_CGROUP_SOCK_OPS:  		ptype = BPF_PROG_TYPE_SOCK_OPS;  		break; +	case BPF_CGROUP_DEVICE: +		ptype = BPF_PROG_TYPE_CGROUP_DEVICE; +		break;  	case BPF_SK_SKB_STREAM_PARSER:  	case BPF_SK_SKB_STREAM_VERDICT:  		return sockmap_get_from_fd(attr, true); @@ -1176,8 +1337,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)  		return PTR_ERR(cgrp);  	} -	ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, -				attr->attach_flags & BPF_F_ALLOW_OVERRIDE); +	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, +				attr->attach_flags);  	if (ret)  		bpf_prog_put(prog);  	cgroup_put(cgrp); @@ -1189,6 +1350,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)  static int bpf_prog_detach(const union bpf_attr *attr)  { +	enum bpf_prog_type ptype; +	struct bpf_prog *prog;  	struct cgroup *cgrp;  	int ret; @@ -1201,26 +1364,71 @@ static int bpf_prog_detach(const union bpf_attr *attr)  	switch (attr->attach_type) {  	case BPF_CGROUP_INET_INGRESS:  	case BPF_CGROUP_INET_EGRESS: +		ptype = BPF_PROG_TYPE_CGROUP_SKB; +		break;  	case BPF_CGROUP_INET_SOCK_CREATE: +		ptype = BPF_PROG_TYPE_CGROUP_SOCK; +		break;  	case BPF_CGROUP_SOCK_OPS: -		cgrp = cgroup_get_from_fd(attr->target_fd); -		if (IS_ERR(cgrp)) -			return PTR_ERR(cgrp); - -		ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); -		cgroup_put(cgrp); +		ptype = BPF_PROG_TYPE_SOCK_OPS; +		break; +	case BPF_CGROUP_DEVICE: +		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;  		break;  	case 
BPF_SK_SKB_STREAM_PARSER:  	case BPF_SK_SKB_STREAM_VERDICT: -		ret = sockmap_get_from_fd(attr, false); -		break; +		return sockmap_get_from_fd(attr, false);  	default:  		return -EINVAL;  	} +	cgrp = cgroup_get_from_fd(attr->target_fd); +	if (IS_ERR(cgrp)) +		return PTR_ERR(cgrp); + +	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); +	if (IS_ERR(prog)) +		prog = NULL; + +	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); +	if (prog) +		bpf_prog_put(prog); +	cgroup_put(cgrp);  	return ret;  } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int bpf_prog_query(const union bpf_attr *attr, +			  union bpf_attr __user *uattr) +{ +	struct cgroup *cgrp; +	int ret; + +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; +	if (CHECK_ATTR(BPF_PROG_QUERY)) +		return -EINVAL; +	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) +		return -EINVAL; + +	switch (attr->query.attach_type) { +	case BPF_CGROUP_INET_INGRESS: +	case BPF_CGROUP_INET_EGRESS: +	case BPF_CGROUP_INET_SOCK_CREATE: +	case BPF_CGROUP_SOCK_OPS: +	case BPF_CGROUP_DEVICE: +		break; +	default: +		return -EINVAL; +	} +	cgrp = cgroup_get_from_fd(attr->query.target_fd); +	if (IS_ERR(cgrp)) +		return PTR_ERR(cgrp); +	ret = cgroup_bpf_query(cgrp, attr, uattr); +	cgroup_put(cgrp); +	return ret; +}  #endif /* CONFIG_CGROUP_BPF */  #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1305,20 +1513,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)  	return fd;  } -#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags  static int bpf_map_get_fd_by_id(const union bpf_attr *attr)  {  	struct bpf_map *map;  	u32 id = attr->map_id; +	int f_flags;  	int fd; -	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) +	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || +	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)  		return -EINVAL;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; +	f_flags = bpf_get_file_flag(attr->open_flags); +	if (f_flags < 0) +		return f_flags; +  	spin_lock_bh(&map_idr_lock);  	map = idr_find(&map_idr, id);  	if (map) @@ -1330,7 +1544,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)  	if (IS_ERR(map))  		return PTR_ERR(map); -	fd = bpf_map_new_fd(map); +	fd = bpf_map_new_fd(map, f_flags);  	if (fd < 0)  		bpf_map_put(map); @@ -1358,8 +1572,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,  	info.type = prog->type;  	info.id = prog->aux->id; +	info.load_time = prog->aux->load_time; +	info.created_by_uid = from_kuid_munged(current_user_ns(), +					       prog->aux->user->uid);  	memcpy(info.tag, prog->tag, sizeof(prog->tag)); +	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + +	ulen = info.nr_map_ids; +	info.nr_map_ids = prog->aux->used_map_cnt; +	ulen = min_t(u32, info.nr_map_ids, ulen); +	if (ulen) { +		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); +		u32 i; + +		for (i = 0; i < ulen; i++) +			if (put_user(prog->aux->used_maps[i]->id, +				     &user_map_ids[i])) +				return -EFAULT; +	}  	if (!capable(CAP_SYS_ADMIN)) {  		info.jited_prog_len = 0; @@ -1413,6 +1644,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,  	info.value_size = map->value_size;  	info.max_entries = map->max_entries;  	info.map_flags = map->map_flags; +	memcpy(info.name, map->name, sizeof(map->name));  	if (copy_to_user(uinfo, &info, info_len) ||  	    put_user(info_len, &uattr->info.info_len)) @@ -1467,6 +1699,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz  	if (copy_from_user(&attr, uattr, size) != 0)  
		return -EFAULT; +	err = security_bpf(cmd, &attr, size); +	if (err < 0) +		return err; +  	switch (cmd) {  	case BPF_MAP_CREATE:  		err = map_create(&attr); @@ -1499,6 +1735,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz  	case BPF_PROG_DETACH:  		err = bpf_prog_detach(&attr);  		break; +	case BPF_PROG_QUERY: +		err = bpf_prog_query(&attr, uattr); +		break;  #endif  	case BPF_PROG_TEST_RUN:  		err = bpf_prog_test_run(&attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..d4593571c404 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,17 @@  #include <linux/vmalloc.h>  #include <linux/stringify.h> +#include "disasm.h" + +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ +	[_id] = & _name ## _verifier_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; +  /* bpf_check() is a static code analyzer that walks eBPF program   * instruction by instruction and updates register/stack state.   * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -153,28 +164,42 @@ struct bpf_call_arg_meta {  	int access_size;  }; -/* verbose verifier prints what it's seeing - * bpf_check() is called under lock, so no race to access these global vars - */ -static u32 log_level, log_size, log_len; -static char *log_buf; -  static DEFINE_MUTEX(bpf_verifier_lock);  /* log_level controls verbosity level of eBPF verifier.   * verbose() is used to dump the verification trace to the log, so the user   * can figure out what's wrong with the program   */ -static __printf(1, 2) void verbose(const char *fmt, ...) +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, +				   const char *fmt, ...)  
{ +	struct bpf_verifer_log *log = &env->log; +	unsigned int n;  	va_list args; -	if (log_level == 0 || log_len >= log_size - 1) +	if (!log->level || !log->ubuf || bpf_verifier_log_full(log))  		return;  	va_start(args, fmt); -	log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); +	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);  	va_end(args); + +	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, +		  "verifier log line truncated - local buffer too short\n"); + +	n = min(log->len_total - log->len_used - 1, n); +	log->kbuf[n] = '\0'; + +	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) +		log->len_used += n; +	else +		log->ubuf = NULL; +} + +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ +	return type == PTR_TO_PACKET || +	       type == PTR_TO_PACKET_META;  }  /* string representation of 'enum bpf_reg_type' */ @@ -187,26 +212,12 @@ static const char * const reg_type_str[] = {  	[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",  	[PTR_TO_STACK]		= "fp",  	[PTR_TO_PACKET]		= "pkt", +	[PTR_TO_PACKET_META]	= "pkt_meta",  	[PTR_TO_PACKET_END]	= "pkt_end",  }; -#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) -static const char * const func_id_str[] = { -	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) -}; -#undef __BPF_FUNC_STR_FN - -static const char *func_id_name(int id) -{ -	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - -	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) -		return func_id_str[id]; -	else -		return "unknown"; -} - -static void print_verifier_state(struct bpf_verifier_state *state) +static void print_verifier_state(struct bpf_verifier_env *env, +				 struct bpf_verifier_state *state)  {  	struct bpf_reg_state *reg;  	enum bpf_reg_type t; @@ -217,21 +228,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)  		t = reg->type;  		if (t == NOT_INIT)  			continue; -		verbose(" R%d=%s", i, reg_type_str[t]); +		verbose(env, " R%d=%s", i, reg_type_str[t]);  		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&  		    tnum_is_const(reg->var_off)) {  			/* reg->off should be 0 for SCALAR_VALUE */ -			verbose("%lld", reg->var_off.value + reg->off); +			verbose(env, "%lld", reg->var_off.value + reg->off);  		} else { -			verbose("(id=%d", reg->id); +			verbose(env, "(id=%d", reg->id);  			if (t != SCALAR_VALUE) -				verbose(",off=%d", reg->off); -			if (t == PTR_TO_PACKET) -				verbose(",r=%d", reg->range); +				verbose(env, ",off=%d", reg->off); +			if (type_is_pkt_pointer(t)) +				verbose(env, ",r=%d", reg->range);  			else if (t == CONST_PTR_TO_MAP ||  				 t == PTR_TO_MAP_VALUE ||  				 t == PTR_TO_MAP_VALUE_OR_NULL) -				verbose(",ks=%d,vs=%d", +				verbose(env, ",ks=%d,vs=%d",  					reg->map_ptr->key_size,  					reg->map_ptr->value_size);  			if (tnum_is_const(reg->var_off)) { @@ -239,243 +250,174 @@ static void print_verifier_state(struct bpf_verifier_state *state)  				 * could be a pointer whose offset is too big  				 * for reg->off  				 */ -				verbose(",imm=%llx", reg->var_off.value); +				verbose(env, ",imm=%llx", reg->var_off.value);  			} else {  				if (reg->smin_value != reg->umin_value &&  				    reg->smin_value != S64_MIN) -					verbose(",smin_value=%lld", +					verbose(env, ",smin_value=%lld",  						(long long)reg->smin_value);  				if (reg->smax_value != reg->umax_value &&  				    reg->smax_value != S64_MAX) -					verbose(",smax_value=%lld", +					verbose(env, ",smax_value=%lld",  						(long long)reg->smax_value);  				if (reg->umin_value != 0) -					
verbose(",umin_value=%llu", +					verbose(env, ",umin_value=%llu",  						(unsigned long long)reg->umin_value);  				if (reg->umax_value != U64_MAX) -					verbose(",umax_value=%llu", +					verbose(env, ",umax_value=%llu",  						(unsigned long long)reg->umax_value);  				if (!tnum_is_unknown(reg->var_off)) {  					char tn_buf[48];  					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); -					verbose(",var_off=%s", tn_buf); +					verbose(env, ",var_off=%s", tn_buf);  				}  			} -			verbose(")"); +			verbose(env, ")");  		}  	} -	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { -		if (state->stack_slot_type[i] == STACK_SPILL) -			verbose(" fp%d=%s", -MAX_BPF_STACK + i, -				reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); +	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { +		if (state->stack[i].slot_type[0] == STACK_SPILL) +			verbose(env, " fp%d=%s", +				-MAX_BPF_STACK + i * BPF_REG_SIZE, +				reg_type_str[state->stack[i].spilled_ptr.type]);  	} -	verbose("\n"); +	verbose(env, "\n");  } -static const char *const bpf_class_string[] = { -	[BPF_LD]    = "ld", -	[BPF_LDX]   = "ldx", -	[BPF_ST]    = "st", -	[BPF_STX]   = "stx", -	[BPF_ALU]   = "alu", -	[BPF_JMP]   = "jmp", -	[BPF_RET]   = "BUG", -	[BPF_ALU64] = "alu64", -}; - -static const char *const bpf_alu_string[16] = { -	[BPF_ADD >> 4]  = "+=", -	[BPF_SUB >> 4]  = "-=", -	[BPF_MUL >> 4]  = "*=", -	[BPF_DIV >> 4]  = "/=", -	[BPF_OR  >> 4]  = "|=", -	[BPF_AND >> 4]  = "&=", -	[BPF_LSH >> 4]  = "<<=", -	[BPF_RSH >> 4]  = ">>=", -	[BPF_NEG >> 4]  = "neg", -	[BPF_MOD >> 4]  = "%=", -	[BPF_XOR >> 4]  = "^=", -	[BPF_MOV >> 4]  = "=", -	[BPF_ARSH >> 4] = "s>>=", -	[BPF_END >> 4]  = "endian", -}; - -static const char *const bpf_ldst_string[] = { -	[BPF_W >> 3]  = "u32", -	[BPF_H >> 3]  = "u16", -	[BPF_B >> 3]  = "u8", -	[BPF_DW >> 3] = "u64", -}; - -static const char *const bpf_jmp_string[16] = { -	[BPF_JA >> 4]   = "jmp", -	[BPF_JEQ >> 4]  = "==", -	[BPF_JGT >> 4]  = ">", -	[BPF_JLT >> 4]  = "<", -	[BPF_JGE >> 4]  = ">=", -	[BPF_JLE >> 4]  = "<=", -	[BPF_JSET >> 4] = "&", -	[BPF_JNE >> 4]  = "!=", -	[BPF_JSGT >> 4] = "s>", -	[BPF_JSLT >> 4] = "s<", -	[BPF_JSGE >> 4] = "s>=", -	[BPF_JSLE >> 4] = "s<=", -	[BPF_CALL >> 4] = "call", -	[BPF_EXIT >> 4] = "exit", -}; +static int copy_stack_state(struct bpf_verifier_state *dst, +			    const struct bpf_verifier_state *src) +{ +	if (!src->stack) +		return 0; +	if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { +		/* internal bug, make state invalid to reject the program */ +		memset(dst, 0, sizeof(*dst)); +		return -EFAULT; +	} +	memcpy(dst->stack, src->stack, +	       sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); +	return 0; +} -static void print_bpf_insn(const struct bpf_verifier_env *env, -			   const struct bpf_insn *insn) +/* do_check() starts with zero-sized stack in struct bpf_verifier_state to + * make it consume minimal amount of memory. check_stack_write() access from + * the program calls into realloc_verifier_state() to grow the stack size. + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which this function copies over. It points to previous bpf_verifier_state + * which is never reallocated + */ +static int realloc_verifier_state(struct bpf_verifier_state *state, int size, +				  bool copy_old)  { -	u8 class = BPF_CLASS(insn->code); - -	if (class == BPF_ALU || class == BPF_ALU64) { -		if (BPF_SRC(insn->code) == BPF_X) -			verbose("(%02x) %sr%d %s %sr%d\n", -				insn->code, class == BPF_ALU ? 
"(u32) " : "", -				insn->dst_reg, -				bpf_alu_string[BPF_OP(insn->code) >> 4], -				class == BPF_ALU ? "(u32) " : "", -				insn->src_reg); -		else -			verbose("(%02x) %sr%d %s %s%d\n", -				insn->code, class == BPF_ALU ? "(u32) " : "", -				insn->dst_reg, -				bpf_alu_string[BPF_OP(insn->code) >> 4], -				class == BPF_ALU ? "(u32) " : "", -				insn->imm); -	} else if (class == BPF_STX) { -		if (BPF_MODE(insn->code) == BPF_MEM) -			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", -				insn->code, -				bpf_ldst_string[BPF_SIZE(insn->code) >> 3], -				insn->dst_reg, -				insn->off, insn->src_reg); -		else if (BPF_MODE(insn->code) == BPF_XADD) -			verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", -				insn->code, -				bpf_ldst_string[BPF_SIZE(insn->code) >> 3], -				insn->dst_reg, insn->off, -				insn->src_reg); -		else -			verbose("BUG_%02x\n", insn->code); -	} else if (class == BPF_ST) { -		if (BPF_MODE(insn->code) != BPF_MEM) { -			verbose("BUG_st_%02x\n", insn->code); -			return; -		} -		verbose("(%02x) *(%s *)(r%d %+d) = %d\n", -			insn->code, -			bpf_ldst_string[BPF_SIZE(insn->code) >> 3], -			insn->dst_reg, -			insn->off, insn->imm); -	} else if (class == BPF_LDX) { -		if (BPF_MODE(insn->code) != BPF_MEM) { -			verbose("BUG_ldx_%02x\n", insn->code); -			return; +	u32 old_size = state->allocated_stack; +	struct bpf_stack_state *new_stack; +	int slot = size / BPF_REG_SIZE; + +	if (size <= old_size || !size) { +		if (copy_old) +			return 0; +		state->allocated_stack = slot * BPF_REG_SIZE; +		if (!size && old_size) { +			kfree(state->stack); +			state->stack = NULL;  		} -		verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", -			insn->code, insn->dst_reg, -			bpf_ldst_string[BPF_SIZE(insn->code) >> 3], -			insn->src_reg, insn->off); -	} else if (class == BPF_LD) { -		if (BPF_MODE(insn->code) == BPF_ABS) { -			verbose("(%02x) r0 = *(%s *)skb[%d]\n", -				insn->code, -				bpf_ldst_string[BPF_SIZE(insn->code) >> 3], -				insn->imm); -		} else if (BPF_MODE(insn->code) == BPF_IND) { -			verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", -				insn->code, -				bpf_ldst_string[BPF_SIZE(insn->code) >> 3], -				insn->src_reg, insn->imm); -		} else if (BPF_MODE(insn->code) == BPF_IMM && -			   BPF_SIZE(insn->code) == BPF_DW) { -			/* At this point, we already made sure that the second -			 * part of the ldimm64 insn is accessible. 
-			 */ -			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; -			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; +		return 0; +	} +	new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), +				  GFP_KERNEL); +	if (!new_stack) +		return -ENOMEM; +	if (copy_old) { +		if (state->stack) +			memcpy(new_stack, state->stack, +			       sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); +		memset(new_stack + old_size / BPF_REG_SIZE, 0, +		       sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); +	} +	state->allocated_stack = slot * BPF_REG_SIZE; +	kfree(state->stack); +	state->stack = new_stack; +	return 0; +} -			if (map_ptr && !env->allow_ptr_leaks) -				imm = 0; +static void free_verifier_state(struct bpf_verifier_state *state, +				bool free_self) +{ +	kfree(state->stack); +	if (free_self) +		kfree(state); +} -			verbose("(%02x) r%d = 0x%llx\n", insn->code, -				insn->dst_reg, (unsigned long long)imm); -		} else { -			verbose("BUG_ld_%02x\n", insn->code); -			return; -		} -	} else if (class == BPF_JMP) { -		u8 opcode = BPF_OP(insn->code); +/* copy verifier state from src to dst growing dst stack space + * when necessary to accommodate larger src stack + */ +static int copy_verifier_state(struct bpf_verifier_state *dst, +			       const struct bpf_verifier_state *src) +{ +	int err; -		if (opcode == BPF_CALL) { -			verbose("(%02x) call %s#%d\n", insn->code, -				func_id_name(insn->imm), insn->imm); -		} else if (insn->code == (BPF_JMP | BPF_JA)) { -			verbose("(%02x) goto pc%+d\n", -				insn->code, insn->off); -		} else if (insn->code == (BPF_JMP | BPF_EXIT)) { -			verbose("(%02x) exit\n", insn->code); -		} else if (BPF_SRC(insn->code) == BPF_X) { -			verbose("(%02x) if r%d %s r%d goto pc%+d\n", -				insn->code, insn->dst_reg, -				bpf_jmp_string[BPF_OP(insn->code) >> 4], -				insn->src_reg, insn->off); -		} else { -			verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", -				insn->code, insn->dst_reg, -				bpf_jmp_string[BPF_OP(insn->code) >> 4], -				insn->imm, insn->off); -		} -	} else { -		verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); -	} +	err = realloc_verifier_state(dst, src->allocated_stack, false); +	if (err) +		return err; +	memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); +	return copy_stack_state(dst, src);  } -static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, +		     int *insn_idx)  { -	struct bpf_verifier_stack_elem *elem; -	int insn_idx; +	struct bpf_verifier_state *cur = env->cur_state; +	struct bpf_verifier_stack_elem *elem, *head = env->head; +	int err;  	if (env->head == NULL) -		return -1; +		return -ENOENT; -	memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); -	insn_idx = env->head->insn_idx; +	if (cur) { +		err = copy_verifier_state(cur, &head->st); +		if (err) +			return err; +	} +	if (insn_idx) +		*insn_idx = head->insn_idx;  	if (prev_insn_idx) -		*prev_insn_idx = env->head->prev_insn_idx; -	elem = env->head->next; -	kfree(env->head); +		*prev_insn_idx = head->prev_insn_idx; +	elem = head->next; +	free_verifier_state(&head->st, false); +	kfree(head);  	env->head = elem;  	env->stack_size--; -	return insn_idx; +	return 0;  }  static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,  					     int insn_idx, int prev_insn_idx)  { +	struct bpf_verifier_state *cur = env->cur_state;  	struct bpf_verifier_stack_elem *elem; +	int err; -	elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); 
+	elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);  	if (!elem)  		goto err; -	memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));  	elem->insn_idx = insn_idx;  	elem->prev_insn_idx = prev_insn_idx;  	elem->next = env->head;  	env->head = elem;  	env->stack_size++; +	err = copy_verifier_state(&elem->st, cur); +	if (err) +		goto err;  	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { -		verbose("BPF program is too complex\n"); +		verbose(env, "BPF program is too complex\n");  		goto err;  	}  	return &elem->st;  err:  	/* pop all elements and return */ -	while (pop_stack(env, NULL) >= 0); +	while (!pop_stack(env, NULL, NULL));  	return NULL;  } @@ -507,10 +449,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)  	__mark_reg_known(reg, 0);  } -static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_verifier_env *env, +				struct bpf_reg_state *regs, u32 regno)  {  	if (WARN_ON(regno >= MAX_BPF_REG)) { -		verbose("mark_reg_known_zero(regs, %u)\n", regno); +		verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);  		/* Something bad happened, let's kill all regs */  		for (regno = 0; regno < MAX_BPF_REG; regno++)  			__mark_reg_not_init(regs + regno); @@ -519,6 +462,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)  	__mark_reg_known_zero(regs + regno);  } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ +	return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ +	return reg_is_pkt_pointer(reg) || +	       reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, +				    enum bpf_reg_type which) +{ +	/* The register can already have a range from prior markings. +	 * This is fine as long as it hasn't been advanced from its +	 * origin. 
+	 */ +	return reg->type == which && +	       reg->id == 0 && +	       reg->off == 0 && +	       tnum_equals_const(reg->var_off, 0); +} +  /* Attempts to improve min/max values based on var_off information */  static void __update_reg_bounds(struct bpf_reg_state *reg)  { @@ -595,10 +563,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)  	__mark_reg_unbounded(reg);  } -static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_verifier_env *env, +			     struct bpf_reg_state *regs, u32 regno)  {  	if (WARN_ON(regno >= MAX_BPF_REG)) { -		verbose("mark_reg_unknown(regs, %u)\n", regno); +		verbose(env, "mark_reg_unknown(regs, %u)\n", regno);  		/* Something bad happened, let's kill all regs */  		for (regno = 0; regno < MAX_BPF_REG; regno++)  			__mark_reg_not_init(regs + regno); @@ -613,10 +582,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)  	reg->type = NOT_INIT;  } -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_verifier_env *env, +			      struct bpf_reg_state *regs, u32 regno)  {  	if (WARN_ON(regno >= MAX_BPF_REG)) { -		verbose("mark_reg_not_init(regs, %u)\n", regno); +		verbose(env, "mark_reg_not_init(regs, %u)\n", regno);  		/* Something bad happened, let's kill all regs */  		for (regno = 0; regno < MAX_BPF_REG; regno++)  			__mark_reg_not_init(regs + regno); @@ -625,22 +595,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)  	__mark_reg_not_init(regs + regno);  } -static void init_reg_state(struct bpf_reg_state *regs) +static void init_reg_state(struct bpf_verifier_env *env, +			   struct bpf_reg_state *regs)  {  	int i;  	for (i = 0; i < MAX_BPF_REG; i++) { -		mark_reg_not_init(regs, i); +		mark_reg_not_init(env, regs, i);  		regs[i].live = REG_LIVE_NONE;  	}  	/* frame pointer */  	regs[BPF_REG_FP].type = PTR_TO_STACK; -	mark_reg_known_zero(regs, BPF_REG_FP); +	mark_reg_known_zero(env, regs, BPF_REG_FP);  	/* 1st arg to a function */  	regs[BPF_REG_1].type = PTR_TO_CTX; -	mark_reg_known_zero(regs, BPF_REG_1); +	mark_reg_known_zero(env, regs, BPF_REG_1);  }  enum reg_arg_type { @@ -653,6 +624,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)  {  	struct bpf_verifier_state *parent = state->parent; +	if (regno == BPF_REG_FP) +		/* We don't need to worry about FP liveness because it's read-only */ +		return; +  	while (parent) {  		/* if read wasn't screened by an earlier write ... 
*/  		if (state->regs[regno].live & REG_LIVE_WRITTEN) @@ -667,29 +642,29 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)  static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,  			 enum reg_arg_type t)  { -	struct bpf_reg_state *regs = env->cur_state.regs; +	struct bpf_reg_state *regs = env->cur_state->regs;  	if (regno >= MAX_BPF_REG) { -		verbose("R%d is invalid\n", regno); +		verbose(env, "R%d is invalid\n", regno);  		return -EINVAL;  	}  	if (t == SRC_OP) {  		/* check whether register used as source operand can be read */  		if (regs[regno].type == NOT_INIT) { -			verbose("R%d !read_ok\n", regno); +			verbose(env, "R%d !read_ok\n", regno);  			return -EACCES;  		} -		mark_reg_read(&env->cur_state, regno); +		mark_reg_read(env->cur_state, regno);  	} else {  		/* check whether register used as dest operand can be written to */  		if (regno == BPF_REG_FP) { -			verbose("frame pointer is read only\n"); +			verbose(env, "frame pointer is read only\n");  			return -EACCES;  		}  		regs[regno].live |= REG_LIVE_WRITTEN;  		if (t == DST_OP) -			mark_reg_unknown(regs, regno); +			mark_reg_unknown(env, regs, regno);  	}  	return 0;  } @@ -702,6 +677,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)  	case PTR_TO_STACK:  	case PTR_TO_CTX:  	case PTR_TO_PACKET: +	case PTR_TO_PACKET_META:  	case PTR_TO_PACKET_END:  	case CONST_PTR_TO_MAP:  		return true; @@ -713,35 +689,48 @@ static bool is_spillable_regtype(enum bpf_reg_type type)  /* check_stack_read/write functions track spill/fill of registers,   * stack boundary and alignment are checked in check_mem_access()   */ -static int check_stack_write(struct bpf_verifier_state *state, int off, +static int check_stack_write(struct bpf_verifier_env *env, +			     struct bpf_verifier_state *state, int off,  			     int size, int value_regno)  { -	int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; +	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + +	err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), +				     true); +	if (err) +		return err;  	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,  	 * so it's aligned access and [off, off + size) are within stack limits  	 */ +	if (!env->allow_ptr_leaks && +	    state->stack[spi].slot_type[0] == STACK_SPILL && +	    size != BPF_REG_SIZE) { +		verbose(env, "attempt to corrupt spilled pointer on stack\n"); +		return -EACCES; +	}  	if (value_regno >= 0 &&  	    is_spillable_regtype(state->regs[value_regno].type)) {  		/* register containing pointer is being spilled into stack */  		if (size != BPF_REG_SIZE) { -			verbose("invalid size of register spill\n"); +			verbose(env, "invalid size of register spill\n");  			return -EACCES;  		}  		/* save register state */ -		state->spilled_regs[spi] = state->regs[value_regno]; -		state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; +		state->stack[spi].spilled_ptr = state->regs[value_regno]; +		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;  		for (i = 0; i < BPF_REG_SIZE; i++) -			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; +			state->stack[spi].slot_type[i] = STACK_SPILL;  	} else {  		/* regular write of data into stack */ -		state->spilled_regs[spi] = (struct bpf_reg_state) {}; +		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};  		for (i = 0; i < size; i++) -			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; +			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = +				STACK_MISC;  	}  	return 0;  } @@ -752,66 
+741,72 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo  	while (parent) {  		/* if read wasn't screened by an earlier write ... */ -		if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) +		if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)  			break;  		/* ... then we depend on parent's value */ -		parent->spilled_regs[slot].live |= REG_LIVE_READ; +		parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;  		state = parent;  		parent = state->parent;  	}  } -static int check_stack_read(struct bpf_verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_env *env, +			    struct bpf_verifier_state *state, int off, int size,  			    int value_regno)  { -	u8 *slot_type; -	int i, spi; +	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; +	u8 *stype; -	slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; +	if (state->allocated_stack <= slot) { +		verbose(env, "invalid read from stack off %d+0 size %d\n", +			off, size); +		return -EACCES; +	} +	stype = state->stack[spi].slot_type; -	if (slot_type[0] == STACK_SPILL) { +	if (stype[0] == STACK_SPILL) {  		if (size != BPF_REG_SIZE) { -			verbose("invalid size of register spill\n"); +			verbose(env, "invalid size of register spill\n");  			return -EACCES;  		}  		for (i = 1; i < BPF_REG_SIZE; i++) { -			if (slot_type[i] != STACK_SPILL) { -				verbose("corrupted spill memory\n"); +			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { +				verbose(env, "corrupted spill memory\n");  				return -EACCES;  			}  		} -		spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; -  		if (value_regno >= 0) {  			/* restore register state from stack */ -			state->regs[value_regno] = state->spilled_regs[spi]; +			state->regs[value_regno] = state->stack[spi].spilled_ptr;  			mark_stack_slot_read(state, spi);  		}  		return 0;  	} else {  		for (i = 0; i < size; i++) { -			if (slot_type[i] != STACK_MISC) { -				verbose("invalid read from stack off %d+%d size %d\n", +			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { +				verbose(env, "invalid read from stack off %d+%d size %d\n",  					off, i, size);  				return -EACCES;  			}  		}  		if (value_regno >= 0)  			/* have read misc data from the stack */ -			mark_reg_unknown(state->regs, value_regno); +			mark_reg_unknown(env, state->regs, value_regno);  		return 0;  	}  }  /* check read/write into map element returned by bpf_map_lookup_elem() */  static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, -			    int size) +			      int size, bool zero_size_allowed)  { -	struct bpf_map *map = env->cur_state.regs[regno].map_ptr; +	struct bpf_reg_state *regs = cur_regs(env); +	struct bpf_map *map = regs[regno].map_ptr; -	if (off < 0 || size <= 0 || off + size > map->value_size) { -		verbose("invalid access to map value, value_size=%d off=%d size=%d\n", +	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || +	    off + size > map->value_size) { +		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",  			map->value_size, off, size);  		return -EACCES;  	} @@ -820,9 +815,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,  /* check read/write into a map element with possible variable offset */  static int check_map_access(struct bpf_verifier_env *env, u32 regno, -				int off, int size) +			    int off, int size, bool zero_size_allowed)  { -	struct bpf_verifier_state *state = &env->cur_state; +	struct bpf_verifier_state *state = env->cur_state;  	struct 
bpf_reg_state *reg = &state->regs[regno];  	int err; @@ -830,8 +825,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,  	 * need to try adding each of min_value and max_value to off  	 * to make sure our theoretical access will be safe.  	 */ -	if (log_level) -		print_verifier_state(state); +	if (env->log.level) +		print_verifier_state(env, state);  	/* The minimum value is only important with signed  	 * comparisons where we can't assume the floor of a  	 * value is 0.  If we are using signed variables for our @@ -839,13 +834,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,  	 * will have a set floor within our range.  	 */  	if (reg->smin_value < 0) { -		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", +		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",  			regno);  		return -EACCES;  	} -	err = __check_map_access(env, regno, reg->smin_value + off, size); +	err = __check_map_access(env, regno, reg->smin_value + off, size, +				 zero_size_allowed);  	if (err) { -		verbose("R%d min value is outside of the array range\n", regno); +		verbose(env, "R%d min value is outside of the array range\n", +			regno);  		return err;  	} @@ -854,13 +851,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,  	 * If reg->umax_value + off could overflow, treat that as unbounded too.  	 */  	if (reg->umax_value >= BPF_MAX_VAR_OFF) { -		verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", +		verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",  			regno);  		return -EACCES;  	} -	err = __check_map_access(env, regno, reg->umax_value + off, size); +	err = __check_map_access(env, regno, reg->umax_value + off, size, +				 zero_size_allowed);  	if (err) -		verbose("R%d max value is outside of the array range\n", regno); +		verbose(env, "R%d max value is outside of the array range\n", +			regno);  	return err;  } @@ -893,13 +892,14 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,  }  static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, -				 int off, int size) +				 int off, int size, bool zero_size_allowed)  { -	struct bpf_reg_state *regs = env->cur_state.regs; +	struct bpf_reg_state *regs = cur_regs(env);  	struct bpf_reg_state *reg = ®s[regno]; -	if (off < 0 || size <= 0 || (u64)off + size > reg->range) { -		verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", +	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || +	    (u64)off + size > reg->range) { +		verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",  			off, size, regno, reg->id, reg->off, reg->range);  		return -EACCES;  	} @@ -907,9 +907,9 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,  }  static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, -			       int size) +			       int size, bool zero_size_allowed)  { -	struct bpf_reg_state *regs = env->cur_state.regs; +	struct bpf_reg_state *regs = cur_regs(env);  	struct bpf_reg_state *reg = ®s[regno];  	int err; @@ -922,13 +922,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,  	 * detail to prove they're safe.  	 
*/  	if (reg->smin_value < 0) { -		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", +		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",  			regno);  		return -EACCES;  	} -	err = __check_packet_access(env, regno, off, size); +	err = __check_packet_access(env, regno, off, size, zero_size_allowed);  	if (err) { -		verbose("R%d offset is outside of the packet\n", regno); +		verbose(env, "R%d offset is outside of the packet\n", regno);  		return err;  	}  	return err; @@ -942,12 +942,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,  		.reg_type = *reg_type,  	}; -	/* for analyzer ctx accesses are already validated and converted */ -	if (env->analyzer_ops) -		return 0; - -	if (env->prog->aux->ops->is_valid_access && -	    env->prog->aux->ops->is_valid_access(off, size, t, &info)) { +	if (env->ops->is_valid_access && +	    env->ops->is_valid_access(off, size, t, &info)) {  		/* A non zero info.ctx_field_size indicates that this field is a  		 * candidate for later verifier transformation to load the whole  		 * field and then apply a mask when accessed with a narrower @@ -955,16 +951,16 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,  		 * will only allow for whole field access and rejects any other  		 * type of narrower access.  		 */ -		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;  		*reg_type = info.reg_type; +		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;  		/* remember the offset of last byte accessed in ctx */  		if (env->prog->aux->max_ctx_offset < off + size)  			env->prog->aux->max_ctx_offset = off + size;  		return 0;  	} -	verbose("invalid bpf_context access off=%d size=%d\n", off, size); +	verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);  	return -EACCES;  } @@ -979,10 +975,11 @@ static bool __is_pointer_value(bool allow_ptr_leaks,  static bool is_pointer_value(struct bpf_verifier_env *env, int regno)  { -	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); +	return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);  } -static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, +				   const struct bpf_reg_state *reg,  				   int off, int size, bool strict)  {  	struct tnum reg_off; @@ -1007,7 +1004,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,  		char tn_buf[48];  		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); -		verbose("misaligned packet access off %d+%s+%d+%d size %d\n", +		verbose(env, +			"misaligned packet access off %d+%s+%d+%d size %d\n",  			ip_align, tn_buf, reg->off, off, size);  		return -EACCES;  	} @@ -1015,7 +1013,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,  	return 0;  } -static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(struct bpf_verifier_env *env, +				       const struct bpf_reg_state *reg,  				       const char *pointer_desc,  				       int off, int size, bool strict)  { @@ -1030,7 +1029,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,  		char tn_buf[48];  		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); -		verbose("misaligned %saccess off %s+%d+%d size %d\n", +		verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",  			pointer_desc, tn_buf, reg->off, off, 
size);  		return -EACCES;  	} @@ -1047,8 +1046,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,  	switch (reg->type) {  	case PTR_TO_PACKET: -		/* special case, because of NET_IP_ALIGN */ -		return check_pkt_ptr_alignment(reg, off, size, strict); +	case PTR_TO_PACKET_META: +		/* Special case, because of NET_IP_ALIGN. Given metadata sits +		 * right in front, treat it the very same way. +		 */ +		return check_pkt_ptr_alignment(env, reg, off, size, strict);  	case PTR_TO_MAP_VALUE:  		pointer_desc = "value ";  		break; @@ -1061,7 +1063,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,  	default:  		break;  	} -	return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); +	return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, +					   strict);  }  /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -1074,8 +1077,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  			    int bpf_size, enum bpf_access_type t,  			    int value_regno)  { -	struct bpf_verifier_state *state = &env->cur_state; -	struct bpf_reg_state *reg = &state->regs[regno]; +	struct bpf_verifier_state *state = env->cur_state; +	struct bpf_reg_state *regs = cur_regs(env); +	struct bpf_reg_state *reg = regs + regno;  	int size, err = 0;  	size = bpf_size_to_bytes(bpf_size); @@ -1093,48 +1097,55 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  	if (reg->type == PTR_TO_MAP_VALUE) {  		if (t == BPF_WRITE && value_regno >= 0 &&  		    is_pointer_value(env, value_regno)) { -			verbose("R%d leaks addr into map\n", value_regno); +			verbose(env, "R%d leaks addr into map\n", value_regno);  			return -EACCES;  		} -		err = check_map_access(env, regno, off, size); +		err = check_map_access(env, regno, off, size, false);  		if (!err && t == BPF_READ && value_regno >= 0) -			mark_reg_unknown(state->regs, value_regno); +			mark_reg_unknown(env, regs, value_regno);  	} else if (reg->type == PTR_TO_CTX) {  		enum bpf_reg_type reg_type = SCALAR_VALUE;  		if (t == BPF_WRITE && value_regno >= 0 &&  		    is_pointer_value(env, value_regno)) { -			verbose("R%d leaks addr into ctx\n", value_regno); +			verbose(env, "R%d leaks addr into ctx\n", value_regno);  			return -EACCES;  		}  		/* ctx accesses must be at a fixed offset, so that we can  		 * determine what type of data were returned.  		 */ -		if (!tnum_is_const(reg->var_off)) { +		if (reg->off) { +			verbose(env, +				"dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", +				regno, reg->off, off - reg->off); +			return -EACCES; +		} +		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {  			char tn_buf[48];  			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); -			verbose("variable ctx access var_off=%s off=%d size=%d", +			verbose(env, +				"variable ctx access var_off=%s off=%d size=%d",  				tn_buf, off, size);  			return -EACCES;  		} -		off += reg->var_off.value;  		err = check_ctx_access(env, insn_idx, off, size, t, ®_type);  		if (!err && t == BPF_READ && value_regno >= 0) {  			/* ctx access returns either a scalar, or a -			 * PTR_TO_PACKET[_END].  In the latter case, we know -			 * the offset is zero. +			 * PTR_TO_PACKET[_META,_END]. In the latter +			 * case, we know the offset is zero.  			 
*/  			if (reg_type == SCALAR_VALUE) -				mark_reg_unknown(state->regs, value_regno); +				mark_reg_unknown(env, regs, value_regno);  			else -				mark_reg_known_zero(state->regs, value_regno); -			state->regs[value_regno].id = 0; -			state->regs[value_regno].off = 0; -			state->regs[value_regno].range = 0; -			state->regs[value_regno].type = reg_type; +				mark_reg_known_zero(env, regs, +						    value_regno); +			regs[value_regno].id = 0; +			regs[value_regno].off = 0; +			regs[value_regno].range = 0; +			regs[value_regno].type = reg_type;  		}  	} else if (reg->type == PTR_TO_STACK) { @@ -1146,55 +1157,52 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  			char tn_buf[48];  			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); -			verbose("variable stack access var_off=%s off=%d size=%d", +			verbose(env, "variable stack access var_off=%s off=%d size=%d",  				tn_buf, off, size);  			return -EACCES;  		}  		off += reg->var_off.value;  		if (off >= 0 || off < -MAX_BPF_STACK) { -			verbose("invalid stack off=%d size=%d\n", off, size); +			verbose(env, "invalid stack off=%d size=%d\n", off, +				size);  			return -EACCES;  		}  		if (env->prog->aux->stack_depth < -off)  			env->prog->aux->stack_depth = -off; -		if (t == BPF_WRITE) { -			if (!env->allow_ptr_leaks && -			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && -			    size != BPF_REG_SIZE) { -				verbose("attempt to corrupt spilled pointer on stack\n"); -				return -EACCES; -			} -			err = check_stack_write(state, off, size, value_regno); -		} else { -			err = check_stack_read(state, off, size, value_regno); -		} -	} else if (reg->type == PTR_TO_PACKET) { +		if (t == BPF_WRITE) +			err = check_stack_write(env, state, off, size, +						value_regno); +		else +			err = check_stack_read(env, state, off, size, +					       value_regno); +	} else if (reg_is_pkt_pointer(reg)) {  		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { -			verbose("cannot write into packet\n"); +			verbose(env, "cannot write into packet\n");  			return -EACCES;  		}  		if (t == BPF_WRITE && value_regno >= 0 &&  		    is_pointer_value(env, value_regno)) { -			verbose("R%d leaks addr into packet\n", value_regno); +			verbose(env, "R%d leaks addr into packet\n", +				value_regno);  			return -EACCES;  		} -		err = check_packet_access(env, regno, off, size); +		err = check_packet_access(env, regno, off, size, false);  		if (!err && t == BPF_READ && value_regno >= 0) -			mark_reg_unknown(state->regs, value_regno); +			mark_reg_unknown(env, regs, value_regno);  	} else { -		verbose("R%d invalid mem access '%s'\n", -			regno, reg_type_str[reg->type]); +		verbose(env, "R%d invalid mem access '%s'\n", regno, +			reg_type_str[reg->type]);  		return -EACCES;  	}  	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && -	    state->regs[value_regno].type == SCALAR_VALUE) { +	    regs[value_regno].type == SCALAR_VALUE) {  		/* b/h/w load zero-extends, mark upper bits as known 0 */ -		state->regs[value_regno].var_off = tnum_cast( -					state->regs[value_regno].var_off, size); -		__update_reg_bounds(&state->regs[value_regno]); +		regs[value_regno].var_off = +			tnum_cast(regs[value_regno].var_off, size); +		__update_reg_bounds(®s[value_regno]);  	}  	return err;  } @@ -1205,7 +1213,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins  	if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||  	    insn->imm != 0) { -		verbose("BPF_XADD uses 
reserved fields\n"); +		verbose(env, "BPF_XADD uses reserved fields\n");  		return -EINVAL;  	} @@ -1220,7 +1228,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins  		return err;  	if (is_pointer_value(env, insn->src_reg)) { -		verbose("R%d leaks addr into mem\n", insn->src_reg); +		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);  		return -EACCES;  	} @@ -1251,9 +1259,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  				int access_size, bool zero_size_allowed,  				struct bpf_call_arg_meta *meta)  { -	struct bpf_verifier_state *state = &env->cur_state; +	struct bpf_verifier_state *state = env->cur_state;  	struct bpf_reg_state *regs = state->regs; -	int off, i; +	int off, i, slot, spi;  	if (regs[regno].type != PTR_TO_STACK) {  		/* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1261,7 +1269,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  		    register_is_null(regs[regno]))  			return 0; -		verbose("R%d type=%s expected=%s\n", regno, +		verbose(env, "R%d type=%s expected=%s\n", regno,  			reg_type_str[regs[regno].type],  			reg_type_str[PTR_TO_STACK]);  		return -EACCES; @@ -1272,13 +1280,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  		char tn_buf[48];  		tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); -		verbose("invalid variable stack read R%d var_off=%s\n", +		verbose(env, "invalid variable stack read R%d var_off=%s\n",  			regno, tn_buf);  	}  	off = regs[regno].off + regs[regno].var_off.value;  	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || -	    access_size <= 0) { -		verbose("invalid stack type R%d off=%d access_size=%d\n", +	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) { +		verbose(env, "invalid stack type R%d off=%d access_size=%d\n",  			regno, off, access_size);  		return -EACCES;  	} @@ -1293,8 +1301,12 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  	}  	for (i = 0; i < access_size; i++) { -		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { -			verbose("invalid indirect read from stack off %d+%d size %d\n", +		slot = -(off + i) - 1; +		spi = slot / BPF_REG_SIZE; +		if (state->allocated_stack <= slot || +		    state->stack[spi].slot_type[slot % BPF_REG_SIZE] != +			STACK_MISC) { +			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",  				off, i, access_size);  			return -EACCES;  		} @@ -1306,13 +1318,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,  				   int access_size, bool zero_size_allowed,  				   struct bpf_call_arg_meta *meta)  { -	struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; +	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno];  	switch (reg->type) {  	case PTR_TO_PACKET: -		return check_packet_access(env, regno, reg->off, access_size); +	case PTR_TO_PACKET_META: +		return check_packet_access(env, regno, reg->off, access_size, +					   zero_size_allowed);  	case PTR_TO_MAP_VALUE: -		return check_map_access(env, regno, reg->off, access_size); +		return check_map_access(env, regno, reg->off, access_size, +					zero_size_allowed);  	default: /* scalar_value|ptr_to_stack or invalid ptr */  		return check_stack_boundary(env, regno, access_size,  					    zero_size_allowed, meta); @@ -1323,7 +1338,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  			  enum bpf_arg_type arg_type,  			  struct bpf_call_arg_meta *meta) 
 { -	struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; +	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno];  	enum bpf_reg_type expected_type, type = reg->type;  	int err = 0; @@ -1336,22 +1351,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  	if (arg_type == ARG_ANYTHING) {  		if (is_pointer_value(env, regno)) { -			verbose("R%d leaks addr into helper function\n", regno); +			verbose(env, "R%d leaks addr into helper function\n", +				regno);  			return -EACCES;  		}  		return 0;  	} -	if (type == PTR_TO_PACKET && +	if (type_is_pkt_pointer(type) &&  	    !may_access_direct_pkt_data(env, meta, BPF_READ)) { -		verbose("helper access to the packet is not allowed\n"); +		verbose(env, "helper access to the packet is not allowed\n");  		return -EACCES;  	}  	if (arg_type == ARG_PTR_TO_MAP_KEY ||  	    arg_type == ARG_PTR_TO_MAP_VALUE) {  		expected_type = PTR_TO_STACK; -		if (type != PTR_TO_PACKET && type != expected_type) +		if (!type_is_pkt_pointer(type) && +		    type != expected_type)  			goto err_type;  	} else if (arg_type == ARG_CONST_SIZE ||  		   arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1367,20 +1384,23 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		if (type != expected_type)  			goto err_type;  	} else if (arg_type == ARG_PTR_TO_MEM || +		   arg_type == ARG_PTR_TO_MEM_OR_NULL ||  		   arg_type == ARG_PTR_TO_UNINIT_MEM) {  		expected_type = PTR_TO_STACK;  		/* One exception here. In case function allows for NULL to be  		 * passed in as argument, it's a SCALAR_VALUE type. Final test  		 * happens during stack boundary checking.  		 */ -		if (register_is_null(*reg)) +		if (register_is_null(*reg) && +		    arg_type == ARG_PTR_TO_MEM_OR_NULL)  			/* final test in check_stack_boundary() */; -		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && +		else if (!type_is_pkt_pointer(type) && +			 type != PTR_TO_MAP_VALUE &&  			 type != expected_type)  			goto err_type;  		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;  	} else { -		verbose("unsupported arg_type %d\n", arg_type); +		verbose(env, "unsupported arg_type %d\n", arg_type);  		return -EFAULT;  	} @@ -1398,12 +1418,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  			 * we have to check map_key here. 
Otherwise it means  			 * that kernel subsystem misconfigured verifier  			 */ -			verbose("invalid map_ptr to access map->key\n"); +			verbose(env, "invalid map_ptr to access map->key\n");  			return -EACCES;  		} -		if (type == PTR_TO_PACKET) +		if (type_is_pkt_pointer(type))  			err = check_packet_access(env, regno, reg->off, -						  meta->map_ptr->key_size); +						  meta->map_ptr->key_size, +						  false);  		else  			err = check_stack_boundary(env, regno,  						   meta->map_ptr->key_size, @@ -1414,12 +1435,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		 */  		if (!meta->map_ptr) {  			/* kernel subsystem misconfigured verifier */ -			verbose("invalid map_ptr to access map->value\n"); +			verbose(env, "invalid map_ptr to access map->value\n");  			return -EACCES;  		} -		if (type == PTR_TO_PACKET) +		if (type_is_pkt_pointer(type))  			err = check_packet_access(env, regno, reg->off, -						  meta->map_ptr->value_size); +						  meta->map_ptr->value_size, +						  false);  		else  			err = check_stack_boundary(env, regno,  						   meta->map_ptr->value_size, @@ -1434,7 +1456,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		 */  		if (regno == 0) {  			/* kernel subsystem misconfigured verifier */ -			verbose("ARG_CONST_SIZE cannot be first argument\n"); +			verbose(env, +				"ARG_CONST_SIZE cannot be first argument\n");  			return -EACCES;  		} @@ -1451,7 +1474,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  			meta = NULL;  		if (reg->smin_value < 0) { -			verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", +			verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",  				regno);  			return -EACCES;  		} @@ -1465,7 +1488,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		}  		if (reg->umax_value >= BPF_MAX_VAR_SIZ) { -			verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", +			verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",  				regno);  			return -EACCES;  		} @@ -1476,12 +1499,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  	return err;  err_type: -	verbose("R%d type=%s expected=%s\n", regno, +	verbose(env, "R%d type=%s expected=%s\n", regno,  		reg_type_str[type], reg_type_str[expected_type]);  	return -EACCES;  } -static int check_map_func_compatibility(struct bpf_map *map, int func_id) +static int check_map_func_compatibility(struct bpf_verifier_env *env, +					struct bpf_map *map, int func_id)  {  	if (!map)  		return 0; @@ -1494,7 +1518,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)  		break;  	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:  		if (func_id != BPF_FUNC_perf_event_read && -		    func_id != BPF_FUNC_perf_event_output) +		    func_id != BPF_FUNC_perf_event_output && +		    func_id != BPF_FUNC_perf_event_read_value)  			goto error;  		break;  	case BPF_MAP_TYPE_STACK_TRACE: @@ -1514,6 +1539,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)  		if (func_id != BPF_FUNC_redirect_map)  			goto error;  		break; +	/* Restrict bpf side of cpumap, open when use-cases appear */ +	case BPF_MAP_TYPE_CPUMAP: +		if (func_id != BPF_FUNC_redirect_map) +			goto error; +		break;  	case BPF_MAP_TYPE_ARRAY_OF_MAPS:  	case BPF_MAP_TYPE_HASH_OF_MAPS:  		if (func_id != BPF_FUNC_map_lookup_elem) @@ -1537,6 +1567,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)  	
	break;  	case BPF_FUNC_perf_event_read:  	case BPF_FUNC_perf_event_output: +	case BPF_FUNC_perf_event_read_value:  		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)  			goto error;  		break; @@ -1550,7 +1581,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)  			goto error;  		break;  	case BPF_FUNC_redirect_map: -		if (map->map_type != BPF_MAP_TYPE_DEVMAP) +		if (map->map_type != BPF_MAP_TYPE_DEVMAP && +		    map->map_type != BPF_MAP_TYPE_CPUMAP)  			goto error;  		break;  	case BPF_FUNC_sk_redirect_map: @@ -1567,7 +1599,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)  	return 0;  error: -	verbose("cannot pass map_type %d into func %s#%d\n", +	verbose(env, "cannot pass map_type %d into func %s#%d\n",  		map->map_type, func_id_name(func_id), func_id);  	return -EINVAL;  } @@ -1590,57 +1622,55 @@ static int check_raw_mode(const struct bpf_func_proto *fn)  	return count > 1 ? -EINVAL : 0;  } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE.   */  static void clear_all_pkt_pointers(struct bpf_verifier_env *env)  { -	struct bpf_verifier_state *state = &env->cur_state; +	struct bpf_verifier_state *state = env->cur_state;  	struct bpf_reg_state *regs = state->regs, *reg;  	int i;  	for (i = 0; i < MAX_BPF_REG; i++) -		if (regs[i].type == PTR_TO_PACKET || -		    regs[i].type == PTR_TO_PACKET_END) -			mark_reg_unknown(regs, i); +		if (reg_is_pkt_pointer_any(®s[i])) +			mark_reg_unknown(env, regs, i); -	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { -		if (state->stack_slot_type[i] != STACK_SPILL) -			continue; -		reg = &state->spilled_regs[i / BPF_REG_SIZE]; -		if (reg->type != PTR_TO_PACKET && -		    reg->type != PTR_TO_PACKET_END) +	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { +		if (state->stack[i].slot_type[0] != STACK_SPILL)  			continue; -		__mark_reg_unknown(reg); +		reg = &state->stack[i].spilled_ptr; +		if (reg_is_pkt_pointer_any(reg)) +			__mark_reg_unknown(reg);  	}  }  static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  { -	struct bpf_verifier_state *state = &env->cur_state;  	const struct bpf_func_proto *fn = NULL; -	struct bpf_reg_state *regs = state->regs; +	struct bpf_reg_state *regs;  	struct bpf_call_arg_meta meta;  	bool changes_data;  	int i, err;  	/* find function prototype */  	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { -		verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); +		verbose(env, "invalid func %s#%d\n", func_id_name(func_id), +			func_id);  		return -EINVAL;  	} -	if (env->prog->aux->ops->get_func_proto) -		fn = env->prog->aux->ops->get_func_proto(func_id); +	if (env->ops->get_func_proto) +		fn = env->ops->get_func_proto(func_id);  	if (!fn) { -		verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); +		verbose(env, "unknown func %s#%d\n", func_id_name(func_id), +			func_id);  		return -EINVAL;  	}  	/* eBPF programs must be GPL compatible to use GPL-ed functions */  	if (!env->prog->gpl_compatible && fn->gpl_only) { -		verbose("cannot call GPL only function from proprietary program\n"); +		verbose(env, "cannot call GPL only function from proprietary program\n");  		return -EINVAL;  	} @@ -1654,7 +1684,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  	 */  	err = check_raw_mode(fn);  	if 
(err) { -		verbose("kernel subsystem misconfigured func %s#%d\n", +		verbose(env, "kernel subsystem misconfigured func %s#%d\n",  			func_id_name(func_id), func_id);  		return err;  	} @@ -1685,16 +1715,17 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  			return err;  	} +	regs = cur_regs(env);  	/* reset caller saved regs */  	for (i = 0; i < CALLER_SAVED_REGS; i++) { -		mark_reg_not_init(regs, caller_saved[i]); +		mark_reg_not_init(env, regs, caller_saved[i]);  		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);  	}  	/* update return register (already marked as written above) */  	if (fn->ret_type == RET_INTEGER) {  		/* sets type to SCALAR_VALUE */ -		mark_reg_unknown(regs, BPF_REG_0); +		mark_reg_unknown(env, regs, BPF_REG_0);  	} else if (fn->ret_type == RET_VOID) {  		regs[BPF_REG_0].type = NOT_INIT;  	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { @@ -1702,14 +1733,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;  		/* There is no offset yet applied, variable or fixed */ -		mark_reg_known_zero(regs, BPF_REG_0); +		mark_reg_known_zero(env, regs, BPF_REG_0);  		regs[BPF_REG_0].off = 0;  		/* remember map_ptr, so that check_map_access()  		 * can check 'value_size' boundary of memory access  		 * to map element returned from bpf_map_lookup_elem()  		 */  		if (meta.map_ptr == NULL) { -			verbose("kernel subsystem misconfigured verifier\n"); +			verbose(env, +				"kernel subsystem misconfigured verifier\n");  			return -EINVAL;  		}  		regs[BPF_REG_0].map_ptr = meta.map_ptr; @@ -1720,12 +1752,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  		else if (insn_aux->map_ptr != meta.map_ptr)  			insn_aux->map_ptr = BPF_MAP_PTR_POISON;  	} else { -		verbose("unknown return type %d of func %s#%d\n", +		verbose(env, "unknown return type %d of func %s#%d\n",  			fn->ret_type, func_id_name(func_id), func_id);  		return -EINVAL;  	} -	err = check_map_func_compatibility(meta.map_ptr, func_id); +	err = check_map_func_compatibility(env, meta.map_ptr, func_id);  	if (err)  		return err; @@ -1772,7 +1804,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  				   const struct bpf_reg_state *ptr_reg,  				   const struct bpf_reg_state *off_reg)  { -	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; +	struct bpf_reg_state *regs = cur_regs(env), *dst_reg;  	bool known = tnum_is_const(off_reg->var_off);  	s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,  	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -1784,39 +1816,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  	dst_reg = ®s[dst];  	if (WARN_ON_ONCE(known && (smin_val != smax_val))) { -		print_verifier_state(&env->cur_state); -		verbose("verifier internal error: known but bad sbounds\n"); +		print_verifier_state(env, env->cur_state); +		verbose(env, +			"verifier internal error: known but bad sbounds\n");  		return -EINVAL;  	}  	if (WARN_ON_ONCE(known && (umin_val != umax_val))) { -		print_verifier_state(&env->cur_state); -		verbose("verifier internal error: known but bad ubounds\n"); +		print_verifier_state(env, env->cur_state); +		verbose(env, +			"verifier internal error: known but bad ubounds\n");  		return -EINVAL;  	}  	if (BPF_CLASS(insn->code) != BPF_ALU64) {  		/* 32-bit ALU ops on pointers produce (meaningless) scalars */  		if (!env->allow_ptr_leaks) -			verbose("R%d 32-bit 
pointer arithmetic prohibited\n", +			verbose(env, +				"R%d 32-bit pointer arithmetic prohibited\n",  				dst);  		return -EACCES;  	}  	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {  		if (!env->allow_ptr_leaks) -			verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", +			verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",  				dst);  		return -EACCES;  	}  	if (ptr_reg->type == CONST_PTR_TO_MAP) {  		if (!env->allow_ptr_leaks) -			verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", +			verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",  				dst);  		return -EACCES;  	}  	if (ptr_reg->type == PTR_TO_PACKET_END) {  		if (!env->allow_ptr_leaks) -			verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", +			verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",  				dst);  		return -EACCES;  	} @@ -1871,7 +1906,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  		}  		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);  		dst_reg->off = ptr_reg->off; -		if (ptr_reg->type == PTR_TO_PACKET) { +		if (reg_is_pkt_pointer(ptr_reg)) {  			dst_reg->id = ++env->id_gen;  			/* something was added to pkt_ptr, set range to zero */  			dst_reg->range = 0; @@ -1881,7 +1916,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  		if (dst_reg == off_reg) {  			/* scalar -= pointer.  Creates an unknown scalar */  			if (!env->allow_ptr_leaks) -				verbose("R%d tried to subtract pointer from scalar\n", +				verbose(env, "R%d tried to subtract pointer from scalar\n",  					dst);  			return -EACCES;  		} @@ -1891,7 +1926,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  		 */  		if (ptr_reg->type == PTR_TO_STACK) {  			if (!env->allow_ptr_leaks) -				verbose("R%d subtraction from stack pointer prohibited\n", +				verbose(env, "R%d subtraction from stack pointer prohibited\n",  					dst);  			return -EACCES;  		} @@ -1931,7 +1966,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  		}  		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);  		dst_reg->off = ptr_reg->off; -		if (ptr_reg->type == PTR_TO_PACKET) { +		if (reg_is_pkt_pointer(ptr_reg)) {  			dst_reg->id = ++env->id_gen;  			/* something was added to pkt_ptr, set range to zero */  			if (smin_val < 0) @@ -1946,13 +1981,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  		 * ptr &= ~3 which would reduce min_value by 3.)  		 */  		if (!env->allow_ptr_leaks) -			verbose("R%d bitwise operator %s on pointer prohibited\n", +			verbose(env, "R%d bitwise operator %s on pointer prohibited\n",  				dst, bpf_alu_string[opcode >> 4]);  		return -EACCES;  	default:  		/* other operators (e.g. 
MUL,LSH) produce non-pointer results */  		if (!env->allow_ptr_leaks) -			verbose("R%d pointer arithmetic with %s operator prohibited\n", +			verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",  				dst, bpf_alu_string[opcode >> 4]);  		return -EACCES;  	} @@ -1968,7 +2003,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  				      struct bpf_reg_state *dst_reg,  				      struct bpf_reg_state src_reg)  { -	struct bpf_reg_state *regs = env->cur_state.regs; +	struct bpf_reg_state *regs = cur_regs(env);  	u8 opcode = BPF_OP(insn->code);  	bool src_known, dst_known;  	s64 smin_val, smax_val; @@ -2118,7 +2153,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  			/* Shifts greater than 63 are undefined.  This includes  			 * shifts by a negative number.  			 */ -			mark_reg_unknown(regs, insn->dst_reg); +			mark_reg_unknown(env, regs, insn->dst_reg);  			break;  		}  		/* We lose all sign bit information (except what we can pick @@ -2146,7 +2181,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  			/* Shifts greater than 63 are undefined.  This includes  			 * shifts by a negative number.  			 */ -			mark_reg_unknown(regs, insn->dst_reg); +			mark_reg_unknown(env, regs, insn->dst_reg);  			break;  		}  		/* BPF_RSH is an unsigned shift, so make the appropriate casts */ @@ -2174,7 +2209,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  		__update_reg_bounds(dst_reg);  		break;  	default: -		mark_reg_unknown(regs, insn->dst_reg); +		mark_reg_unknown(env, regs, insn->dst_reg);  		break;  	} @@ -2189,7 +2224,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  				   struct bpf_insn *insn)  { -	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; +	struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;  	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};  	u8 opcode = BPF_OP(insn->code);  	int rc; @@ -2206,12 +2241,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  				 * an arbitrary scalar.  				 
*/  				if (!env->allow_ptr_leaks) { -					verbose("R%d pointer %s pointer prohibited\n", +					verbose(env, "R%d pointer %s pointer prohibited\n",  						insn->dst_reg,  						bpf_alu_string[opcode >> 4]);  					return -EACCES;  				} -				mark_reg_unknown(regs, insn->dst_reg); +				mark_reg_unknown(env, regs, insn->dst_reg);  				return 0;  			} else {  				/* scalar += pointer @@ -2263,13 +2298,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  	/* Got here implies adding two SCALAR_VALUEs */  	if (WARN_ON_ONCE(ptr_reg)) { -		print_verifier_state(&env->cur_state); -		verbose("verifier internal error: unexpected ptr_reg\n"); +		print_verifier_state(env, env->cur_state); +		verbose(env, "verifier internal error: unexpected ptr_reg\n");  		return -EINVAL;  	}  	if (WARN_ON(!src_reg)) { -		print_verifier_state(&env->cur_state); -		verbose("verifier internal error: no src_reg\n"); +		print_verifier_state(env, env->cur_state); +		verbose(env, "verifier internal error: no src_reg\n");  		return -EINVAL;  	}  	return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -2278,7 +2313,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  /* check validity of 32-bit and 64-bit arithmetic operations */  static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  { -	struct bpf_reg_state *regs = env->cur_state.regs; +	struct bpf_reg_state *regs = cur_regs(env);  	u8 opcode = BPF_OP(insn->code);  	int err; @@ -2287,14 +2322,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  			if (BPF_SRC(insn->code) != 0 ||  			    insn->src_reg != BPF_REG_0 ||  			    insn->off != 0 || insn->imm != 0) { -				verbose("BPF_NEG uses reserved fields\n"); +				verbose(env, "BPF_NEG uses reserved fields\n");  				return -EINVAL;  			}  		} else {  			if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||  			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||  			    BPF_CLASS(insn->code) == BPF_ALU64) { -				verbose("BPF_END uses reserved fields\n"); +				verbose(env, "BPF_END uses reserved fields\n");  				return -EINVAL;  			}  		} @@ -2305,7 +2340,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  			return err;  		if (is_pointer_value(env, insn->dst_reg)) { -			verbose("R%d pointer arithmetic prohibited\n", +			verbose(env, "R%d pointer arithmetic prohibited\n",  				insn->dst_reg);  			return -EACCES;  		} @@ -2319,7 +2354,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  		if (BPF_SRC(insn->code) == BPF_X) {  			if (insn->imm != 0 || insn->off != 0) { -				verbose("BPF_MOV uses reserved fields\n"); +				verbose(env, "BPF_MOV uses reserved fields\n");  				return -EINVAL;  			} @@ -2329,7 +2364,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  				return err;  		} else {  			if (insn->src_reg != BPF_REG_0 || insn->off != 0) { -				verbose("BPF_MOV uses reserved fields\n"); +				verbose(env, "BPF_MOV uses reserved fields\n");  				return -EINVAL;  			}  		} @@ -2345,14 +2380,16 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  				 * copy register state to dest reg  				 */  				regs[insn->dst_reg] = regs[insn->src_reg]; +				regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;  			} else {  				/* R1 = (u32) R2 */  				if (is_pointer_value(env, insn->src_reg)) { -					verbose("R%d partial copy of pointer\n", +					verbose(env, +						"R%d partial copy of pointer\n",  						insn->src_reg);  			
		return -EACCES;  				} -				mark_reg_unknown(regs, insn->dst_reg); +				mark_reg_unknown(env, regs, insn->dst_reg);  				/* high 32 bits are known zero. */  				regs[insn->dst_reg].var_off = tnum_cast(  						regs[insn->dst_reg].var_off, 4); @@ -2367,14 +2404,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  		}  	} else if (opcode > BPF_END) { -		verbose("invalid BPF_ALU opcode %x\n", opcode); +		verbose(env, "invalid BPF_ALU opcode %x\n", opcode);  		return -EINVAL;  	} else {	/* all other ALU ops: and, sub, xor, add, ... */  		if (BPF_SRC(insn->code) == BPF_X) {  			if (insn->imm != 0 || insn->off != 0) { -				verbose("BPF_ALU uses reserved fields\n"); +				verbose(env, "BPF_ALU uses reserved fields\n");  				return -EINVAL;  			}  			/* check src1 operand */ @@ -2383,7 +2420,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  				return err;  		} else {  			if (insn->src_reg != BPF_REG_0 || insn->off != 0) { -				verbose("BPF_ALU uses reserved fields\n"); +				verbose(env, "BPF_ALU uses reserved fields\n");  				return -EINVAL;  			}  		} @@ -2395,7 +2432,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  		if ((opcode == BPF_MOD || opcode == BPF_DIV) &&  		    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { -			verbose("div by zero\n"); +			verbose(env, "div by zero\n");  			return -EINVAL;  		} @@ -2404,7 +2441,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;  			if (insn->imm < 0 || insn->imm >= size) { -				verbose("invalid shift %d\n", insn->imm); +				verbose(env, "invalid shift %d\n", insn->imm);  				return -EINVAL;  			}  		} @@ -2421,12 +2458,16 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  }  static void find_good_pkt_pointers(struct bpf_verifier_state *state, -				   struct bpf_reg_state *dst_reg) +				   struct bpf_reg_state *dst_reg, +				   enum bpf_reg_type type, +				   bool range_right_open)  {  	struct bpf_reg_state *regs = state->regs, *reg; +	u16 new_range;  	int i; -	if (dst_reg->off < 0) +	if (dst_reg->off < 0 || +	    (dst_reg->off == 0 && range_right_open))  		/* This doesn't give us any range */  		return; @@ -2437,9 +2478,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,  		 */  		return; -	/* LLVM can generate four kind of checks: +	new_range = dst_reg->off; +	if (range_right_open) +		new_range--; + +	/* Examples for register markings:  	 * -	 * Type 1/2: +	 * pkt_data in dst register:  	 *  	 *   r2 = r3;  	 *   r2 += 8; @@ -2456,7 +2501,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,  	 *     r2=pkt(id=n,off=8,r=0)  	 *     r3=pkt(id=n,off=0,r=0)  	 * -	 * Type 3/4: +	 * pkt_data in src register:  	 *  	 *   r2 = r3;  	 *   r2 += 8; @@ -2474,7 +2519,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,  	 *     r3=pkt(id=n,off=0,r=0)  	 *  	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) -	 * so that range of bytes [r3, r3 + 8) is safe to access. +	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8) +	 * and [r3, r3 + 8-1) respectively is safe to access depending on +	 * the check.  	 */  	/* If our ids match, then we must have the same max_value.  And we @@ -2483,16 +2530,16 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,  	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.  	 
*/  	for (i = 0; i < MAX_BPF_REG; i++) -		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) +		if (regs[i].type == type && regs[i].id == dst_reg->id)  			/* keep the maximum range already checked */ -			regs[i].range = max_t(u16, regs[i].range, dst_reg->off); +			regs[i].range = max(regs[i].range, new_range); -	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { -		if (state->stack_slot_type[i] != STACK_SPILL) +	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { +		if (state->stack[i].slot_type[0] != STACK_SPILL)  			continue; -		reg = &state->spilled_regs[i / BPF_REG_SIZE]; -		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) -			reg->range = max_t(u16, reg->range, dst_reg->off); +		reg = &state->stack[i].spilled_ptr; +		if (reg->type == type && reg->id == dst_reg->id) +			reg->range = max(reg->range, new_range);  	}  } @@ -2740,29 +2787,122 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,  	for (i = 0; i < MAX_BPF_REG; i++)  		mark_map_reg(regs, i, id, is_null); -	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { -		if (state->stack_slot_type[i] != STACK_SPILL) +	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { +		if (state->stack[i].slot_type[0] != STACK_SPILL)  			continue; -		mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); +		mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); +	} +} + +static bool try_match_pkt_pointers(const struct bpf_insn *insn, +				   struct bpf_reg_state *dst_reg, +				   struct bpf_reg_state *src_reg, +				   struct bpf_verifier_state *this_branch, +				   struct bpf_verifier_state *other_branch) +{ +	if (BPF_SRC(insn->code) != BPF_X) +		return false; + +	switch (BPF_OP(insn->code)) { +	case BPF_JGT: +		if ((dst_reg->type == PTR_TO_PACKET && +		     src_reg->type == PTR_TO_PACKET_END) || +		    (dst_reg->type == PTR_TO_PACKET_META && +		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { +			/* pkt_data' > pkt_end, pkt_meta' > pkt_data */ +			find_good_pkt_pointers(this_branch, dst_reg, +					       dst_reg->type, false); +		} else if ((dst_reg->type == PTR_TO_PACKET_END && +			    src_reg->type == PTR_TO_PACKET) || +			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && +			    src_reg->type == PTR_TO_PACKET_META)) { +			/* pkt_end > pkt_data', pkt_data > pkt_meta' */ +			find_good_pkt_pointers(other_branch, src_reg, +					       src_reg->type, true); +		} else { +			return false; +		} +		break; +	case BPF_JLT: +		if ((dst_reg->type == PTR_TO_PACKET && +		     src_reg->type == PTR_TO_PACKET_END) || +		    (dst_reg->type == PTR_TO_PACKET_META && +		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { +			/* pkt_data' < pkt_end, pkt_meta' < pkt_data */ +			find_good_pkt_pointers(other_branch, dst_reg, +					       dst_reg->type, true); +		} else if ((dst_reg->type == PTR_TO_PACKET_END && +			    src_reg->type == PTR_TO_PACKET) || +			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && +			    src_reg->type == PTR_TO_PACKET_META)) { +			/* pkt_end < pkt_data', pkt_data > pkt_meta' */ +			find_good_pkt_pointers(this_branch, src_reg, +					       src_reg->type, false); +		} else { +			return false; +		} +		break; +	case BPF_JGE: +		if ((dst_reg->type == PTR_TO_PACKET && +		     src_reg->type == PTR_TO_PACKET_END) || +		    (dst_reg->type == PTR_TO_PACKET_META && +		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { +			/* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ +			find_good_pkt_pointers(this_branch, dst_reg, +					       dst_reg->type, 
true); +		} else if ((dst_reg->type == PTR_TO_PACKET_END && +			    src_reg->type == PTR_TO_PACKET) || +			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && +			    src_reg->type == PTR_TO_PACKET_META)) { +			/* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ +			find_good_pkt_pointers(other_branch, src_reg, +					       src_reg->type, false); +		} else { +			return false; +		} +		break; +	case BPF_JLE: +		if ((dst_reg->type == PTR_TO_PACKET && +		     src_reg->type == PTR_TO_PACKET_END) || +		    (dst_reg->type == PTR_TO_PACKET_META && +		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { +			/* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ +			find_good_pkt_pointers(other_branch, dst_reg, +					       dst_reg->type, false); +		} else if ((dst_reg->type == PTR_TO_PACKET_END && +			    src_reg->type == PTR_TO_PACKET) || +			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && +			    src_reg->type == PTR_TO_PACKET_META)) { +			/* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ +			find_good_pkt_pointers(this_branch, src_reg, +					       src_reg->type, true); +		} else { +			return false; +		} +		break; +	default: +		return false;  	} + +	return true;  }  static int check_cond_jmp_op(struct bpf_verifier_env *env,  			     struct bpf_insn *insn, int *insn_idx)  { -	struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; +	struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;  	struct bpf_reg_state *regs = this_branch->regs, *dst_reg;  	u8 opcode = BPF_OP(insn->code);  	int err;  	if (opcode > BPF_JSLE) { -		verbose("invalid BPF_JMP opcode %x\n", opcode); +		verbose(env, "invalid BPF_JMP opcode %x\n", opcode);  		return -EINVAL;  	}  	if (BPF_SRC(insn->code) == BPF_X) {  		if (insn->imm != 0) { -			verbose("BPF_JMP uses reserved fields\n"); +			verbose(env, "BPF_JMP uses reserved fields\n");  			return -EINVAL;  		} @@ -2772,13 +2912,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  			return err;  		if (is_pointer_value(env, insn->src_reg)) { -			verbose("R%d pointer comparison prohibited\n", +			verbose(env, "R%d pointer comparison prohibited\n",  				insn->src_reg);  			return -EACCES;  		}  	} else {  		if (insn->src_reg != BPF_REG_0) { -			verbose("BPF_JMP uses reserved fields\n"); +			verbose(env, "BPF_JMP uses reserved fields\n");  			return -EINVAL;  		}  	} @@ -2853,28 +2993,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  		 */  		mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);  		mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); -	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && -		   dst_reg->type == PTR_TO_PACKET && -		   regs[insn->src_reg].type == PTR_TO_PACKET_END) { -		find_good_pkt_pointers(this_branch, dst_reg); -	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && -		   dst_reg->type == PTR_TO_PACKET && -		   regs[insn->src_reg].type == PTR_TO_PACKET_END) { -		find_good_pkt_pointers(other_branch, dst_reg); -	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && -		   dst_reg->type == PTR_TO_PACKET_END && -		   regs[insn->src_reg].type == PTR_TO_PACKET) { -		find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); -	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && -		   dst_reg->type == PTR_TO_PACKET_END && -		   regs[insn->src_reg].type == PTR_TO_PACKET) { -		find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); -	} else if (is_pointer_value(env, insn->dst_reg)) { -		verbose("R%d pointer comparison prohibited\n", 
insn->dst_reg); +	} else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], +					   this_branch, other_branch) && +		   is_pointer_value(env, insn->dst_reg)) { +		verbose(env, "R%d pointer comparison prohibited\n", +			insn->dst_reg);  		return -EACCES;  	} -	if (log_level) -		print_verifier_state(this_branch); +	if (env->log.level) +		print_verifier_state(env, this_branch);  	return 0;  } @@ -2889,15 +3016,15 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)  /* verify BPF_LD_IMM64 instruction */  static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)  { -	struct bpf_reg_state *regs = env->cur_state.regs; +	struct bpf_reg_state *regs = cur_regs(env);  	int err;  	if (BPF_SIZE(insn->code) != BPF_DW) { -		verbose("invalid BPF_LD_IMM insn\n"); +		verbose(env, "invalid BPF_LD_IMM insn\n");  		return -EINVAL;  	}  	if (insn->off != 0) { -		verbose("BPF_LD_IMM64 uses reserved fields\n"); +		verbose(env, "BPF_LD_IMM64 uses reserved fields\n");  		return -EINVAL;  	} @@ -2950,19 +3077,19 @@ static bool may_access_skb(enum bpf_prog_type type)   */  static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)  { -	struct bpf_reg_state *regs = env->cur_state.regs; +	struct bpf_reg_state *regs = cur_regs(env);  	u8 mode = BPF_MODE(insn->code);  	int i, err;  	if (!may_access_skb(env->prog->type)) { -		verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); +		verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");  		return -EINVAL;  	}  	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||  	    BPF_SIZE(insn->code) == BPF_DW ||  	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { -		verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); +		verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");  		return -EINVAL;  	} @@ -2972,7 +3099,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)  		return err;  	if (regs[BPF_REG_6].type != PTR_TO_CTX) { -		verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); +		verbose(env, +			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");  		return -EINVAL;  	} @@ -2985,7 +3113,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)  	/* reset caller saved regs to unreadable */  	for (i = 0; i < CALLER_SAVED_REGS; i++) { -		mark_reg_not_init(regs, caller_saved[i]); +		mark_reg_not_init(env, regs, caller_saved[i]);  		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);  	} @@ -2993,7 +3121,45 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)  	 * the value fetched from the packet.  	 * Already marked as written above.  	 
*/ -	mark_reg_unknown(regs, BPF_REG_0); +	mark_reg_unknown(env, regs, BPF_REG_0); +	return 0; +} + +static int check_return_code(struct bpf_verifier_env *env) +{ +	struct bpf_reg_state *reg; +	struct tnum range = tnum_range(0, 1); + +	switch (env->prog->type) { +	case BPF_PROG_TYPE_CGROUP_SKB: +	case BPF_PROG_TYPE_CGROUP_SOCK: +	case BPF_PROG_TYPE_SOCK_OPS: +	case BPF_PROG_TYPE_CGROUP_DEVICE: +		break; +	default: +		return 0; +	} + +	reg = cur_regs(env) + BPF_REG_0; +	if (reg->type != SCALAR_VALUE) { +		verbose(env, "At program exit the register R0 is not a known value (%s)\n", +			reg_type_str[reg->type]); +		return -EINVAL; +	} + +	if (!tnum_in(range, reg->var_off)) { +		verbose(env, "At program exit the register R0 "); +		if (!tnum_is_unknown(reg->var_off)) { +			char tn_buf[48]; + +			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +			verbose(env, "has value %s", tn_buf); +		} else { +			verbose(env, "has unknown scalar value"); +		} +		verbose(env, " should have been 0 or 1\n"); +		return -EINVAL; +	}  	return 0;  } @@ -3057,7 +3223,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)  		return 0;  	if (w < 0 || w >= env->prog->len) { -		verbose("jump out of range from insn %d to %d\n", t, w); +		verbose(env, "jump out of range from insn %d to %d\n", t, w);  		return -EINVAL;  	} @@ -3074,13 +3240,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)  		insn_stack[cur_stack++] = w;  		return 1;  	} else if ((insn_state[w] & 0xF0) == DISCOVERED) { -		verbose("back-edge from insn %d to %d\n", t, w); +		verbose(env, "back-edge from insn %d to %d\n", t, w);  		return -EINVAL;  	} else if (insn_state[w] == EXPLORED) {  		/* forward- or cross-edge */  		insn_state[t] = DISCOVERED | e;  	} else { -		verbose("insn state internal bug\n"); +		verbose(env, "insn state internal bug\n");  		return -EFAULT;  	}  	return 0; @@ -3174,7 +3340,7 @@ peek_stack:  mark_explored:  	insn_state[t] = EXPLORED;  	if (cur_stack-- <= 0) { -		verbose("pop stack internal bug\n"); +		verbose(env, "pop stack internal bug\n");  		ret = -EFAULT;  		goto err_free;  	} @@ -3183,7 +3349,7 @@ mark_explored:  check_state:  	for (i = 0; i < insn_cnt; i++) {  		if (insn_state[i] != EXPLORED) { -			verbose("unreachable insn %d\n", i); +			verbose(env, "unreachable insn %d\n", i);  			ret = -EINVAL;  			goto err_free;  		} @@ -3298,8 +3464,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,  			return false;  		/* Check our ids match any regs they're supposed to */  		return check_ids(rold->id, rcur->id, idmap); +	case PTR_TO_PACKET_META:  	case PTR_TO_PACKET: -		if (rcur->type != PTR_TO_PACKET) +		if (rcur->type != rold->type)  			return false;  		/* We must have at least as much range as the old ptr  		 * did, so that any accesses which were safe before are @@ -3337,6 +3504,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,  	return false;  } +static bool stacksafe(struct bpf_verifier_state *old, +		      struct bpf_verifier_state *cur, +		      struct idpair *idmap) +{ +	int i, spi; + +	/* if explored stack has more populated slots than current stack +	 * such stacks are not equivalent +	 */ +	if (old->allocated_stack > cur->allocated_stack) +		return false; + +	/* walk slots of the explored stack and ignore any additional +	 * slots in the current stack, since explored(safe) state +	 * didn't use them +	 */ +	for (i = 0; i < old->allocated_stack; i++) { +		spi = i / BPF_REG_SIZE; + +		if 
(old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) +			continue; +		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != +		    cur->stack[spi].slot_type[i % BPF_REG_SIZE]) +			/* Ex: old explored (safe) state has STACK_SPILL in +			 * this stack slot, but current has has STACK_MISC -> +			 * this verifier states are not equivalent, +			 * return false to continue verification of this path +			 */ +			return false; +		if (i % BPF_REG_SIZE) +			continue; +		if (old->stack[spi].slot_type[0] != STACK_SPILL) +			continue; +		if (!regsafe(&old->stack[spi].spilled_ptr, +			     &cur->stack[spi].spilled_ptr, +			     idmap)) +			/* when explored and current stack slot are both storing +			 * spilled registers, check that stored pointers types +			 * are the same as well. +			 * Ex: explored safe path could have stored +			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} +			 * but current path has stored: +			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} +			 * such verifier states are not equivalent. +			 * return false to continue verification of this path +			 */ +			return false; +	} +	return true; +} +  /* compare two verifier states   *   * all states stored in state_list are known to be valid, since @@ -3381,37 +3599,8 @@ static bool states_equal(struct bpf_verifier_env *env,  			goto out_free;  	} -	for (i = 0; i < MAX_BPF_STACK; i++) { -		if (old->stack_slot_type[i] == STACK_INVALID) -			continue; -		if (old->stack_slot_type[i] != cur->stack_slot_type[i]) -			/* Ex: old explored (safe) state has STACK_SPILL in -			 * this stack slot, but current has has STACK_MISC -> -			 * this verifier states are not equivalent, -			 * return false to continue verification of this path -			 */ -			goto out_free; -		if (i % BPF_REG_SIZE) -			continue; -		if (old->stack_slot_type[i] != STACK_SPILL) -			continue; -		if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], -			     &cur->spilled_regs[i / BPF_REG_SIZE], -			     idmap)) -			/* when explored and current stack slot are both storing -			 * spilled registers, check that stored pointers types -			 * are the same as well. -			 * Ex: explored safe path could have stored -			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} -			 * but current path has stored: -			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} -			 * such verifier states are not equivalent. -			 * return false to continue verification of this path -			 */ -			goto out_free; -		else -			continue; -	} +	if (!stacksafe(old, cur, idmap)) +		goto out_free;  	ret = true;  out_free:  	kfree(idmap); @@ -3447,17 +3636,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state,  		}  	}  	/* ... 
and stack slots */ -	for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { -		if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) +	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && +		    i < parent->allocated_stack / BPF_REG_SIZE; i++) { +		if (parent->stack[i].slot_type[0] != STACK_SPILL)  			continue; -		if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) +		if (state->stack[i].slot_type[0] != STACK_SPILL)  			continue; -		if (parent->spilled_regs[i].live & REG_LIVE_READ) +		if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)  			continue; -		if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN)) +		if (writes && +		    (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))  			continue; -		if (state->spilled_regs[i].live & REG_LIVE_READ) { -			parent->spilled_regs[i].live |= REG_LIVE_READ; +		if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { +			parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;  			touched = true;  		}  	} @@ -3487,7 +3678,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  {  	struct bpf_verifier_state_list *new_sl;  	struct bpf_verifier_state_list *sl; -	int i; +	struct bpf_verifier_state *cur = env->cur_state; +	int i, err;  	sl = env->explored_states[insn_idx];  	if (!sl) @@ -3497,7 +3689,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  		return 0;  	while (sl != STATE_LIST_MARK) { -		if (states_equal(env, &sl->state, &env->cur_state)) { +		if (states_equal(env, &sl->state, cur)) {  			/* reached equivalent register/stack state,  			 * prune the search.  			 * Registers read by the continuation are read by us. @@ -3508,7 +3700,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  			 * they'll be immediately forgotten as we're pruning  			 * this state and will pop a new one.  			 */ -			propagate_liveness(&sl->state, &env->cur_state); +			propagate_liveness(&sl->state, cur);  			return 1;  		}  		sl = sl->next; @@ -3520,16 +3712,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  	 * it will be rejected. Since there are no loops, we won't be  	 * seeing this 'insn_idx' instruction again on the way to bpf_exit  	 */ -	new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); +	new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);  	if (!new_sl)  		return -ENOMEM;  	/* add new state to the head of linked list */ -	memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); +	err = copy_verifier_state(&new_sl->state, cur); +	if (err) { +		free_verifier_state(&new_sl->state, false); +		kfree(new_sl); +		return err; +	}  	new_sl->next = env->explored_states[insn_idx];  	env->explored_states[insn_idx] = new_sl;  	/* connect new state to parentage chain */ -	env->cur_state.parent = &new_sl->state; +	cur->parent = &new_sl->state;  	/* clear write marks in current state: the writes we did are not writes  	 * our child did, so they don't screen off its reads from us.  	 * (There are no read marks in current state, because reads always mark @@ -3537,33 +3734,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  	 * explored_states can get read marks.)  	 
*/  	for (i = 0; i < BPF_REG_FP; i++) -		env->cur_state.regs[i].live = REG_LIVE_NONE; -	for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) -		if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) -			env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; +		cur->regs[i].live = REG_LIVE_NONE; +	for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) +		if (cur->stack[i].slot_type[0] == STACK_SPILL) +			cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;  	return 0;  }  static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,  				  int insn_idx, int prev_insn_idx)  { -	if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) -		return 0; +	if (env->dev_ops && env->dev_ops->insn_hook) +		return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); -	return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); +	return 0;  }  static int do_check(struct bpf_verifier_env *env)  { -	struct bpf_verifier_state *state = &env->cur_state; +	struct bpf_verifier_state *state;  	struct bpf_insn *insns = env->prog->insnsi; -	struct bpf_reg_state *regs = state->regs; +	struct bpf_reg_state *regs;  	int insn_cnt = env->prog->len;  	int insn_idx, prev_insn_idx = 0;  	int insn_processed = 0;  	bool do_print_state = false; -	init_reg_state(regs); +	state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); +	if (!state) +		return -ENOMEM; +	env->cur_state = state; +	init_reg_state(env, state->regs);  	state->parent = NULL;  	insn_idx = 0;  	for (;;) { @@ -3572,7 +3773,7 @@ static int do_check(struct bpf_verifier_env *env)  		int err;  		if (insn_idx >= insn_cnt) { -			verbose("invalid insn idx %d insn_cnt %d\n", +			verbose(env, "invalid insn idx %d insn_cnt %d\n",  				insn_idx, insn_cnt);  			return -EFAULT;  		} @@ -3581,7 +3782,8 @@ static int do_check(struct bpf_verifier_env *env)  		class = BPF_CLASS(insn->code);  		if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { -			verbose("BPF program is too large. Processed %d insn\n", +			verbose(env, +				"BPF program is too large. 
Processed %d insn\n",  				insn_processed);  			return -E2BIG;  		} @@ -3591,12 +3793,12 @@ static int do_check(struct bpf_verifier_env *env)  			return err;  		if (err == 1) {  			/* found equivalent state, can prune the search */ -			if (log_level) { +			if (env->log.level) {  				if (do_print_state) -					verbose("\nfrom %d to %d: safe\n", +					verbose(env, "\nfrom %d to %d: safe\n",  						prev_insn_idx, insn_idx);  				else -					verbose("%d: safe\n", insn_idx); +					verbose(env, "%d: safe\n", insn_idx);  			}  			goto process_bpf_exit;  		} @@ -3604,25 +3806,28 @@ static int do_check(struct bpf_verifier_env *env)  		if (need_resched())  			cond_resched(); -		if (log_level > 1 || (log_level && do_print_state)) { -			if (log_level > 1) -				verbose("%d:", insn_idx); +		if (env->log.level > 1 || (env->log.level && do_print_state)) { +			if (env->log.level > 1) +				verbose(env, "%d:", insn_idx);  			else -				verbose("\nfrom %d to %d:", +				verbose(env, "\nfrom %d to %d:",  					prev_insn_idx, insn_idx); -			print_verifier_state(&env->cur_state); +			print_verifier_state(env, state);  			do_print_state = false;  		} -		if (log_level) { -			verbose("%d: ", insn_idx); -			print_bpf_insn(env, insn); +		if (env->log.level) { +			verbose(env, "%d: ", insn_idx); +			print_bpf_insn(verbose, env, insn, +				       env->allow_ptr_leaks);  		}  		err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);  		if (err)  			return err; +		regs = cur_regs(env); +		env->insn_aux_data[insn_idx].seen = true;  		if (class == BPF_ALU || class == BPF_ALU64) {  			err = check_alu_op(env, insn);  			if (err) @@ -3672,7 +3877,7 @@ static int do_check(struct bpf_verifier_env *env)  				 * src_reg == stack|map in some other branch.  				 * Reject it.  				 */ -				verbose("same insn cannot be used with different pointers\n"); +				verbose(env, "same insn cannot be used with different pointers\n");  				return -EINVAL;  			} @@ -3712,14 +3917,14 @@ static int do_check(struct bpf_verifier_env *env)  			} else if (dst_reg_type != *prev_dst_type &&  				   (dst_reg_type == PTR_TO_CTX ||  				    *prev_dst_type == PTR_TO_CTX)) { -				verbose("same insn cannot be used with different pointers\n"); +				verbose(env, "same insn cannot be used with different pointers\n");  				return -EINVAL;  			}  		} else if (class == BPF_ST) {  			if (BPF_MODE(insn->code) != BPF_MEM ||  			    insn->src_reg != BPF_REG_0) { -				verbose("BPF_ST uses reserved fields\n"); +				verbose(env, "BPF_ST uses reserved fields\n");  				return -EINVAL;  			}  			/* check src operand */ @@ -3742,7 +3947,7 @@ static int do_check(struct bpf_verifier_env *env)  				    insn->off != 0 ||  				    insn->src_reg != BPF_REG_0 ||  				    insn->dst_reg != BPF_REG_0) { -					verbose("BPF_CALL uses reserved fields\n"); +					verbose(env, "BPF_CALL uses reserved fields\n");  					return -EINVAL;  				} @@ -3755,7 +3960,7 @@ static int do_check(struct bpf_verifier_env *env)  				    insn->imm != 0 ||  				    insn->src_reg != BPF_REG_0 ||  				    insn->dst_reg != BPF_REG_0) { -					verbose("BPF_JA uses reserved fields\n"); +					verbose(env, "BPF_JA uses reserved fields\n");  					return -EINVAL;  				} @@ -3767,7 +3972,7 @@ static int do_check(struct bpf_verifier_env *env)  				    insn->imm != 0 ||  				    insn->src_reg != BPF_REG_0 ||  				    insn->dst_reg != BPF_REG_0) { -					verbose("BPF_EXIT uses reserved fields\n"); +					verbose(env, "BPF_EXIT uses reserved fields\n");  					return -EINVAL;  				} @@ -3782,13 +3987,18 @@ static int 
do_check(struct bpf_verifier_env *env)  					return err;  				if (is_pointer_value(env, BPF_REG_0)) { -					verbose("R0 leaks addr as return value\n"); +					verbose(env, "R0 leaks addr as return value\n");  					return -EACCES;  				} +				err = check_return_code(env); +				if (err) +					return err;  process_bpf_exit: -				insn_idx = pop_stack(env, &prev_insn_idx); -				if (insn_idx < 0) { +				err = pop_stack(env, &prev_insn_idx, &insn_idx); +				if (err < 0) { +					if (err != -ENOENT) +						return err;  					break;  				} else {  					do_print_state = true; @@ -3813,20 +4023,21 @@ process_bpf_exit:  					return err;  				insn_idx++; +				env->insn_aux_data[insn_idx].seen = true;  			} else { -				verbose("invalid BPF_LD mode\n"); +				verbose(env, "invalid BPF_LD mode\n");  				return -EINVAL;  			}  		} else { -			verbose("unknown insn class %d\n", class); +			verbose(env, "unknown insn class %d\n", class);  			return -EINVAL;  		}  		insn_idx++;  	} -	verbose("processed %d insns, stack depth %d\n", -		insn_processed, env->prog->aux->stack_depth); +	verbose(env, "processed %d insns, stack depth %d\n", insn_processed, +		env->prog->aux->stack_depth);  	return 0;  } @@ -3838,7 +4049,8 @@ static int check_map_prealloc(struct bpf_map *map)  		!(map->map_flags & BPF_F_NO_PREALLOC);  } -static int check_map_prog_compatibility(struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_verifier_env *env, +					struct bpf_map *map,  					struct bpf_prog *prog)  { @@ -3849,12 +4061,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,  	 */  	if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {  		if (!check_map_prealloc(map)) { -			verbose("perf_event programs can only use preallocated hash map\n"); +			verbose(env, "perf_event programs can only use preallocated hash map\n");  			return -EINVAL;  		}  		if (map->inner_map_meta &&  		    !check_map_prealloc(map->inner_map_meta)) { -			verbose("perf_event programs can only use preallocated inner hash map\n"); +			verbose(env, "perf_event programs can only use preallocated inner hash map\n");  			return -EINVAL;  		}  	} @@ -3877,14 +4089,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)  	for (i = 0; i < insn_cnt; i++, insn++) {  		if (BPF_CLASS(insn->code) == BPF_LDX &&  		    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { -			verbose("BPF_LDX uses reserved fields\n"); +			verbose(env, "BPF_LDX uses reserved fields\n");  			return -EINVAL;  		}  		if (BPF_CLASS(insn->code) == BPF_STX &&  		    ((BPF_MODE(insn->code) != BPF_MEM &&  		      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { -			verbose("BPF_STX uses reserved fields\n"); +			verbose(env, "BPF_STX uses reserved fields\n");  			return -EINVAL;  		} @@ -3895,7 +4107,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)  			if (i == insn_cnt - 1 || insn[1].code != 0 ||  			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||  			    insn[1].off != 0) { -				verbose("invalid bpf_ld_imm64 insn\n"); +				verbose(env, "invalid bpf_ld_imm64 insn\n");  				return -EINVAL;  			} @@ -3904,19 +4116,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)  				goto next_insn;  			if (insn->src_reg != BPF_PSEUDO_MAP_FD) { -				verbose("unrecognized bpf_ld_imm64 insn\n"); +				verbose(env, +					"unrecognized bpf_ld_imm64 insn\n");  				return -EINVAL;  			}  			f = fdget(insn->imm);  			map = __bpf_map_get(f);  			if (IS_ERR(map)) { -				verbose("fd %d is not pointing to valid bpf_map\n", +				
verbose(env, "fd %d is not pointing to valid bpf_map\n",  					insn->imm);  				return PTR_ERR(map);  			} -			err = check_map_prog_compatibility(map, env->prog); +			err = check_map_prog_compatibility(env, map, env->prog);  			if (err) {  				fdput(f);  				return err; @@ -3993,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,  				u32 off, u32 cnt)  {  	struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; +	int i;  	if (cnt == 1)  		return 0; @@ -4002,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,  	memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);  	memcpy(new_data + off + cnt - 1, old_data + off,  	       sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); +	for (i = off; i < off + cnt - 1; i++) +		new_data[i].seen = true;  	env->insn_aux_data = new_data;  	vfree(old_data);  	return 0; @@ -4020,12 +4236,31 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of  	return new_prog;  } +/* The verifier does more data flow analysis than llvm and will not explore + * branches that are dead at run time. Malicious programs can have dead code + * too. Therefore replace all dead at-run-time code with nops. + */ +static void sanitize_dead_code(struct bpf_verifier_env *env) +{ +	struct bpf_insn_aux_data *aux_data = env->insn_aux_data; +	struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); +	struct bpf_insn *insn = env->prog->insnsi; +	const int insn_cnt = env->prog->len; +	int i; + +	for (i = 0; i < insn_cnt; i++) { +		if (aux_data[i].seen) +			continue; +		memcpy(insn + i, &nop, sizeof(nop)); +	} +} +  /* convert load instructions that access fields of 'struct __sk_buff'   * into sequence of instructions that access fields of 'struct sk_buff'   */  static int convert_ctx_accesses(struct bpf_verifier_env *env)  { -	const struct bpf_verifier_ops *ops = env->prog->aux->ops; +	const struct bpf_verifier_ops *ops = env->ops;  	int i, cnt, size, ctx_field_size, delta = 0;  	const int insn_cnt = env->prog->len;  	struct bpf_insn insn_buf[16], *insn; @@ -4038,7 +4273,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,  					env->prog);  		if (cnt >= ARRAY_SIZE(insn_buf)) { -			verbose("bpf verifier is misconfigured\n"); +			verbose(env, "bpf verifier is misconfigured\n");  			return -EINVAL;  		} else if (cnt) {  			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -4086,7 +4321,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  			u8 size_code;  			if (type == BPF_WRITE) { -				verbose("bpf verifier narrow ctx access misconfigured\n"); +				verbose(env, "bpf verifier narrow ctx access misconfigured\n");  				return -EINVAL;  			} @@ -4105,7 +4340,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  					      &target_size);  		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||  		    (ctx_field_size && !target_size)) { -			verbose("bpf verifier is misconfigured\n"); +			verbose(env, "bpf verifier is misconfigured\n");  			return -EINVAL;  		} @@ -4187,7 +4422,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);  			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { -				verbose("bpf verifier is misconfigured\n"); +				verbose(env, "bpf verifier is misconfigured\n");  				return -EINVAL;  			} @@ -4226,12 +4461,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  	
		insn      = new_prog->insnsi + i + delta;  		}  patch_call_imm: -		fn = prog->aux->ops->get_func_proto(insn->imm); +		fn = env->ops->get_func_proto(insn->imm);  		/* all functions that have prototype and verifier allowed  		 * programs to call them, must be real in-kernel functions  		 */  		if (!fn->func) { -			verbose("kernel subsystem misconfigured func %s#%d\n", +			verbose(env, +				"kernel subsystem misconfigured func %s#%d\n",  				func_id_name(insn->imm), insn->imm);  			return -EFAULT;  		} @@ -4255,6 +4491,7 @@ static void free_states(struct bpf_verifier_env *env)  		if (sl)  			while (sl != STATE_LIST_MARK) {  				sln = sl->next; +				free_verifier_state(&sl->state, false);  				kfree(sl);  				sl = sln;  			} @@ -4265,16 +4502,21 @@ static void free_states(struct bpf_verifier_env *env)  int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)  { -	char __user *log_ubuf = NULL;  	struct bpf_verifier_env *env; +	struct bpf_verifer_log *log;  	int ret = -EINVAL; +	/* no program is valid */ +	if (ARRAY_SIZE(bpf_verifier_ops) == 0) +		return -EINVAL; +  	/* 'struct bpf_verifier_env' can be global, but since it's not small,  	 * allocate/free it every time bpf_check() is called  	 */  	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);  	if (!env)  		return -ENOMEM; +	log = &env->log;  	env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *  				     (*prog)->len); @@ -4282,6 +4524,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)  	if (!env->insn_aux_data)  		goto err_free_env;  	env->prog = *prog; +	env->ops = bpf_verifier_ops[env->prog->type];  	/* grab the mutex to protect few globals used by verifier */  	mutex_lock(&bpf_verifier_lock); @@ -4290,29 +4533,27 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)  		/* user requested verbose verifier output  		 * and supplied buffer to store the verification trace  		 */ -		log_level = attr->log_level; -		log_ubuf = (char __user *) (unsigned long) attr->log_buf; -		log_size = attr->log_size; -		log_len = 0; +		log->level = attr->log_level; +		log->ubuf = (char __user *) (unsigned long) attr->log_buf; +		log->len_total = attr->log_size;  		ret = -EINVAL; -		/* log_* values have to be sane */ -		if (log_size < 128 || log_size > UINT_MAX >> 8 || -		    log_level == 0 || log_ubuf == NULL) -			goto err_unlock; - -		ret = -ENOMEM; -		log_buf = vmalloc(log_size); -		if (!log_buf) +		/* log attributes have to be sane */ +		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || +		    !log->level || !log->ubuf)  			goto err_unlock; -	} else { -		log_level = 0;  	}  	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);  	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))  		env->strict_alignment = true; +	if (env->prog->aux->offload) { +		ret = bpf_prog_offload_verifier_prep(env); +		if (ret) +			goto err_unlock; +	} +  	ret = replace_map_fd_with_map_ptr(env);  	if (ret < 0)  		goto skip_full_check; @@ -4331,29 +4572,30 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)  	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);  	ret = do_check(env); +	if (env->cur_state) { +		free_verifier_state(env->cur_state, true); +		env->cur_state = NULL; +	}  skip_full_check: -	while (pop_stack(env, NULL) >= 0); +	while (!pop_stack(env, NULL, NULL));  	free_states(env);  	if (ret == 0) +		sanitize_dead_code(env); + +	if (ret == 0)  		/* program is valid, convert *(u32*)(ctx + off) accesses */  		ret = convert_ctx_accesses(env);  	if (ret == 0)  		ret = 
fixup_bpf_calls(env); -	if (log_level && log_len >= log_size - 1) { -		BUG_ON(log_len >= log_size); -		/* verifier log exceeded user supplied buffer */ +	if (log->level && bpf_verifier_log_full(log))  		ret = -ENOSPC; -		/* fall through to return what was recorded */ -	} - -	/* copy verifier log back to user space including trailing zero */ -	if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { +	if (log->level && !log->ubuf) {  		ret = -EFAULT; -		goto free_log_buf; +		goto err_release_maps;  	}  	if (ret == 0 && env->used_map_cnt) { @@ -4364,7 +4606,7 @@ skip_full_check:  		if (!env->prog->aux->used_maps) {  			ret = -ENOMEM; -			goto free_log_buf; +			goto err_release_maps;  		}  		memcpy(env->prog->aux->used_maps, env->used_maps, @@ -4377,9 +4619,7 @@ skip_full_check:  		convert_pseudo_ld_imm64(env);  	} -free_log_buf: -	if (log_level) -		vfree(log_buf); +err_release_maps:  	if (!env->prog->aux->used_maps)  		/* if we didn't copy map pointers into bpf_prog_info, release  		 * them now. Otherwise free_bpf_prog_info() will release them. @@ -4393,58 +4633,3 @@ err_free_env:  	kfree(env);  	return ret;  } - -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, -		 void *priv) -{ -	struct bpf_verifier_env *env; -	int ret; - -	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); -	if (!env) -		return -ENOMEM; - -	env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * -				     prog->len); -	ret = -ENOMEM; -	if (!env->insn_aux_data) -		goto err_free_env; -	env->prog = prog; -	env->analyzer_ops = ops; -	env->analyzer_priv = priv; - -	/* grab the mutex to protect few globals used by verifier */ -	mutex_lock(&bpf_verifier_lock); - -	log_level = 0; - -	env->strict_alignment = false; -	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) -		env->strict_alignment = true; - -	env->explored_states = kcalloc(env->prog->len, -				       sizeof(struct bpf_verifier_state_list *), -				       GFP_KERNEL); -	ret = -ENOMEM; -	if (!env->explored_states) -		goto skip_full_check; - -	ret = check_cfg(env); -	if (ret < 0) -		goto skip_full_check; - -	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - -	ret = do_check(env); - -skip_full_check: -	while (pop_stack(env, NULL) >= 0); -	free_states(env); - -	mutex_unlock(&bpf_verifier_lock); -	vfree(env->insn_aux_data); -err_free_env: -	kfree(env); -	return ret; -} -EXPORT_SYMBOL_GPL(bpf_analyzer);  | 
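
The new check_return_code() in this patch uses tnum_range(0, 1) and tnum_in() to prove that R0 holds either 0 or 1 at exit for the program types listed in its switch (cgroup skb/sock, sock_ops, cgroup device). The standalone C sketch below models that containment test only; the toy_tnum type and helpers are invented for illustration and are not the kernel's tnum API.

/* Toy model of the "R0 should have been 0 or 1" check.  A (value, mask)
 * pair describes a partially known 64-bit number: bits set in mask are
 * unknown, the remaining bits are fixed to the corresponding bit in value.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_tnum {
	uint64_t value;	/* values of the known bits */
	uint64_t mask;	/* bits whose value is unknown */
};

/* true only if every concrete value described by reg also lies in range */
static bool toy_tnum_in(struct toy_tnum range, struct toy_tnum reg)
{
	if (reg.mask & ~range.mask)	/* unknown bits outside the range */
		return false;
	reg.value &= ~range.mask;	/* ignore bits the range leaves free */
	return range.value == reg.value;
}

int main(void)
{
	struct toy_tnum range01     = { .value = 0, .mask = 1 };	/* the range [0, 1] */
	struct toy_tnum known_one   = { .value = 1, .mask = 0 };	/* R0 proven == 1   */
	struct toy_tnum up_to_three = { .value = 0, .mask = 3 };	/* R0 in [0, 3]     */

	printf("R0 == 1     -> %s\n", toy_tnum_in(range01, known_one) ? "accept" : "reject");
	printf("R0 in [0,3] -> %s\n", toy_tnum_in(range01, up_to_three) ? "accept" : "reject");
	return 0;
}

A register whose tracked range is wider than [0, 1] fails the test, which is when the verifier prints "At program exit the register R0 ... should have been 0 or 1".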

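bpf_check() now also carries the verification trace in env->log (struct bpf_verifer_log) instead of the old log_* globals, while keeping the same sanity rules: a non-zero log_level requires a user buffer and a log_size between 128 bytes and UINT_MAX >> 8. A minimal, hypothetical loader exercising those attributes might look as follows; it is not part of this patch, and the raw opcodes simply encode "r0 = 1; exit".

/* Hypothetical loader: submits a two-instruction BPF_PROG_TYPE_CGROUP_SKB
 * program and asks the verifier for its trace.  Typically needs
 * CAP_SYS_ADMIN and a kernel that provides the bpf(2) syscall.
 */
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct bpf_insn prog[] = {
		{ .code = 0xb7, .dst_reg = 0, .imm = 1 },	/* BPF_ALU64|BPF_MOV|BPF_K: r0 = 1 */
		{ .code = 0x95 },				/* BPF_JMP|BPF_EXIT: exit */
	};
	char log[4096] = "";
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_CGROUP_SKB;	/* subject to check_return_code() */
	attr.insns     = (unsigned long)prog;
	attr.insn_cnt  = sizeof(prog) / sizeof(prog[0]);
	attr.license   = (unsigned long)"GPL";
	attr.log_level = 1;				/* request the verifier trace */
	attr.log_buf   = (unsigned long)log;
	attr.log_size  = sizeof(log);			/* must be at least 128 bytes */

	fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	if (fd < 0) {
		fprintf(stderr, "load failed, verifier log:\n%s\n", log);
		return 1;
	}
	puts("program accepted");
	close(fd);
	return 0;
}

Because the program's only exit path leaves a constant 1 in R0, it falls inside tnum_range(0, 1) and passes the new return-code check; changing the immediate to, say, 2 should make the verifier reject it with the "should have been 0 or 1" message.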
