diff options
-rw-r--r-- | include/linux/cgroup.h | 1 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 12 | ||||
-rw-r--r-- | kernel/bpf/arraymap.c | 43 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 3 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 8 | ||||
-rw-r--r-- | kernel/cgroup.c | 35 | ||||
-rw-r--r-- | net/core/filter.c | 38 | ||||
-rw-r--r-- | samples/bpf/Makefile | 3 | ||||
-rw-r--r-- | samples/bpf/bpf_helpers.h | 2 | ||||
-rw-r--r-- | samples/bpf/test_cgrp2_array_pin.c | 109 | ||||
-rwxr-xr-x | samples/bpf/test_cgrp2_tc.sh | 184 | ||||
-rw-r--r-- | samples/bpf/test_cgrp2_tc_kern.c | 69 |
12 files changed, 506 insertions, 1 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a20320c666fd..984f73b719a9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -87,6 +87,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); +struct cgroup *cgroup_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index be6ac1291680..f44504d875e2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,6 +84,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_STACK_TRACE, + BPF_MAP_TYPE_CGROUP_ARRAY, }; enum bpf_prog_type { @@ -336,6 +337,17 @@ enum bpf_func_id { */ BPF_FUNC_skb_change_type, + /** + * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb + * @skb: pointer to skb + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 skb failed the cgroup2 descendant test + * == 1 skb succeeded the cgroup2 descendant test + * < 0 error + */ + BPF_FUNC_skb_in_cgroup, __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 4ec57a649b1f..db1a743e3db2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -537,3 +537,46 @@ static int __init register_perf_event_array_map(void) return 0; } late_initcall(register_perf_event_array_map); + +#ifdef CONFIG_SOCK_CGROUP_DATA +static void *cgroup_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int fd) +{ + return cgroup_get_from_fd(fd); +} + +static void cgroup_fd_array_put_ptr(void *ptr) +{ + /* cgroup_put free cgrp after a rcu grace period */ + cgroup_put(ptr); +} + +static void cgroup_fd_array_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static const struct bpf_map_ops cgroup_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = cgroup_fd_array_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = cgroup_fd_array_get_ptr, + .map_fd_put_ptr = cgroup_fd_array_put_ptr, +}; + +static struct bpf_map_type_list cgroup_array_type __read_mostly = { + .ops = &cgroup_array_ops, + .type = BPF_MAP_TYPE_CGROUP_ARRAY, +}; + +static int __init register_cgroup_array_map(void) +{ + bpf_register_map_type(&cgroup_array_type); + return 0; +} +late_initcall(register_cgroup_array_map); +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 22863d9872b1..96d938a22050 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -393,7 +393,8 @@ static int map_update_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eec9f90ba030..e206c2181412 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1035,6 +1035,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (func_id != BPF_FUNC_get_stackid) goto error; break; + case BPF_MAP_TYPE_CGROUP_ARRAY: + if (func_id != BPF_FUNC_skb_in_cgroup) + goto error; + break; default: break; } @@ -1054,6 +1058,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_skb_in_cgroup: + if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) + goto error; + break; default: break; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75c0ff00aca6..50787cd61da2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -62,6 +62,7 @@ #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> +#include <linux/file.h> #include <net/sock.h> /* @@ -6209,6 +6210,40 @@ struct cgroup *cgroup_get_from_path(const char *path) } EXPORT_SYMBOL_GPL(cgroup_get_from_path); +/** + * cgroup_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup2_dir) + * + * Find the cgroup from a fd which should be obtained + * by opening a cgroup directory. Returns a pointer to the + * cgroup on success. ERR_PTR is returned if the cgroup + * cannot be found. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + struct file *f; + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + fput(f); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd); + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. diff --git a/net/core/filter.c b/net/core/filter.c index 76fee35da244..54071cf70fb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2239,6 +2239,40 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } +#ifdef CONFIG_SOCK_CGROUP_DATA +static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(long)r1; + struct bpf_map *map = (struct bpf_map *)(long)r2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + struct sock *sk; + u32 i = (u32)r3; + + sk = skb->sk; + if (!sk || !sk_fullsock(sk)) + return -ENOENT; + + if (unlikely(i >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[i]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); +} + +static const struct bpf_func_proto bpf_skb_in_cgroup_proto = { + .func = bpf_skb_in_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; +#endif + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -2307,6 +2341,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return bpf_get_event_output_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_in_cgroup: + return &bpf_skb_in_cgroup_proto; +#endif default: return sk_filter_func_proto(func_id); } diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0bf2478cb7df..a98b780e974c 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -20,6 +20,7 @@ hostprogs-y += offwaketime hostprogs-y += spintest hostprogs-y += map_perf_test hostprogs-y += test_overhead +hostprogs-y += test_cgrp2_array_pin test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o @@ -40,6 +41,7 @@ offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o spintest-objs := bpf_load.o libbpf.o spintest_user.o map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o +test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -61,6 +63,7 @@ always += map_perf_test_kern.o always += test_overhead_tp_kern.o always += test_overhead_kprobe_kern.o always += parse_varlen.o parse_simple.o parse_ldabs.o +always += test_cgrp2_tc_kern.o HOSTCFLAGS += -I$(objtree)/usr/include diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index 7904a2a493de..84e3fd919a06 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -70,6 +70,8 @@ static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flag (void *) BPF_FUNC_l3_csum_replace; static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = (void *) BPF_FUNC_l4_csum_replace; +static int (*bpf_skb_in_cgroup)(void *ctx, void *map, int index) = + (void *) BPF_FUNC_skb_in_cgroup; #if defined(__x86_64__) diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c new file mode 100644 index 000000000000..70e86f7be69d --- /dev/null +++ b/samples/bpf/test_cgrp2_array_pin.c @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/unistd.h> +#include <linux/bpf.h> + +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <fcntl.h> + +#include "libbpf.h" + +static void usage(void) +{ + printf("Usage: test_cgrp2_array_pin [...]\n"); + printf(" -F <file> File to pin an BPF cgroup array\n"); + printf(" -U <file> Update an already pinned BPF cgroup array\n"); + printf(" -v <value> Full path of the cgroup2\n"); + printf(" -h Display this help\n"); +} + +int main(int argc, char **argv) +{ + const char *pinned_file = NULL, *cg2 = NULL; + int create_array = 1; + int array_key = 0; + int array_fd = -1; + int cg2_fd = -1; + int ret = -1; + int opt; + + while ((opt = getopt(argc, argv, "F:U:v:")) != -1) { + switch (opt) { + /* General args */ + case 'F': + pinned_file = optarg; + break; + case 'U': + pinned_file = optarg; + create_array = 0; + break; + case 'v': + cg2 = optarg; + break; + default: + usage(); + goto out; + } + } + + if (!cg2 || !pinned_file) { + usage(); + goto out; + } + + cg2_fd = open(cg2, O_RDONLY); + if (cg2_fd < 0) { + fprintf(stderr, "open(%s,...): %s(%d)\n", + cg2, strerror(errno), errno); + goto out; + } + + if (create_array) { + array_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY, + sizeof(uint32_t), sizeof(uint32_t), + 1, 0); + if (array_fd < 0) { + fprintf(stderr, + "bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,...): %s(%d)\n", + strerror(errno), errno); + goto out; + } + } else { + array_fd = bpf_obj_get(pinned_file); + if (array_fd < 0) { + fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", + pinned_file, strerror(errno), errno); + goto out; + } + } + + ret = bpf_update_elem(array_fd, &array_key, &cg2_fd, 0); + if (ret) { + perror("bpf_update_elem"); + goto out; + } + + if (create_array) { + ret = bpf_obj_pin(array_fd, pinned_file); + if (ret) { + fprintf(stderr, "bpf_obj_pin(..., %s): %s(%d)\n", + pinned_file, strerror(errno), errno); + goto out; + } + } + +out: + if (array_fd != -1) + close(array_fd); + if (cg2_fd != -1) + close(cg2_fd); + return ret; +} diff --git a/samples/bpf/test_cgrp2_tc.sh b/samples/bpf/test_cgrp2_tc.sh new file mode 100755 index 000000000000..0b119eeaf85c --- /dev/null +++ b/samples/bpf/test_cgrp2_tc.sh @@ -0,0 +1,184 @@ +#!/bin/bash + +MY_DIR=$(dirname $0) +# Details on the bpf prog +BPF_CGRP2_ARRAY_NAME='test_cgrp2_array_pin' +BPF_PROG="$MY_DIR/test_cgrp2_tc_kern.o" +BPF_SECTION='filter' + +[ -z "$TC" ] && TC='tc' +[ -z "$IP" ] && IP='ip' + +# Names of the veth interface, net namespace...etc. +HOST_IFC='ve' +NS_IFC='vens' +NS='ns' + +find_mnt() { + cat /proc/mounts | \ + awk '{ if ($3 == "'$1'" && mnt == "") { mnt = $2 }} END { print mnt }' +} + +# Init cgroup2 vars +init_cgrp2_vars() { + CGRP2_ROOT=$(find_mnt cgroup2) + if [ -z "$CGRP2_ROOT" ] + then + CGRP2_ROOT='/mnt/cgroup2' + MOUNT_CGRP2="yes" + fi + CGRP2_TC="$CGRP2_ROOT/tc" + CGRP2_TC_LEAF="$CGRP2_TC/leaf" +} + +# Init bpf fs vars +init_bpf_fs_vars() { + local bpf_fs_root=$(find_mnt bpf) + [ -n "$bpf_fs_root" ] || return -1 + BPF_FS_TC_SHARE="$bpf_fs_root/tc/globals" +} + +setup_cgrp2() { + case $1 in + start) + if [ "$MOUNT_CGRP2" == 'yes' ] + then + [ -d $CGRP2_ROOT ] || mkdir -p $CGRP2_ROOT + mount -t cgroup2 none $CGRP2_ROOT || return $? + fi + mkdir -p $CGRP2_TC_LEAF + ;; + *) + rmdir $CGRP2_TC_LEAF && rmdir $CGRP2_TC + [ "$MOUNT_CGRP2" == 'yes' ] && umount $CGRP2_ROOT + ;; + esac +} + +setup_bpf_cgrp2_array() { + local bpf_cgrp2_array="$BPF_FS_TC_SHARE/$BPF_CGRP2_ARRAY_NAME" + case $1 in + start) + $MY_DIR/test_cgrp2_array_pin -U $bpf_cgrp2_array -v $CGRP2_TC + ;; + *) + [ -d "$BPF_FS_TC_SHARE" ] && rm -f $bpf_cgrp2_array + ;; + esac +} + +setup_net() { + case $1 in + start) + $IP link add $HOST_IFC type veth peer name $NS_IFC || return $? + $IP link set dev $HOST_IFC up || return $? + sysctl -q net.ipv6.conf.$HOST_IFC.accept_dad=0 + + $IP netns add ns || return $? + $IP link set dev $NS_IFC netns ns || return $? + $IP -n $NS link set dev $NS_IFC up || return $? + $IP netns exec $NS sysctl -q net.ipv6.conf.$NS_IFC.accept_dad=0 + $TC qdisc add dev $HOST_IFC clsact || return $? + $TC filter add dev $HOST_IFC egress bpf da obj $BPF_PROG sec $BPF_SECTION || return $? + ;; + *) + $IP netns del $NS + $IP link del $HOST_IFC + ;; + esac +} + +run_in_cgrp() { + # Fork another bash and move it under the specified cgroup. + # It makes the cgroup cleanup easier at the end of the test. + cmd='echo $$ > ' + cmd="$cmd $1/cgroup.procs; exec $2" + bash -c "$cmd" +} + +do_test() { + run_in_cgrp $CGRP2_TC_LEAF "ping -6 -c3 ff02::1%$HOST_IFC >& /dev/null" + local dropped=$($TC -s qdisc show dev $HOST_IFC | tail -3 | \ + awk '/drop/{print substr($7, 0, index($7, ",")-1)}') + if [[ $dropped -eq 0 ]] + then + echo "FAIL" + return 1 + else + echo "Successfully filtered $dropped packets" + return 0 + fi +} + +do_exit() { + if [ "$DEBUG" == "yes" ] && [ "$MODE" != 'cleanuponly' ] + then + echo "------ DEBUG ------" + echo "mount: "; mount | egrep '(cgroup2|bpf)'; echo + echo "$CGRP2_TC_LEAF: "; ls -l $CGRP2_TC_LEAF; echo + if [ -d "$BPF_FS_TC_SHARE" ] + then + echo "$BPF_FS_TC_SHARE: "; ls -l $BPF_FS_TC_SHARE; echo + fi + echo "Host net:" + $IP netns + $IP link show dev $HOST_IFC + $IP -6 a show dev $HOST_IFC + $TC -s qdisc show dev $HOST_IFC + echo + echo "$NS net:" + $IP -n $NS link show dev $NS_IFC + $IP -n $NS -6 link show dev $NS_IFC + echo "------ DEBUG ------" + echo + fi + + if [ "$MODE" != 'nocleanup' ] + then + setup_net stop + setup_bpf_cgrp2_array stop + setup_cgrp2 stop + fi +} + +init_cgrp2_vars +init_bpf_fs_vars + +while [[ $# -ge 1 ]] +do + a="$1" + case $a in + debug) + DEBUG='yes' + shift 1 + ;; + cleanup-only) + MODE='cleanuponly' + shift 1 + ;; + no-cleanup) + MODE='nocleanup' + shift 1 + ;; + *) + echo "test_cgrp2_tc [debug] [cleanup-only | no-cleanup]" + echo " debug: Print cgrp and network setup details at the end of the test" + echo " cleanup-only: Try to cleanup things from last test. No test will be run" + echo " no-cleanup: Run the test but don't do cleanup at the end" + echo "[Note: If no arg is given, it will run the test and do cleanup at the end]" + echo + exit -1 + ;; + esac +done + +trap do_exit 0 + +[ "$MODE" == 'cleanuponly' ] && exit + +setup_cgrp2 start || exit $? +setup_net start || exit $? +init_bpf_fs_vars || exit $? +setup_bpf_cgrp2_array start || exit $? +do_test +echo diff --git a/samples/bpf/test_cgrp2_tc_kern.c b/samples/bpf/test_cgrp2_tc_kern.c new file mode 100644 index 000000000000..2732c37c8d5b --- /dev/null +++ b/samples/bpf/test_cgrp2_tc_kern.c @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <uapi/linux/if_ether.h> +#include <uapi/linux/in6.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/pkt_cls.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +/* copy of 'struct ethhdr' without __packed */ +struct eth_hdr { + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + unsigned short h_proto; +}; + +#define PIN_GLOBAL_NS 2 +struct bpf_elf_map { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; +}; + +struct bpf_elf_map SEC("maps") test_cgrp2_array_pin = { + .type = BPF_MAP_TYPE_CGROUP_ARRAY, + .size_key = sizeof(uint32_t), + .size_value = sizeof(uint32_t), + .pinning = PIN_GLOBAL_NS, + .max_elem = 1, +}; + +SEC("filter") +int handle_egress(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + struct ipv6hdr *ip6h = data + sizeof(*eth); + void *data_end = (void *)(long)skb->data_end; + char dont_care_msg[] = "dont care %04x %d\n"; + char pass_msg[] = "pass\n"; + char reject_msg[] = "reject\n"; + + /* single length check */ + if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) + return TC_ACT_OK; + + if (eth->h_proto != htons(ETH_P_IPV6) || + ip6h->nexthdr != IPPROTO_ICMPV6) { + bpf_trace_printk(dont_care_msg, sizeof(dont_care_msg), + eth->h_proto, ip6h->nexthdr); + return TC_ACT_OK; + } else if (bpf_skb_in_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) { + bpf_trace_printk(pass_msg, sizeof(pass_msg)); + return TC_ACT_OK; + } else { + bpf_trace_printk(reject_msg, sizeof(reject_msg)); + return TC_ACT_SHOT; + } +} + +char _license[] SEC("license") = "GPL"; |