From cd67cd5eb25ae9a7bafbfd3d52d4c05e1d80af3b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Feb 2015 09:44:44 +0200 Subject: ipvs: use 64-bit rates in stats IPVS stats are limited to 2^(32-10) conns/s and packets/s, 2^(32-5) bytes/s. It is time to use 64 bits: * Change all conn/packet kernel counters to 64-bit and update them in u64_stats_update_{begin,end} section * In kernel use struct ip_vs_kstats instead of the user-space struct ip_vs_stats_user and use new func ip_vs_export_stats_user to export it to sockopt users to preserve compatibility with 32-bit values * Rename cpu counters "ustats" to "cnt" * To netlink users provide additionally 64-bit stats: IPVS_SVC_ATTR_STATS64 and IPVS_DEST_ATTR_STATS64. Old stats remain for old binaries. * We can use ip_vs_copy_stats in ip_vs_stats_percpu_show Thanks to Chris Caputo for providing initial patch for ip_vs_est.c Signed-off-by: Chris Caputo Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/uapi/linux/ip_vs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index cabe95d5b461..3199243f2028 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -358,6 +358,8 @@ enum { IPVS_SVC_ATTR_PE_NAME, /* name of ct retriever */ + IPVS_SVC_ATTR_STATS64, /* nested attribute for service stats */ + __IPVS_SVC_ATTR_MAX, }; @@ -387,6 +389,8 @@ enum { IPVS_DEST_ATTR_ADDR_FAMILY, /* Address family of address */ + IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */ + __IPVS_DEST_ATTR_MAX, }; @@ -410,7 +414,8 @@ enum { /* * Attributes used to describe service or destination entry statistics * - * Used inside nested attributes IPVS_SVC_ATTR_STATS and IPVS_DEST_ATTR_STATS + * Used inside nested attributes IPVS_SVC_ATTR_STATS, IPVS_DEST_ATTR_STATS, + * IPVS_SVC_ATTR_STATS64 and IPVS_DEST_ATTR_STATS64. */ enum { IPVS_STATS_ATTR_UNSPEC = 0, -- cgit v1.2.3 From 93a714d6b53d87872e552dbb273544bdeaaf6e12 Mon Sep 17 00:00:00 2001 From: Madhu Challa Date: Wed, 25 Feb 2015 09:58:35 -0800 Subject: multicast: Extend ip address command to enable multicast group join/leave on Joining multicast group on ethernet level via "ip maddr" command would not work if we have an Ethernet switch that does igmp snooping since the switch would not replicate multicast packets on ports that did not have IGMP reports for the multicast addresses. Linux vxlan interfaces created via "ip link add vxlan" have the group option that enables then to do the required join. By extending ip address command with option "autojoin" we can get similar functionality for openvswitch vxlan interfaces as well as other tunneling mechanisms that need to receive multicast traffic. The kernel code is structured similar to how the vxlan driver does a group join / leave. example: ip address add 224.1.1.10/24 dev eth5 autojoin ip address del 224.1.1.10/24 dev eth5 Signed-off-by: Madhu Challa Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/netns/ipv6.h | 1 + include/uapi/linux/if_addr.h | 1 + net/ipv4/devinet.c | 31 +++++++++++++++++++++++++++++++ net/ipv4/igmp.c | 13 +++++++++++++ net/ipv6/addrconf.c | 38 +++++++++++++++++++++++++++++++++++--- net/ipv6/mcast.c | 20 ++++++++++++++++---- 7 files changed, 98 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index dbe225478adb..1b26c6c3fd7c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,7 @@ struct netns_ipv4 { struct sock *fibnl; struct sock * __percpu *icmp_sk; + struct sock *mc_autojoin_sk; struct inet_peer_base *peers; struct tcpm_hash_bucket *tcp_metrics_hash; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 69ae41f2098c..ca0db12cd089 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -67,6 +67,7 @@ struct netns_ipv6 { struct sock *ndisc_sk; struct sock *tcp_sk; struct sock *igmp_sk; + struct sock *mc_autojoin_sk; #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES struct mr6_table *mrt6; diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index dea10a87dfd1..40fdfea39714 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -50,6 +50,7 @@ enum { #define IFA_F_PERMANENT 0x80 #define IFA_F_MANAGETEMPADDR 0x100 #define IFA_F_NOPREFIXROUTE 0x200 +#define IFA_F_MCAUTOJOIN 0x400 struct ifa_cacheinfo { __u32 ifa_prefered; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3a8985c94581..5105759e4e00 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -548,6 +548,26 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, return NULL; } +static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) +{ + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ifa->ifa_address, + .imr_ifindex = ifa->ifa_dev->dev->ifindex, + }; + int ret; + + ASSERT_RTNL(); + + lock_sock(sk); + if (join) + ret = __ip_mc_join_group(sk, &mreq); + else + ret = __ip_mc_leave_group(sk, &mreq); + release_sock(sk); + + return ret; +} + static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -584,6 +604,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) continue; + if (ipv4_is_multicast(ifa->ifa_address)) + ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); return 0; } @@ -838,6 +860,15 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); + if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { + int ret = ip_mc_config(net->ipv4.mc_autojoin_sk, + true, ifa); + + if (ret < 0) { + inet_free_ifa(ifa); + return ret; + } + } return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); } else { inet_free_ifa(ifa); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4b1172d73e03..5cb1ef4ce292 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #ifdef CONFIG_IP_MROUTE #include @@ -2740,6 +2741,7 @@ static const struct file_operations igmp_mcf_seq_fops = { static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; + int err; pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); if (!pde) @@ -2748,8 +2750,18 @@ static int __net_init igmp_net_init(struct net *net) &igmp_mcf_seq_fops); if (!pde) goto out_mcfilter; + err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET, + SOCK_DGRAM, 0, net); + if (err < 0) { + pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n", + err); + goto out_sock; + } + return 0; +out_sock: + remove_proc_entry("mcfilter", net->proc_net); out_mcfilter: remove_proc_entry("igmp", net->proc_net); out_igmp: @@ -2760,6 +2772,7 @@ static void __net_exit igmp_net_exit(struct net *net) { remove_proc_entry("mcfilter", net->proc_net); remove_proc_entry("igmp", net->proc_net); + inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk); } static struct pernet_operations igmp_net_ops = { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 98e4a63d72bb..783bccfcc060 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2464,6 +2464,23 @@ err_exit: return err; } +static int ipv6_mc_config(struct sock *sk, bool join, + const struct in6_addr *addr, int ifindex) +{ + int ret; + + ASSERT_RTNL(); + + lock_sock(sk); + if (join) + ret = __ipv6_sock_mc_join(sk, ifindex, addr); + else + ret = __ipv6_sock_mc_drop(sk, ifindex, addr); + release_sock(sk); + + return ret; +} + /* * Manual configuration of address on an interface */ @@ -2476,10 +2493,10 @@ static int inet6_addr_add(struct net *net, int ifindex, struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; + unsigned long timeout; + clock_t expires; int scope; u32 flags; - clock_t expires; - unsigned long timeout; ASSERT_RTNL(); @@ -2501,6 +2518,14 @@ static int inet6_addr_add(struct net *net, int ifindex, if (IS_ERR(idev)) return PTR_ERR(idev); + if (ifa_flags & IFA_F_MCAUTOJOIN) { + int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, + true, pfx, ifindex); + + if (ret < 0) + return ret; + } + scope = ipv6_addr_scope(pfx); timeout = addrconf_timeout_fixup(valid_lft, HZ); @@ -2542,6 +2567,9 @@ static int inet6_addr_add(struct net *net, int ifindex, in6_ifa_put(ifp); addrconf_verify_rtnl(); return 0; + } else if (ifa_flags & IFA_F_MCAUTOJOIN) { + ipv6_mc_config(net->ipv6.mc_autojoin_sk, + false, pfx, ifindex); } return PTR_ERR(ifp); @@ -2578,6 +2606,10 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, jiffies); ipv6_del_addr(ifp); addrconf_verify_rtnl(); + if (ipv6_addr_is_multicast(pfx)) { + ipv6_mc_config(net->ipv6.mc_autojoin_sk, + false, pfx, dev->ifindex); + } return 0; } } @@ -3945,7 +3977,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) /* We ignore other flags so far. */ ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | - IFA_F_NOPREFIXROUTE; + IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN; ifa = ipv6_get_ifaddr(net, pfx, dev, 1); if (ifa == NULL) { diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index e4955d019734..1dd1fedff9f4 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2929,20 +2929,32 @@ static int __net_init igmp6_net_init(struct net *net) inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; + err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); + if (err < 0) { + pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n", + err); + goto out_sock_create; + } + err = igmp6_proc_init(net); if (err) - goto out_sock_create; -out: - return err; + goto out_sock_create_autojoin; + + return 0; +out_sock_create_autojoin: + inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); out_sock_create: inet_ctl_sock_destroy(net->ipv6.igmp_sk); - goto out; +out: + return err; } static void __net_exit igmp6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.igmp_sk); + inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); igmp6_proc_exit(net); } -- cgit v1.2.3 From 31f909a2c0abfc1a1a76b2981d28ac85d33210e7 Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Tue, 24 Feb 2015 22:42:16 +0900 Subject: nl/mac80211: allow zero plink timeout to disable STA expiration Both wpa_supplicant and mac80211 have and inactivity timer. By default wpa_supplicant will be timed out in 5 minutes and mac80211's it is 30 minutes. If wpa_supplicant uses a longer timer than mac80211, it will get unexpected disconnection by mac80211. Using 0xffffffff instead as the configured value could solve this w/o changing the code, but due to integer overflow in the expression used this doesn't work. The expression is: (current jiffies) > (frame Rx jiffies + NL80211_MESHCONF_PLINK_TIMEOUT * 250) On 32bit system, the right side would overflow and be a very small value if NL80211_MESHCONF_PLINK_TIMEOUT is sufficiently large, causing unexpectedly early disconnections. Instead allow disabling the inactivity timer to avoid this situation, by passing the (previously invalid and useless) value 0. Signed-off-by: Masashi Honma [reword/rewrap commit log] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 ++- net/mac80211/mesh.c | 3 ++- net/wireless/nl80211.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 68b294e83944..2dcf9bba317c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3092,7 +3092,8 @@ enum nl80211_mesh_power_mode { * * @NL80211_MESHCONF_PLINK_TIMEOUT: If no tx activity is seen from a STA we've * established peering with for longer than this time (in seconds), then - * remove it from the STA's list of peers. Default is 30 minutes. + * remove it from the STA's list of peers. You may set this to 0 to disable + * the removal of the STA. Default is 30 minutes. * * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use */ diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 0c8b2a77d312..acf441ff9f4a 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -574,7 +574,8 @@ static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u32 changed; - ieee80211_sta_expire(sdata, ifmsh->mshcfg.plink_timeout * HZ); + if (ifmsh->mshcfg.plink_timeout > 0) + ieee80211_sta_expire(sdata, ifmsh->mshcfg.plink_timeout * HZ); mesh_path_expire(sdata); changed = mesh_accept_plinks_update(sdata); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d78fd8b54515..9c6e23ede5b2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5265,7 +5265,7 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 0, 65535, mask, NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 1, 0xffffffff, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff, mask, NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32); if (mask_out) -- cgit v1.2.3 From f1a66f85b74c5ef7b503f746ea97742dacd56419 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:43 +0100 Subject: ebpf: export BPF_PSEUDO_MAP_FD to uapi We need to export BPF_PSEUDO_MAP_FD to user space, as it's used in the ELF BPF loader where instructions are being loaded that need map fixups. An initial stage loads all maps into the kernel, and later on replaces related instructions in the eBPF blob with BPF_PSEUDO_MAP_FD as source register and the actual fd as immediate value. The kernel verifier recognizes this keyword and replaces the map fd with a real pointer internally. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 2 -- include/uapi/linux/bpf.h | 2 ++ samples/bpf/libbpf.h | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index caac2087a4d5..5e3863d5f666 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -145,8 +145,6 @@ struct bpf_prog_aux; .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) -#define BPF_PSEUDO_MAP_FD 1 - /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45da7ec7d274..0248180bf2e2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -120,6 +120,8 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCKET_FILTER, }; +#define BPF_PSEUDO_MAP_FD 1 + /* flags for BPF_MAP_UPDATE_ELEM command */ #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h index 58c5fe1bdba1..a6bb7e9c22c3 100644 --- a/samples/bpf/libbpf.h +++ b/samples/bpf/libbpf.h @@ -92,7 +92,9 @@ extern char bpf_log_buf[LOG_BUF_SIZE]; .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) -#define BPF_PSEUDO_MAP_FD 1 +#ifndef BPF_PSEUDO_MAP_FD +# define BPF_PSEUDO_MAP_FD 1 +#endif /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ -- cgit v1.2.3 From 96be4325f443dbbfeb37d2a157675ac0736531a1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:46 +0100 Subject: ebpf: add sched_cls_type and map it to sk_filter's verifier ops As discussed recently and at netconf/netdev01, we want to prevent making bpf_verifier_ops registration available for modules, but have them at a controlled place inside the kernel instead. The reason for this is, that out-of-tree modules can go crazy and define and register any verfifier ops they want, doing all sorts of crap, even bypassing available GPLed eBPF helper functions. We don't want to offer such a shiny playground, of course, but keep strict control to ourselves inside the core kernel. This also encourages us to design eBPF user helpers carefully and generically, so they can be shared among various subsystems using eBPF. For the eBPF traffic classifier (cls_bpf), it's a good start to share the same helper facilities as we currently do in eBPF for socket filters. That way, we have BPF_PROG_TYPE_SCHED_CLS look like it's own type, thus one day if there's a good reason to diverge the set of helper functions from the set available to socket filters, we keep ABI compatibility. In future, we could place all bpf_prog_type_list at a central place, perhaps. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 15 +++++++++++++-- net/core/filter.c | 7 +++++++ 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0248180bf2e2..3fa1af8a58d7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_SCHED_CLS, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a28e09c7825d..594d341f04db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1172,6 +1172,17 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +static bool may_access_skb(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + return true; + default: + return false; + } +} + /* verify safety of LD_ABS|LD_IND instructions: * - they can only appear in the programs where ctx == skb * - since they are wrappers of function calls, they scratch R1-R5 registers, @@ -1194,8 +1205,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *reg; int i, err; - if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { - verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + if (!may_access_skb(env->prog->aux->prog_type)) { + verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); return -EINVAL; } diff --git a/net/core/filter.c b/net/core/filter.c index 741721233166..514d4082f326 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1166,9 +1166,16 @@ static struct bpf_prog_type_list sk_filter_type __read_mostly = { .type = BPF_PROG_TYPE_SOCKET_FILTER, }; +static struct bpf_prog_type_list sched_cls_type __read_mostly = { + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SCHED_CLS, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); + bpf_register_prog_type(&sched_cls_type); + return 0; } late_initcall(register_sk_filter_ops); -- cgit v1.2.3 From e2e9b6541dd4b31848079da80fe2253daaafb549 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:48 +0100 Subject: cls_bpf: add initial eBPF support for programmable classifiers This work extends the "classic" BPF programmable tc classifier by extending its scope also to native eBPF code! This allows for user space to implement own custom, 'safe' C like classifiers (or whatever other frontend language LLVM et al may provide in future), that can then be compiled with the LLVM eBPF backend to an eBPF elf file. The result of this can be loaded into the kernel via iproute2's tc. In the kernel, they can be JITed on major archs and thus run in native performance. Simple, minimal toy example to demonstrate the workflow: #include #include #include #include "tc_bpf_api.h" __section("classify") int cls_main(struct sk_buff *skb) { return (0x800 << 16) | load_byte(skb, ETH_HLEN + __builtin_offsetof(struct iphdr, tos)); } char __license[] __section("license") = "GPL"; The classifier can then be compiled into eBPF opcodes and loaded via tc, for example: clang -O2 -emit-llvm -c cls.c -o - | llc -march=bpf -filetype=obj -o cls.o tc filter add dev em1 parent 1: bpf cls.o [...] As it has been demonstrated, the scope can even reach up to a fully fledged flow dissector (similarly as in samples/bpf/sockex2_kern.c). For tc, maps are allowed to be used, but from kernel context only, in other words, eBPF code can keep state across filter invocations. In future, we perhaps may reattach from a different application to those maps e.g., to read out collected statistics/state. Similarly as in socket filters, we may extend functionality for eBPF classifiers over time depending on the use cases. For that purpose, cls_bpf programs are using BPF_PROG_TYPE_SCHED_CLS program type, so we can allow additional functions/accessors (e.g. an ABI compatible offset translation to skb fields/metadata). For an initial cls_bpf support, we allow the same set of helper functions as eBPF socket filters, but we could diverge at some point in time w/o problem. I was wondering whether cls_bpf and act_bpf could share C programs, I can imagine that at some point, we introduce i) further common handlers for both (or even beyond their scope), and/or if truly needed ii) some restricted function space for each of them. Both can be abstracted easily through struct bpf_verifier_ops in future. The context of cls_bpf versus act_bpf is slightly different though: a cls_bpf program will return a specific classid whereas act_bpf a drop/non-drop return code, latter may also in future mangle skbs. That said, we can surely have a "classify" and "action" section in a single object file, or considered mentioned constraint add a possibility of a shared section. The workflow for getting native eBPF running from tc [1] is as follows: for f_bpf, I've added a slightly modified ELF parser code from Alexei's kernel sample, which reads out the LLVM compiled object, sets up maps (and dynamically fixes up map fds) if any, and loads the eBPF instructions all centrally through the bpf syscall. The resulting fd from the loaded program itself is being passed down to cls_bpf, which looks up struct bpf_prog from the fd store, and holds reference, so that it stays available also after tc program lifetime. On tc filter destruction, it will then drop its reference. Moreover, I've also added the optional possibility to annotate an eBPF filter with a name (e.g. path to object file, or something else if preferred) so that when tc dumps currently installed filters, some more context can be given to an admin for a given instance (as opposed to just the file descriptor number). Last but not least, bpf_prog_get() and bpf_prog_put() needed to be exported, so that eBPF can be used from cls_bpf built as a module. Thanks to 60a3b2253c41 ("net: bpf: make eBPF interpreter images read-only") I think this is of no concern since anything wanting to alter eBPF opcode after verification stage would crash the kernel. [1] http://git.breakpoint.cc/cgit/dborkman/iproute2.git/log/?h=ebpf Signed-off-by: Daniel Borkmann Cc: Jamal Hadi Salim Cc: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 + kernel/bpf/syscall.c | 2 + net/sched/cls_bpf.c | 206 ++++++++++++++++++++++++++++++++----------- 3 files changed, 158 insertions(+), 52 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 25731dfb3fcc..bf08e76bf505 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -397,6 +397,8 @@ enum { TCA_BPF_CLASSID, TCA_BPF_OPS_LEN, TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, __TCA_BPF_MAX, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d69449acbd0..669719ccc9ee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -419,6 +419,7 @@ void bpf_prog_put(struct bpf_prog *prog) bpf_prog_free(prog); } } +EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) { @@ -466,6 +467,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) fdput(f); return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_get); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD log_buf diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5f3ee9e4b5bf..6f7ed8f8e6ee 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include #include @@ -24,6 +26,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann "); MODULE_DESCRIPTION("TC BPF based classifier"); +#define CLS_BPF_NAME_LEN 256 + struct cls_bpf_head { struct list_head plist; u32 hgen; @@ -32,18 +36,24 @@ struct cls_bpf_head { struct cls_bpf_prog { struct bpf_prog *filter; - struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct tcf_result res; struct list_head link; + struct tcf_result res; + struct tcf_exts exts; u32 handle; - u16 bpf_num_ops; + union { + u32 bpf_fd; + u16 bpf_num_ops; + }; + struct sock_filter *bpf_ops; + const char *bpf_name; struct tcf_proto *tp; struct rcu_head rcu; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_FD] = { .type = NLA_U32 }, + [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, @@ -76,6 +86,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; } +static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) +{ + return !prog->bpf_ops; +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -94,8 +109,12 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) { tcf_exts_destroy(&prog->exts); - bpf_prog_destroy(prog->filter); + if (cls_bpf_is_ebpf(prog)) + bpf_prog_put(prog->filter); + else + bpf_prog_destroy(prog->filter); + kfree(prog->bpf_name); kfree(prog->bpf_ops); kfree(prog); } @@ -114,6 +133,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); + return 0; } @@ -151,69 +171,121 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) return ret; } -static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, - struct cls_bpf_prog *prog, - unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) +static int cls_bpf_prog_from_ops(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) { struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct sock_fprog_kern tmp; + struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; u16 bpf_size, bpf_num_ops; - u32 classid; int ret; - if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) - return -EINVAL; - - tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); - ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); - if (ret < 0) - return ret; - - classid = nla_get_u32(tb[TCA_BPF_CLASSID]); bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]); - if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) { - ret = -EINVAL; - goto errout; - } + if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) + return -EINVAL; bpf_size = bpf_num_ops * sizeof(*bpf_ops); - if (bpf_size != nla_len(tb[TCA_BPF_OPS])) { - ret = -EINVAL; - goto errout; - } + if (bpf_size != nla_len(tb[TCA_BPF_OPS])) + return -EINVAL; bpf_ops = kzalloc(bpf_size, GFP_KERNEL); - if (bpf_ops == NULL) { - ret = -ENOMEM; - goto errout; - } + if (bpf_ops == NULL) + return -ENOMEM; memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); - tmp.len = bpf_num_ops; - tmp.filter = bpf_ops; + fprog_tmp.len = bpf_num_ops; + fprog_tmp.filter = bpf_ops; - ret = bpf_prog_create(&fp, &tmp); - if (ret) - goto errout_free; + ret = bpf_prog_create(&fp, &fprog_tmp); + if (ret < 0) { + kfree(bpf_ops); + return ret; + } - prog->bpf_num_ops = bpf_num_ops; prog->bpf_ops = bpf_ops; + prog->bpf_num_ops = bpf_num_ops; + prog->bpf_name = NULL; + prog->filter = fp; prog->res.classid = classid; + return 0; +} + +static int cls_bpf_prog_from_efd(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) +{ + struct bpf_prog *fp; + char *name = NULL; + u32 bpf_fd; + + bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); + + fp = bpf_prog_get(bpf_fd); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + if (fp->type != BPF_PROG_TYPE_SCHED_CLS) { + bpf_prog_put(fp); + return -EINVAL; + } + + if (tb[TCA_BPF_NAME]) { + name = kmemdup(nla_data(tb[TCA_BPF_NAME]), + nla_len(tb[TCA_BPF_NAME]), + GFP_KERNEL); + if (!name) { + bpf_prog_put(fp); + return -ENOMEM; + } + } + + prog->bpf_ops = NULL; + prog->bpf_fd = bpf_fd; + prog->bpf_name = name; + + prog->filter = fp; + prog->res.classid = classid; + + return 0; +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, + struct cls_bpf_prog *prog, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts exts; + bool is_bpf, is_ebpf; + u32 classid; + int ret; + + is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; + is_ebpf = tb[TCA_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || + !tb[TCA_BPF_CLASSID]) + return -EINVAL; + + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + return ret; + + classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + + ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) : + cls_bpf_prog_from_efd(tb, prog, classid); + if (ret < 0) { + tcf_exts_destroy(&exts); + return ret; + } + tcf_bind_filter(tp, &prog->res, base); tcf_exts_change(tp, &prog->exts, &exts); return 0; -errout_free: - kfree(bpf_ops); -errout: - tcf_exts_destroy(&exts); - return ret; } static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, @@ -297,11 +369,43 @@ errout: return ret; } +static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + struct nlattr *nla; + + if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) + return -EMSGSIZE; + + nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * + sizeof(struct sock_filter)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + return 0; +} + +static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd)) + return -EMSGSIZE; + + if (prog->bpf_name && + nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) + return -EMSGSIZE; + + return 0; +} + static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *tm) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; - struct nlattr *nest, *nla; + struct nlattr *nest; + int ret; if (prog == NULL) return skb->len; @@ -314,16 +418,14 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; - if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) - goto nla_put_failure; - nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * - sizeof(struct sock_filter)); - if (nla == NULL) + if (cls_bpf_is_ebpf(prog)) + ret = cls_bpf_dump_ebpf_info(prog, skb); + else + ret = cls_bpf_dump_bpf_info(prog, skb); + if (ret) goto nla_put_failure; - memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); - if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; -- cgit v1.2.3 From ffc1199122d83d60ad99f9c55df32feb650b7bff Mon Sep 17 00:00:00 2001 From: "Janusz.Dziedzic@tieto.com" Date: Sat, 21 Feb 2015 16:52:39 +0100 Subject: cfg80211: add VHT support for IBSS Add NL80211_EXT_FEATURE_VHT_IBSS flag and VHT support for IBSS. Signed-off-by: Janusz Dziedzic Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 2dcf9bba317c..8ee31f108407 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4328,11 +4328,13 @@ enum nl80211_feature_flags { /** * enum nl80211_ext_feature_index - bit index of extended features. + * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ enum nl80211_ext_feature_index { + NL80211_EXT_FEATURE_VHT_IBSS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9c6e23ede5b2..66666fdf1c8d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7265,8 +7265,18 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) break; case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: - if (rdev->wiphy.features & NL80211_FEATURE_HT_IBSS) - break; + if (!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS)) + return -EINVAL; + break; + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + if (!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS)) + return -EINVAL; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_VHT_IBSS)) + return -EINVAL; + break; default: return -EINVAL; } -- cgit v1.2.3 From 5fc7432991a86678b38a2d700edbe8bcd29cc579 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Feb 2015 15:32:43 +0100 Subject: nl80211: add notes about userspace API/ABI modifications Add notes about userspace ABI/API modifications, including the fact that we decided that API submissions should come with a driver implementation. Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8ee31f108407..90c5aeb3cca7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -25,6 +25,19 @@ * */ +/* + * This header file defines the userspace API to the wireless stack. Please + * be careful not to break things - i.e. don't move anything around or so + * unless you can demonstrate that it breaks neither API nor ABI. + * + * Additions to the API should be accompanied by actual implementations in + * an upstream driver, so that example implementations exist in case there + * are ever concerns about the precise semantics of the API or changes are + * needed, and to ensure that code for dead (no longer implemented) API + * can actually be identified and removed. + * Nonetheless, semantics should also be documented carefully in this file. + */ + #include #define NL80211_GENL_NAME "nl80211" -- cgit v1.2.3 From 03c0566542f4c7a45ce3193f27cbf5700b506c18 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 3 Mar 2015 19:13:56 -0600 Subject: mpls: Netlink commands to add, remove, and dump routes This change adds two new netlink routing attributes: RTA_VIA and RTA_NEWDST. RTA_VIA specifies the specifies the next machine to send a packet to like RTA_GATEWAY. RTA_VIA differs from RTA_GATEWAY in that it includes the address family of the address of the next machine to send a packet to. Currently the MPLS code supports addresses in AF_INET, AF_INET6 and AF_PACKET. For AF_INET and AF_INET6 the destination mac address is acquired from the neighbour table. For AF_PACKET the destination mac_address is specified in the netlink configuration. I think raw destination mac address support with the family AF_PACKET will prove useful. There is MPLS-TP which is defined to operate on machines that do not support internet packets of any flavor. Further seem to be corner cases where it can be useful. At this point I don't care much either way. RTA_NEWDST specifies the destination address to forward the packet with. MPLS typically changes it's destination address at every hop. For a swap operation RTA_NEWDST is specified with a length of one label. For a push operation RTA_NEWDST is specified with two or more labels. For a pop operation RTA_NEWDST is not specified or equivalently an emtpy RTAN_NEWDST is specified. Those new netlink attributes are used to implement handling of rt-netlink RTM_NEWROUTE, RTM_DELROUTE, and RTM_GETROUTE messages, to maintain the MPLS label table. rtm_to_route_config parses a netlink RTM_NEWROUTE or RTM_DELROUTE message, verify no unhandled attributes or unhandled values are present and sets up the data structures for mpls_route_add and mpls_route_del. I did my best to match up with the existing conventions with the caveats that MPLS addresses are all destination-specific-addresses, and so don't properly have a scope. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 8 ++ net/mpls/af_mpls.c | 229 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 5cc5d66bf519..bad65550ae3e 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -303,6 +303,8 @@ enum rtattr_type_t { RTA_TABLE, RTA_MARK, RTA_MFC_STATS, + RTA_VIA, + RTA_NEWDST, __RTA_MAX }; @@ -344,6 +346,12 @@ struct rtnexthop { #define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) #define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) +/* RTA_VIA */ +struct rtvia { + __kernel_sa_family_t rtvia_family; + __u8 rtvia_addr[0]; +}; + /* RTM_CACHEINFO */ struct rta_cacheinfo { diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 2d6612a10e30..b4d7cec398d2 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -212,6 +212,11 @@ static struct packet_type mpls_packet_type __read_mostly = { .func = mpls_forward, }; +const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { + [RTA_DST] = { .type = NLA_U32 }, + [RTA_OIF] = { .type = NLA_U32 }, +}; + struct mpls_route_config { u32 rc_protocol; u32 rc_ifindex; @@ -410,6 +415,22 @@ static struct notifier_block mpls_dev_notifier = { .notifier_call = mpls_dev_notify, }; +static int nla_put_via(struct sk_buff *skb, + u16 family, const void *addr, int alen) +{ + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + return -EMSGSIZE; + + via = nla_data(nla); + via->rtvia_family = family; + memcpy(via->rtvia_addr, addr, alen); + return 0; +} + int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]) { @@ -467,6 +488,210 @@ int nla_get_labels(const struct nlattr *nla, return 0; } +static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, + struct mpls_route_config *cfg) +{ + struct rtmsg *rtm; + struct nlattr *tb[RTA_MAX+1]; + int index; + int err; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + rtm = nlmsg_data(nlh); + memset(cfg, 0, sizeof(*cfg)); + + if (rtm->rtm_family != AF_MPLS) + goto errout; + if (rtm->rtm_dst_len != 20) + goto errout; + if (rtm->rtm_src_len != 0) + goto errout; + if (rtm->rtm_tos != 0) + goto errout; + if (rtm->rtm_table != RT_TABLE_MAIN) + goto errout; + /* Any value is acceptable for rtm_protocol */ + + /* As mpls uses destination specific addresses + * (or source specific address in the case of multicast) + * all addresses have universal scope. + */ + if (rtm->rtm_scope != RT_SCOPE_UNIVERSE) + goto errout; + if (rtm->rtm_type != RTN_UNICAST) + goto errout; + if (rtm->rtm_flags != 0) + goto errout; + + cfg->rc_label = LABEL_NOT_SPECIFIED; + cfg->rc_protocol = rtm->rtm_protocol; + cfg->rc_nlflags = nlh->nlmsg_flags; + cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; + cfg->rc_nlinfo.nlh = nlh; + cfg->rc_nlinfo.nl_net = sock_net(skb->sk); + + for (index = 0; index <= RTA_MAX; index++) { + struct nlattr *nla = tb[index]; + if (!nla) + continue; + + switch(index) { + case RTA_OIF: + cfg->rc_ifindex = nla_get_u32(nla); + break; + case RTA_NEWDST: + if (nla_get_labels(nla, MAX_NEW_LABELS, + &cfg->rc_output_labels, + cfg->rc_output_label)) + goto errout; + break; + case RTA_DST: + { + u32 label_count; + if (nla_get_labels(nla, 1, &label_count, + &cfg->rc_label)) + goto errout; + + /* The first 16 labels are reserved, and may not be set */ + if (cfg->rc_label < 16) + goto errout; + + break; + } + case RTA_VIA: + { + struct rtvia *via = nla_data(nla); + cfg->rc_via_family = via->rtvia_family; + cfg->rc_via_alen = nla_len(nla) - 2; + if (cfg->rc_via_alen > MAX_VIA_ALEN) + goto errout; + + /* Validate the address family */ + switch(cfg->rc_via_family) { + case AF_PACKET: + break; + case AF_INET: + if (cfg->rc_via_alen != 4) + goto errout; + break; + case AF_INET6: + if (cfg->rc_via_alen != 16) + goto errout; + break; + default: + /* Unsupported address family */ + goto errout; + } + + memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen); + break; + } + default: + /* Unsupported attribute */ + goto errout; + } + } + + err = 0; +errout: + return err; +} + +static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct mpls_route_config cfg; + int err; + + err = rtm_to_route_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return mpls_route_del(&cfg); +} + + +static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct mpls_route_config cfg; + int err; + + err = rtm_to_route_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return mpls_route_add(&cfg); +} + +static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, + u32 label, struct mpls_route *rt, int flags) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = AF_MPLS; + rtm->rtm_dst_len = 20; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_protocol = rt->rt_protocol; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + + if (rt->rt_labels && + nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label)) + goto nla_put_failure; + if (nla_put_via(skb, rt->rt_via_family, rt->rt_via, rt->rt_via_alen)) + goto nla_put_failure; + if (rt->rt_dev && nla_put_u32(skb, RTA_OIF, rt->rt_dev->ifindex)) + goto nla_put_failure; + if (nla_put_labels(skb, RTA_DST, 1, &label)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + unsigned int index; + + ASSERT_RTNL(); + + index = cb->args[0]; + if (index < 16) + index = 16; + + for (; index < net->mpls.platform_labels; index++) { + struct mpls_route *rt; + rt = net->mpls.platform_label[index]; + if (!rt) + continue; + + if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + index, rt, NLM_F_MULTI) < 0) + break; + } + cb->args[0] = index; + + return skb->len; +} + static int resize_platform_label_table(struct net *net, size_t limit) { size_t size = sizeof(struct mpls_route *) * limit; @@ -662,6 +887,9 @@ static int __init mpls_init(void) dev_add_pack(&mpls_packet_type); + rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL); + rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); + rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); err = 0; out: return err; @@ -674,6 +902,7 @@ module_init(mpls_init); static void __exit mpls_exit(void) { + rtnl_unregister_all(PF_MPLS); dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); -- cgit v1.2.3 From 8de147dc8e2adea82b8a1a2a08fcc983330f6770 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 3 Mar 2015 19:14:31 -0600 Subject: mpls: Multicast route table change notifications Unlike IPv4 this code notifies on all cases where mpls routes are added or removed and it never automatically removes routes. Avoiding both the userspace confusion that is caused by omitting route updates and the possibility of a flood of netlink traffic when an interface goes doew. For now reserved labels are handled automatically and userspace is not notified. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ net/mpls/af_mpls.c | 60 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index bad65550ae3e..06f75a407f74 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -631,6 +631,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF RTNLGRP_MDB, #define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_MPLS_ROUTE, +#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index b4d7cec398d2..75a994a50381 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -36,6 +36,10 @@ struct mpls_route { /* next hop label forwarding entry */ static int zero = 0; static int label_limit = (1 << 20) - 1; +static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, + struct nlmsghdr *nlh, struct net *net, u32 portid, + unsigned int nlm_flags); + static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) { struct mpls_route *rt = NULL; @@ -246,6 +250,20 @@ static void mpls_rt_free(struct mpls_route *rt) kfree_rcu(rt, rt_rcu); } +static void mpls_notify_route(struct net *net, unsigned index, + struct mpls_route *old, struct mpls_route *new, + const struct nl_info *info) +{ + struct nlmsghdr *nlh = info ? info->nlh : NULL; + unsigned portid = info ? info->portid : 0; + int event = new ? RTM_NEWROUTE : RTM_DELROUTE; + struct mpls_route *rt = new ? new : old; + unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0; + /* Ignore reserved labels for now */ + if (rt && (index >= 16)) + rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); +} + static void mpls_route_update(struct net *net, unsigned index, struct net_device *dev, struct mpls_route *new, const struct nl_info *info) @@ -260,6 +278,8 @@ static void mpls_route_update(struct net *net, unsigned index, old = rt; } + mpls_notify_route(net, index, old, new, info); + /* If we removed a route free it now */ mpls_rt_free(old); } @@ -692,6 +712,46 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static inline size_t lfib_nlmsg_size(struct mpls_route *rt) +{ + size_t payload = + NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(2 + rt->rt_via_alen) /* RTA_VIA */ + + nla_total_size(4); /* RTA_DST */ + if (rt->rt_labels) /* RTA_NEWDST */ + payload += nla_total_size(rt->rt_labels * 4); + if (rt->rt_dev) /* RTA_OIF */ + payload += nla_total_size(4); + return payload; +} + +static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, + struct nlmsghdr *nlh, struct net *net, u32 portid, + unsigned int nlm_flags) +{ + struct sk_buff *skb; + u32 seq = nlh ? nlh->nlmsg_seq : 0; + int err = -ENOBUFS; + + skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = mpls_dump_route(skb, portid, seq, event, label, rt, nlm_flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, portid, RTNLGRP_MPLS_ROUTE, nlh, GFP_KERNEL); + + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err); +} + static int resize_platform_label_table(struct net *net, size_t limit) { size_t size = sizeof(struct mpls_route *) * limit; -- cgit v1.2.3 From 98fc43864af9e74116eec81c290db048cded15d8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 1 Mar 2015 09:10:13 +0200 Subject: nl80211: prohibit mixing 'any' and regular wowlan triggers If the device supports waking up on 'any' signal - i.e. it continues operating as usual and wakes up the host on pretty much anything that happens, then it makes no sense to also configure the more restricted WoWLAN mode where the device operates more autonomously but also in a more restricted fashion. Currently only cw2100 supports both 'any' and other triggers, but it seems to be broken as it doesn't configure anything to the device, so we can't currently get into a situation where both even can correctly be configured. This is about to change (Intel devices are going to support both and have different behaviour depending on configuration) so make sure the conflicting modes cannot be configured. (It seems that cw2100 advertises 'any' and 'disconnect' as a means of saying that's what it will always do, but that isn't really the way this API was meant to be used nor does it actually mean anything as 'any' always implies 'disconnect' already, and the driver doesn't change device configuration in any way depending on the settings.) Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 90c5aeb3cca7..37e7f39441e5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3708,6 +3708,8 @@ struct nl80211_pattern_support { * @NL80211_WOWLAN_TRIG_ANY: wake up on any activity, do not really put * the chip into a special state -- works best with chips that have * support for low-power operation already (flag) + * Note that this mode is incompatible with all of the others, if + * any others are even supported by the device. * @NL80211_WOWLAN_TRIG_DISCONNECT: wake up on disconnect, the way disconnect * is detected is implementation-specific (flag) * @NL80211_WOWLAN_TRIG_MAGIC_PKT: wake up on magic packet (6x 0xff, followed diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 01874628ae00..07cef3d7653e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9105,6 +9105,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) const struct wiphy_wowlan_support *wowlan = rdev->wiphy.wowlan; int err, i; bool prev_enabled = rdev->wiphy.wowlan_config; + bool regular = false; if (!wowlan) return -EOPNOTSUPP; @@ -9132,12 +9133,14 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if (!(wowlan->flags & WIPHY_WOWLAN_DISCONNECT)) return -EINVAL; new_triggers.disconnect = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_MAGIC_PKT]) { if (!(wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT)) return -EINVAL; new_triggers.magic_pkt = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED]) @@ -9147,24 +9150,28 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if (!(wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE)) return -EINVAL; new_triggers.gtk_rekey_failure = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST]) { if (!(wowlan->flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ)) return -EINVAL; new_triggers.eap_identity_req = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE]) { if (!(wowlan->flags & WIPHY_WOWLAN_4WAY_HANDSHAKE)) return -EINVAL; new_triggers.four_way_handshake = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_RFKILL_RELEASE]) { if (!(wowlan->flags & WIPHY_WOWLAN_RFKILL_RELEASE)) return -EINVAL; new_triggers.rfkill_release = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_PKT_PATTERN]) { @@ -9173,6 +9180,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) int rem, pat_len, mask_len, pkt_offset; struct nlattr *pat_tb[NUM_NL80211_PKTPAT]; + regular = true; + nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN], rem) n_patterns++; @@ -9234,6 +9243,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) } if (tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION]) { + regular = true; err = nl80211_parse_wowlan_tcp( rdev, tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION], &new_triggers); @@ -9242,6 +9252,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) } if (tb[NL80211_WOWLAN_TRIG_NET_DETECT]) { + regular = true; err = nl80211_parse_wowlan_nd( rdev, wowlan, tb[NL80211_WOWLAN_TRIG_NET_DETECT], &new_triggers); @@ -9249,6 +9260,17 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) goto error; } + /* The 'any' trigger means the device continues operating more or less + * as in its normal operation mode and wakes up the host on most of the + * normal interrupts (like packet RX, ...) + * It therefore makes little sense to combine with the more constrained + * wakeup trigger modes. + */ + if (new_triggers.any && regular) { + err = -EINVAL; + goto error; + } + ntrig = kmemdup(&new_triggers, sizeof(new_triggers), GFP_KERNEL); if (!ntrig) { err = -ENOMEM; -- cgit v1.2.3 From 842a9ae08a25671db3d4f689eed68b4d64be15b5 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 4 Mar 2015 12:54:21 +0200 Subject: bridge: Extend Proxy ARP design to allow optional rules for Wi-Fi This extends the design in commit 958501163ddd ("bridge: Add support for IEEE 802.11 Proxy ARP") with optional set of rules that are needed to meet the IEEE 802.11 and Hotspot 2.0 requirements for ProxyARP. The previously added BR_PROXYARP behavior is left as-is and a new BR_PROXYARP_WIFI alternative is added so that this behavior can be configured from user space when required. In addition, this enables proxyarp functionality for unicast ARP requests for both BR_PROXYARP and BR_PROXYARP_WIFI since it is possible to use unicast as well as broadcast for these frames. The key differences in functionality: BR_PROXYARP: - uses the flag on the bridge port on which the request frame was received to determine whether to reply - block bridge port flooding completely on ports that enable proxy ARP BR_PROXYARP_WIFI: - uses the flag on the bridge port to which the target device of the request belongs - block bridge port flooding selectively based on whether the proxyarp functionality replied Signed-off-by: Jouni Malinen Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 3 +++ net/bridge/br_input.c | 17 ++++++++++------- net/bridge/br_netlink.c | 5 ++++- net/bridge/br_private.h | 1 + net/bridge/br_sysfs_if.c | 2 ++ 7 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index a57bca2ea97e..dad8b00beed2 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -44,6 +44,7 @@ struct br_ip_list { #define BR_PROMISC BIT(7) #define BR_PROXYARP BIT(8) #define BR_LEARNING_SYNC BIT(9) +#define BR_PROXYARP_WIFI BIT(10) extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index dfd0bb22e554..756436e1ce89 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -247,6 +247,7 @@ enum { IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ IFLA_BRPORT_PROXYARP, /* proxy ARP */ IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ + IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index f96933a823e3..1238fabff874 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -188,6 +188,9 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) continue; + if ((p->flags & BR_PROXYARP_WIFI) && + BR_INPUT_SKB_CB(skb)->proxyarp_replied) + continue; prev = maybe_deliver(prev, p, skb, __packet_hook); if (IS_ERR(prev)) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index e2aa7be3a847..052c5ebbc947 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -60,7 +60,7 @@ static int br_pass_frame_up(struct sk_buff *skb) } static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, - u16 vid) + u16 vid, struct net_bridge_port *p) { struct net_device *dev = br->dev; struct neighbour *n; @@ -68,6 +68,8 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, u8 *arpptr, *sha; __be32 sip, tip; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + if (dev->flags & IFF_NOARP) return; @@ -105,9 +107,12 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, } f = __br_fdb_get(br, n->ha, vid); - if (f) + if (f && ((p->flags & BR_PROXYARP) || + (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) { arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, sha, n->ha, sha); + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + } neigh_release(n); } @@ -153,12 +158,10 @@ int br_handle_frame_finish(struct sk_buff *skb) dst = NULL; - if (is_broadcast_ether_addr(dest)) { - if (IS_ENABLED(CONFIG_INET) && - p->flags & BR_PROXYARP && - skb->protocol == htons(ETH_P_ARP)) - br_do_proxy_arp(skb, br, vid); + if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) + br_do_proxy_arp(skb, br, vid, p); + if (is_broadcast_ether_addr(dest)) { skb2 = skb; unicast = false; } else if (is_multicast_ether_addr(dest)) { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c72083968768..8bc6b67457dc 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -143,7 +143,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || - nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP))) + nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || + nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, + !!(p->flags & BR_PROXYARP_WIFI))) return -EMSGSIZE; return 0; @@ -553,6 +555,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); + br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index de0919975a25..c32e279c62f8 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -305,6 +305,7 @@ struct br_input_skb_cb { #endif u16 frag_max_size; + bool proxyarp_replied; #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 2de5d91199e8..4905845a94e9 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -171,6 +171,7 @@ BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); +BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -215,6 +216,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_fast_leave, #endif &brport_attr_proxyarp, + &brport_attr_proxyarp_wifi, NULL }; -- cgit v1.2.3 From 1cae565e8b746f484f1ff1b71d2a1c89d7cf0668 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 5 Mar 2015 15:05:36 +0100 Subject: netfilter: nf_tables: limit maximum table name length to 32 bytes Set the same as we use for chain names, it should be enough. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 7 ++++--- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 04188b47629d..a143acafa5d9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -535,7 +535,7 @@ struct nft_table { u64 hgenerator; u32 use; u16 flags; - char name[]; + char name[NFT_TABLE_MAXNAMELEN]; }; /** diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 832bc46db78b..b9783931503b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1,6 +1,7 @@ #ifndef _LINUX_NF_TABLES_H #define _LINUX_NF_TABLES_H +#define NFT_TABLE_MAXNAMELEN 32 #define NFT_CHAIN_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 199fd0f27b0e..284b20ce566b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -401,7 +401,8 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi, } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { - [NFTA_TABLE_NAME] = { .type = NLA_STRING }, + [NFTA_TABLE_NAME] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, }; @@ -686,13 +687,13 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, if (!try_module_get(afi->owner)) return -EAFNOSUPPORT; - table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL); + table = kzalloc(sizeof(*table), GFP_KERNEL); if (table == NULL) { module_put(afi->owner); return -ENOMEM; } - nla_strlcpy(table->name, name, nla_len(name)); + nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); table->flags = flags; -- cgit v1.2.3 From d0f91938bede204a343473792529e0db7d599836 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 5 Mar 2015 10:23:49 +0100 Subject: tipc: add ip/udp media type The ip/udp bearer can be configured in a point-to-point mode by specifying both local and remote ip/hostname, or it can be enabled in multicast mode, where links are established to all tipc nodes that have joined the same multicast group. The multicast IP address is generated based on the TIPC network ID, but can be overridden by using another multicast address as remote ip. Signed-off-by: Erik Hugne Reviewed-by: Jon Maloy Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 9 + net/tipc/Kconfig | 8 + net/tipc/Makefile | 1 + net/tipc/bearer.c | 13 +- net/tipc/bearer.h | 12 +- net/tipc/msg.h | 2 +- net/tipc/udp_media.c | 442 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 479 insertions(+), 8 deletions(-) create mode 100644 net/tipc/udp_media.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 8d723824ad69..d4c8f142ba63 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -83,11 +83,20 @@ enum { TIPC_NLA_BEARER_NAME, /* string */ TIPC_NLA_BEARER_PROP, /* nest */ TIPC_NLA_BEARER_DOMAIN, /* u32 */ + TIPC_NLA_BEARER_UDP_OPTS, /* nest */ __TIPC_NLA_BEARER_MAX, TIPC_NLA_BEARER_MAX = __TIPC_NLA_BEARER_MAX - 1 }; +enum { + TIPC_NLA_UDP_UNSPEC, + TIPC_NLA_UDP_LOCAL, /* sockaddr_storage */ + TIPC_NLA_UDP_REMOTE, /* sockaddr_storage */ + + __TIPC_NLA_UDP_MAX, + TIPC_NLA_UDP_MAX = __TIPC_NLA_UDP_MAX - 1 +}; /* Socket info */ enum { TIPC_NLA_SOCK_UNSPEC, diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index 91c8a8e031db..c25a3a149dc4 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -26,3 +26,11 @@ config TIPC_MEDIA_IB help Saying Y here will enable support for running TIPC on IP-over-InfiniBand devices. +config TIPC_MEDIA_UDP + bool "IP/UDP media type support" + depends on TIPC + select NET_UDP_TUNNEL + help + Saying Y here will enable support for running TIPC over IP/UDP + bool + default y diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 599b1a540d2b..57e460be4692 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -10,5 +10,6 @@ tipc-y += addr.o bcast.o bearer.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ server.o socket.o +tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index af6deeb397a8..840db89e4283 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -47,6 +47,9 @@ static struct tipc_media * const media_info_array[] = { ð_media_info, #ifdef CONFIG_TIPC_MEDIA_IB &ib_media_info, +#endif +#ifdef CONFIG_TIPC_MEDIA_UDP + &udp_media_info, #endif NULL }; @@ -216,7 +219,8 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) * tipc_enable_bearer - enable bearer with the given name */ static int tipc_enable_bearer(struct net *net, const char *name, - u32 disc_domain, u32 priority) + u32 disc_domain, u32 priority, + struct nlattr *attr[]) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_bearer *b_ptr; @@ -304,7 +308,7 @@ restart: strcpy(b_ptr->name, name); b_ptr->media = m_ptr; - res = m_ptr->enable_media(net, b_ptr); + res = m_ptr->enable_media(net, b_ptr, attr); if (res) { pr_warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); @@ -372,7 +376,8 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, kfree_rcu(b_ptr, rcu); } -int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b) +int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, + struct nlattr *attr[]) { struct net_device *dev; char *driver_name = strchr((const char *)b->name, ':') + 1; @@ -791,7 +796,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) } rtnl_lock(); - err = tipc_enable_bearer(net, bearer, domain, prio); + err = tipc_enable_bearer(net, bearer, domain, prio, attrs); if (err) { rtnl_unlock(); return err; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 097aff08ad5b..5cad243ee8fc 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -41,7 +41,7 @@ #include #define MAX_BEARERS 2 -#define MAX_MEDIA 2 +#define MAX_MEDIA 3 #define MAX_NODES 4096 #define WSIZE 32 @@ -59,6 +59,7 @@ */ #define TIPC_MEDIA_TYPE_ETH 1 #define TIPC_MEDIA_TYPE_IB 2 +#define TIPC_MEDIA_TYPE_UDP 3 /** * struct tipc_node_map - set of node identifiers @@ -104,7 +105,8 @@ struct tipc_media { int (*send_msg)(struct net *net, struct sk_buff *buf, struct tipc_bearer *b_ptr, struct tipc_media_addr *dest); - int (*enable_media)(struct net *net, struct tipc_bearer *b_ptr); + int (*enable_media)(struct net *net, struct tipc_bearer *b_ptr, + struct nlattr *attr[]); void (*disable_media)(struct tipc_bearer *b_ptr); int (*addr2str)(struct tipc_media_addr *addr, char *strbuf, @@ -183,6 +185,9 @@ extern struct tipc_media eth_media_info; #ifdef CONFIG_TIPC_MEDIA_IB extern struct tipc_media ib_media_info; #endif +#ifdef CONFIG_TIPC_MEDIA_UDP +extern struct tipc_media udp_media_info; +#endif int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); @@ -197,7 +202,8 @@ int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); -int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b); +int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, + struct nlattr *attrs[]); void tipc_disable_l2_media(struct tipc_bearer *b); int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index c1cc8d7a5d52..fa167846d1ab 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -87,7 +87,7 @@ struct plist; * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields * are word aligned for quicker access */ -#define BUF_HEADROOM LL_MAX_HEADER +#define BUF_HEADROOM (LL_MAX_HEADER + 48) struct tipc_skb_cb { void *handle; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c new file mode 100644 index 000000000000..0d10001db40d --- /dev/null +++ b/net/tipc/udp_media.c @@ -0,0 +1,442 @@ +/* net/tipc/udp_media.c: IP bearer support for TIPC + * + * Copyright (c) 2015, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "core.h" +#include "bearer.h" + +/* IANA assigned UDP port */ +#define UDP_PORT_DEFAULT 6118 + +static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { + [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, + [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, + [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, +}; + +/** + * struct udp_media_addr - IP/UDP addressing information + * + * This is the bearer level originating address used in neighbor discovery + * messages, and all fields should be in network byte order + */ +struct udp_media_addr { + __be16 proto; + __be16 udp_port; + union { + struct in_addr ipv4; + struct in6_addr ipv6; + }; +}; + +/** + * struct udp_bearer - ip/udp bearer data structure + * @bearer: associated generic tipc bearer + * @ubsock: bearer associated socket + * @ifindex: local address scope + * @work: used to schedule deferred work on a bearer + */ +struct udp_bearer { + struct tipc_bearer __rcu *bearer; + struct socket *ubsock; + u32 ifindex; + struct work_struct work; +}; + +/* udp_media_addr_set - convert a ip/udp address to a TIPC media address */ +static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, + struct udp_media_addr *ua) +{ + memset(addr, 0, sizeof(struct tipc_media_addr)); + addr->media_id = TIPC_MEDIA_TYPE_UDP; + memcpy(addr->value, ua, sizeof(struct udp_media_addr)); + if (ntohs(ua->proto) == ETH_P_IP) { + if (ipv4_is_multicast(ua->ipv4.s_addr)) + addr->broadcast = 1; + } else if (ntohs(ua->proto) == ETH_P_IPV6) { + if (ipv6_addr_type(&ua->ipv6) & IPV6_ADDR_MULTICAST) + addr->broadcast = 1; + } else { + pr_err("Invalid UDP media address\n"); + } +} + +/* tipc_udp_addr2str - convert ip/udp address to string */ +static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size) +{ + struct udp_media_addr *ua = (struct udp_media_addr *)&a->value; + + if (ntohs(ua->proto) == ETH_P_IP) + snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->udp_port)); + else if (ntohs(ua->proto) == ETH_P_IPV6) + snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->udp_port)); + else + pr_err("Invalid UDP media address\n"); + return 0; +} + +/* tipc_udp_msg2addr - extract an ip/udp address from a TIPC ndisc message */ +static int tipc_udp_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *a, + char *msg) +{ + struct udp_media_addr *ua; + + ua = (struct udp_media_addr *) (msg + TIPC_MEDIA_ADDR_OFFSET); + if (msg[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_UDP) + return -EINVAL; + tipc_udp_media_addr_set(a, ua); + return 0; +} + +/* tipc_udp_addr2msg - write an ip/udp address to a TIPC ndisc message */ +static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a) +{ + memset(msg, 0, TIPC_MEDIA_INFO_SIZE); + msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_UDP; + memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, a->value, + sizeof(struct udp_media_addr)); + return 0; +} + +/* tipc_send_msg - enqueue a send request */ +static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dest) +{ + int ttl, err = 0; + struct udp_bearer *ub; + struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value; + struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; + struct sk_buff *clone; + struct rtable *rt; + + clone = skb_clone(skb, GFP_ATOMIC); + skb_set_inner_protocol(clone, htons(ETH_P_TIPC)); + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + err = -ENODEV; + goto tx_error; + } + if (htons(dst->proto) == ETH_P_IP) { + struct flowi4 fl = { + .daddr = dst->ipv4.s_addr, + .saddr = src->ipv4.s_addr, + .flowi4_mark = clone->mark, + .flowi4_proto = IPPROTO_UDP + }; + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto tx_error; + } + ttl = ip4_dst_hoplimit(&rt->dst); + err = udp_tunnel_xmit_skb(rt, clone, src->ipv4.s_addr, + dst->ipv4.s_addr, 0, ttl, 0, + src->udp_port, dst->udp_port, + false, true); + if (err < 0) { + ip_rt_put(rt); + goto tx_error; + } +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct dst_entry *ndst; + struct flowi6 fl6 = { + .flowi6_oif = ub->ifindex, + .daddr = dst->ipv6, + .saddr = src->ipv6, + .flowi6_proto = IPPROTO_UDP + }; + err = ip6_dst_lookup(ub->ubsock->sk, &ndst, &fl6); + if (err) + goto tx_error; + ttl = ip6_dst_hoplimit(ndst); + err = udp_tunnel6_xmit_skb(ndst, clone, ndst->dev, &src->ipv6, + &dst->ipv6, 0, ttl, src->udp_port, + dst->udp_port, false); +#endif + } + return err; + +tx_error: + kfree_skb(clone); + return err; +} + +/* tipc_udp_recv - read data from bearer socket */ +static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct udp_bearer *ub; + struct tipc_bearer *b; + + ub = rcu_dereference_sk_user_data(sk); + if (!ub) { + pr_err_ratelimited("Failed to get UDP bearer reference"); + kfree_skb(skb); + return 0; + } + + skb_pull(skb, sizeof(struct udphdr)); + rcu_read_lock(); + b = rcu_dereference_rtnl(ub->bearer); + + if (b) { + tipc_rcv(sock_net(sk), skb, b); + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + kfree_skb(skb); + return 0; +} + +static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) +{ + int err = 0; + struct ip_mreqn mreqn; + struct sock *sk = ub->ubsock->sk; + + if (ntohs(remote->proto) == ETH_P_IP) { + if (!ipv4_is_multicast(remote->ipv4.s_addr)) + return 0; + mreqn.imr_multiaddr = remote->ipv4; + mreqn.imr_ifindex = ub->ifindex; + err = __ip_mc_join_group(sk, &mreqn); + } else { + if (!ipv6_addr_is_multicast(&remote->ipv6)) + return 0; + err = __ipv6_sock_mc_join(sk, ub->ifindex, &remote->ipv6); + } + return err; +} + +/** + * parse_options - build local/remote addresses from configuration + * @attrs: netlink config data + * @ub: UDP bearer instance + * @local: local bearer IP address/port + * @remote: peer or multicast IP/port + */ +static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, + struct udp_media_addr *local, + struct udp_media_addr *remote) +{ + struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; + struct sockaddr_storage *sa_local, *sa_remote; + + if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) + goto err; + if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, + attrs[TIPC_NLA_BEARER_UDP_OPTS], + tipc_nl_udp_policy)) + goto err; + if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) { + sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]); + sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]); + } else { +err: + pr_err("Invalid UDP bearer configuration"); + return -EINVAL; + } + if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) { + struct sockaddr_in *ip4; + + ip4 = (struct sockaddr_in *)sa_local; + local->proto = htons(ETH_P_IP); + local->udp_port = ip4->sin_port; + local->ipv4.s_addr = ip4->sin_addr.s_addr; + + ip4 = (struct sockaddr_in *)sa_remote; + remote->proto = htons(ETH_P_IP); + remote->udp_port = ip4->sin_port; + remote->ipv4.s_addr = ip4->sin_addr.s_addr; + return 0; + +#if IS_ENABLED(CONFIG_IPV6) + } else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) { + struct sockaddr_in6 *ip6; + + ip6 = (struct sockaddr_in6 *)sa_local; + local->proto = htons(ETH_P_IPV6); + local->udp_port = ip6->sin6_port; + local->ipv6 = ip6->sin6_addr; + ub->ifindex = ip6->sin6_scope_id; + + ip6 = (struct sockaddr_in6 *)sa_remote; + remote->proto = htons(ETH_P_IPV6); + remote->udp_port = ip6->sin6_port; + remote->ipv6 = ip6->sin6_addr; + return 0; +#endif + } + return -EADDRNOTAVAIL; +} + +/** + * tipc_udp_enable - callback to create a new udp bearer instance + * @net: network namespace + * @b: pointer to generic tipc_bearer + * @attrs: netlink bearer configuration + * + * validate the bearer parameters and initialize the udp bearer + * rtnl_lock should be held + */ +static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, + struct nlattr *attrs[]) +{ + int err = -EINVAL; + struct udp_bearer *ub; + struct udp_media_addr *remote; + struct udp_media_addr local = {0}; + struct udp_port_cfg udp_conf = {0}; + struct udp_tunnel_sock_cfg tuncfg = {0}; + + ub = kzalloc(sizeof(*ub), GFP_ATOMIC); + if (!ub) + return -ENOMEM; + + remote = (struct udp_media_addr *)&b->bcast_addr.value; + memset(remote, 0, sizeof(struct udp_media_addr)); + err = parse_options(attrs, ub, &local, remote); + if (err) + goto err; + + b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; + b->bcast_addr.broadcast = 1; + rcu_assign_pointer(b->media_ptr, ub); + rcu_assign_pointer(ub->bearer, b); + tipc_udp_media_addr_set(&b->addr, &local); + if (htons(local.proto) == ETH_P_IP) { + struct net_device *dev; + + dev = __ip_dev_find(net, local.ipv4.s_addr, false); + if (!dev) { + err = -ENODEV; + goto err; + } + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + udp_conf.use_udp_checksums = false; + ub->ifindex = dev->ifindex; + b->mtu = dev->mtu - sizeof(struct iphdr) + - sizeof(struct udphdr); +#if IS_ENABLED(CONFIG_IPV6) + } else if (htons(local.proto) == ETH_P_IPV6) { + udp_conf.family = AF_INET6; + udp_conf.use_udp6_tx_checksums = true; + udp_conf.use_udp6_rx_checksums = true; + udp_conf.local_ip6 = in6addr_any; + b->mtu = 1280; +#endif + } else { + err = -EAFNOSUPPORT; + goto err; + } + udp_conf.local_udp_port = local.udp_port; + err = udp_sock_create(net, &udp_conf, &ub->ubsock); + if (err) + goto err; + tuncfg.sk_user_data = ub; + tuncfg.encap_type = 1; + tuncfg.encap_rcv = tipc_udp_recv; + tuncfg.encap_destroy = NULL; + setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); + + if (enable_mcast(ub, remote)) + goto err; + return 0; +err: + kfree(ub); + return err; +} + +/* cleanup_bearer - break the socket/bearer association */ +static void cleanup_bearer(struct work_struct *work) +{ + struct udp_bearer *ub = container_of(work, struct udp_bearer, work); + + if (ub->ubsock) + udp_tunnel_sock_release(ub->ubsock); + synchronize_net(); + kfree(ub); +} + +/* tipc_udp_disable - detach bearer from socket */ +static void tipc_udp_disable(struct tipc_bearer *b) +{ + struct udp_bearer *ub; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + pr_err("UDP bearer instance not found\n"); + return; + } + if (ub->ubsock) + sock_set_flag(ub->ubsock->sk, SOCK_DEAD); + RCU_INIT_POINTER(b->media_ptr, NULL); + RCU_INIT_POINTER(ub->bearer, NULL); + + /* sock_release need to be done outside of rtnl lock */ + INIT_WORK(&ub->work, cleanup_bearer); + schedule_work(&ub->work); +} + +struct tipc_media udp_media_info = { + .send_msg = tipc_udp_send_msg, + .enable_media = tipc_udp_enable, + .disable_media = tipc_udp_disable, + .addr2str = tipc_udp_addr2str, + .addr2msg = tipc_udp_addr2msg, + .msg2addr = tipc_udp_msg2addr, + .priority = TIPC_DEF_LINK_PRI, + .tolerance = TIPC_DEF_LINK_TOL, + .window = TIPC_DEF_LINK_WIN, + .type_id = TIPC_MEDIA_TYPE_UDP, + .hwaddr_len = 0, + .name = "udp" +}; -- cgit v1.2.3 From 37ed9493699cc5dafe1b8725858ef73176fdc9d2 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Thu, 5 Mar 2015 21:21:13 -0800 Subject: rtnetlink: add RTNH_F_EXTERNAL flag for fib offload Add new RTNH_F_EXTERNAL flag to mark fib entries offloaded externally, for example to a switchdev switch device. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 06f75a407f74..c3722b024e73 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -334,6 +334,7 @@ struct rtnexthop { #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ +#define RTNH_F_EXTERNAL 8 /* Route installed externally */ /* Macros to handle hexthops */ -- cgit v1.2.3 From 05050753602626ed4c46271c689929b625f409e7 Mon Sep 17 00:00:00 2001 From: Ilan peer Date: Wed, 4 Mar 2015 00:32:06 -0500 Subject: cfg80211: Add API to change the indoor regulatory setting Previously, the indoor setting configuration assumed that as long as a station interface is connected, the indoor environment setting does not change. However, this assumption is problematic as: - It is possible that a station interface is connected to a mobile AP, e.g., softAP or a P2P GO, where it is possible that both the station and the mobile AP move out of the indoor environment making the indoor setting invalid. In such a case, user space has no way to invalidate the setting. - A station interface disconnection does not necessarily imply that the device is no longer operating in an indoor environment, e.g., it is possible that the station interface is roaming but is still stays indoor. To handle the above, extend the indoor configuration API to allow user space to indicate a change of indoor settings, and allow it to indicate weather it controls the indoor setting, such that: 1. If the user space process explicitly indicates that it is going to control the indoor setting, do not clear the indoor setting internally, unless the socket is released. The user space process should use the NL80211_ATTR_SOCKET_OWNER attribute in the command to state that it is going to control the indoor setting. 2. Reset the indoor setting when restoring the regulatory settings in case it is not owned by a user space process. Based on the above, a user space tool that continuously monitors the indoor settings, i.e., tracking power setting, location etc., can indicate environment changes to the regulatory core. It should be noted that currently user space is the only provided mechanism used to hint to the regulatory core over the indoor/outdoor environment -- while the country IEs do have an environment setting this has been completely ignored by the regulatory core by design for a while now since country IEs typically can contain bogus data. Acked-by: Luis R. Rodriguez Signed-off-by: ArikX Nemtsov Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 9 +++++++ net/wireless/nl80211.c | 19 ++++++++++++++- net/wireless/reg.c | 57 ++++++++++++++++++++++++++++++++++++++++---- net/wireless/reg.h | 15 +++++++++++- 4 files changed, 94 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 37e7f39441e5..ae16ba9cb1e3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1697,6 +1697,10 @@ enum nl80211_commands { * If set during scheduled scan start then the new scan req will be * owned by the netlink socket that created it and the scheduled scan will * be stopped when the socket is closed. + * If set during configuration of regulatory indoor operation then the + * regulatory indoor configuration would be owned by the netlink socket + * that configured the indoor setting, and the indoor operation would be + * cleared when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. @@ -1752,6 +1756,9 @@ enum nl80211_commands { * * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before a scheduled scan (or a * WoWLAN net-detect scan) is started, u32 in seconds. + + * @NL80211_ATTR_REG_INDOOR: flag attribute, if set indicates that the device + * is operating in an indoor environment. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2120,6 +2127,8 @@ enum nl80211_attrs { NL80211_ATTR_SCHED_SCAN_DELAY, + NL80211_ATTR_REG_INDOOR, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 07cef3d7653e..b02085301785 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -399,6 +399,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG }, [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, + [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -4958,7 +4959,10 @@ static int parse_reg_rule(struct nlattr *tb[], static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) { char *data = NULL; + bool is_indoor; enum nl80211_user_reg_hint_type user_reg_hint_type; + u32 owner_nlportid; + /* * You should only get this when cfg80211 hasn't yet initialized @@ -4984,7 +4988,15 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]); return regulatory_hint_user(data, user_reg_hint_type); case NL80211_USER_REG_HINT_INDOOR: - return regulatory_hint_indoor_user(); + if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + owner_nlportid = info->snd_portid; + is_indoor = !!info->attrs[NL80211_ATTR_REG_INDOOR]; + } else { + owner_nlportid = 0; + is_indoor = true; + } + + return regulatory_hint_indoor(is_indoor, owner_nlportid); default: return -EINVAL; } @@ -12810,6 +12822,11 @@ static int nl80211_netlink_notify(struct notifier_block * nb, rcu_read_unlock(); + /* + * It is possible that the user space process that is controlling the + * indoor setting disappeared, so notify the regulatory core. + */ + regulatory_netlink_notify(notify->portid); return NOTIFY_OK; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index c24c8bf3c988..4239dd408137 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -128,9 +128,12 @@ static int reg_num_devs_support_basehint; * State variable indicating if the platform on which the devices * are attached is operating in an indoor environment. The state variable * is relevant for all registered devices. - * (protected by RTNL) */ static bool reg_is_indoor; +static spinlock_t reg_indoor_lock; + +/* Used to track the userspace process controlling the indoor setting */ +static u32 reg_is_indoor_portid; static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { @@ -2288,15 +2291,50 @@ int regulatory_hint_user(const char *alpha2, return 0; } -int regulatory_hint_indoor_user(void) +int regulatory_hint_indoor(bool is_indoor, u32 portid) { + spin_lock(®_indoor_lock); + + /* It is possible that more than one user space process is trying to + * configure the indoor setting. To handle such cases, clear the indoor + * setting in case that some process does not think that the device + * is operating in an indoor environment. In addition, if a user space + * process indicates that it is controlling the indoor setting, save its + * portid, i.e., make it the owner. + */ + reg_is_indoor = is_indoor; + if (reg_is_indoor) { + if (!reg_is_indoor_portid) + reg_is_indoor_portid = portid; + } else { + reg_is_indoor_portid = 0; + } + spin_unlock(®_indoor_lock); - reg_is_indoor = true; + if (!is_indoor) + reg_check_channels(); return 0; } +void regulatory_netlink_notify(u32 portid) +{ + spin_lock(®_indoor_lock); + + if (reg_is_indoor_portid != portid) { + spin_unlock(®_indoor_lock); + return; + } + + reg_is_indoor = false; + reg_is_indoor_portid = 0; + + spin_unlock(®_indoor_lock); + + reg_check_channels(); +} + /* Driver hints */ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) { @@ -2464,7 +2502,17 @@ static void restore_regulatory_settings(bool reset_user) ASSERT_RTNL(); - reg_is_indoor = false; + /* + * Clear the indoor setting in case that it is not controlled by user + * space, as otherwise there is no guarantee that the device is still + * operating in an indoor environment. + */ + spin_lock(®_indoor_lock); + if (reg_is_indoor && !reg_is_indoor_portid) { + reg_is_indoor = false; + reg_check_channels(); + } + spin_unlock(®_indoor_lock); reset_regdomains(true, &world_regdom); restore_alpha2(alpha2, reset_user); @@ -3061,6 +3109,7 @@ int __init regulatory_init(void) spin_lock_init(®_requests_lock); spin_lock_init(®_pending_beacons_lock); + spin_lock_init(®_indoor_lock); reg_regdb_size_check(); diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 4b45d6e61d24..a2c4e16459da 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -25,7 +25,20 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy); int regulatory_hint_user(const char *alpha2, enum nl80211_user_reg_hint_type user_reg_hint_type); -int regulatory_hint_indoor_user(void); + +/** + * regulatory_hint_indoor - hint operation in indoor env. or not + * @is_indoor: if true indicates that user space thinks that the + * device is operating in an indoor environment. + * @portid: the netlink port ID on which the hint was given. + */ +int regulatory_hint_indoor(bool is_indoor, u32 portid); + +/** + * regulatory_netlink_notify - notify on released netlink socket + * @portid: the netlink socket port ID + */ +void regulatory_netlink_notify(u32 portid); void wiphy_regulatory_register(struct wiphy *wiphy); void wiphy_regulatory_deregister(struct wiphy *wiphy); -- cgit v1.2.3 From c93682477bd861744589215515a63b81fdbd8948 Mon Sep 17 00:00:00 2001 From: Shani Michaeli Date: Thu, 5 Mar 2015 20:16:11 +0200 Subject: net/dcb: Add IEEE QCN attribute As specified in 802.1Qau spec. Add this optional attribute to the DCB netlink layer. To allow for application to use the new attribute, NIC drivers should implement and register the callbacks ieee_getqcn, ieee_setqcn and ieee_getqcnstats. The QCN attribute holds a set of parameters for management, and a set of statistics to provide informative data on Congestion-Control defined by this spec. Signed-off-by: Shani Michaeli Signed-off-by: Shachar Raindel Signed-off-by: Or Gerlitz Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/dcbnl.h | 3 +++ include/uapi/linux/dcbnl.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++ net/dcb/dcbnl.c | 44 ++++++++++++++++++++++++++++--- 3 files changed, 110 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 597b88a94332..207d9ba1f92c 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -49,6 +49,9 @@ struct dcbnl_rtnl_ops { int (*ieee_setets) (struct net_device *, struct ieee_ets *); int (*ieee_getmaxrate) (struct net_device *, struct ieee_maxrate *); int (*ieee_setmaxrate) (struct net_device *, struct ieee_maxrate *); + int (*ieee_getqcn) (struct net_device *, struct ieee_qcn *); + int (*ieee_setqcn) (struct net_device *, struct ieee_qcn *); + int (*ieee_getqcnstats) (struct net_device *, struct ieee_qcn_stats *); int (*ieee_getpfc) (struct net_device *, struct ieee_pfc *); int (*ieee_setpfc) (struct net_device *, struct ieee_pfc *); int (*ieee_getapp) (struct net_device *, struct dcb_app *); diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index e711f20dc522..6497d7933d5b 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -78,6 +78,70 @@ struct ieee_maxrate { __u64 tc_maxrate[IEEE_8021QAZ_MAX_TCS]; }; +enum dcbnl_cndd_states { + DCB_CNDD_RESET = 0, + DCB_CNDD_EDGE, + DCB_CNDD_INTERIOR, + DCB_CNDD_INTERIOR_READY, +}; + +/* This structure contains the IEEE 802.1Qau QCN managed object. + * + *@rpg_enable: enable QCN RP + *@rppp_max_rps: maximum number of RPs allowed for this CNPV on this port + *@rpg_time_reset: time between rate increases if no CNMs received. + * given in u-seconds + *@rpg_byte_reset: transmitted data between rate increases if no CNMs received. + * given in Bytes + *@rpg_threshold: The number of times rpByteStage or rpTimeStage can count + * before RP rate control state machine advances states + *@rpg_max_rate: the maxinun rate, in Mbits per second, + * at which an RP can transmit + *@rpg_ai_rate: The rate, in Mbits per second, + * used to increase rpTargetRate in the RPR_ACTIVE_INCREASE + *@rpg_hai_rate: The rate, in Mbits per second, + * used to increase rpTargetRate in the RPR_HYPER_INCREASE state + *@rpg_gd: Upon CNM receive, flow rate is limited to (Fb/Gd)*CurrentRate. + * rpgGd is given as log2(Gd), where Gd may only be powers of 2 + *@rpg_min_dec_fac: The minimum factor by which the current transmit rate + * can be changed by reception of a CNM. + * value is given as percentage (1-100) + *@rpg_min_rate: The minimum value, in bits per second, for rate to limit + *@cndd_state_machine: The state of the congestion notification domain + * defense state machine, as defined by IEEE 802.3Qau + * section 32.1.1. In the interior ready state, + * the QCN capable hardware may add CN-TAG TLV to the + * outgoing traffic, to specifically identify outgoing + * flows. + */ + +struct ieee_qcn { + __u8 rpg_enable[IEEE_8021QAZ_MAX_TCS]; + __u32 rppp_max_rps[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_time_reset[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_byte_reset[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_threshold[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_max_rate[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_ai_rate[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_hai_rate[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_gd[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_min_dec_fac[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_min_rate[IEEE_8021QAZ_MAX_TCS]; + __u32 cndd_state_machine[IEEE_8021QAZ_MAX_TCS]; +}; + +/* This structure contains the IEEE 802.1Qau QCN statistics. + * + *@rppp_rp_centiseconds: the number of RP-centiseconds accumulated + * by RPs at this priority level on this Port + *@rppp_created_rps: number of active RPs(flows) that react to CNMs + */ + +struct ieee_qcn_stats { + __u64 rppp_rp_centiseconds[IEEE_8021QAZ_MAX_TCS]; + __u32 rppp_created_rps[IEEE_8021QAZ_MAX_TCS]; +}; + /* This structure contains the IEEE 802.1Qaz PFC managed object * * @pfc_cap: Indicates the number of traffic classes on the local device @@ -334,6 +398,8 @@ enum ieee_attrs { DCB_ATTR_IEEE_PEER_PFC, DCB_ATTR_IEEE_PEER_APP, DCB_ATTR_IEEE_MAXRATE, + DCB_ATTR_IEEE_QCN, + DCB_ATTR_IEEE_QCN_STATS, __DCB_ATTR_IEEE_MAX }; #define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 93ea80196f0e..5b21f6f88e97 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -177,6 +177,8 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, + [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, + [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, }; static const struct nla_policy dcbnl_ieee_app[DCB_ATTR_IEEE_APP_MAX + 1] = { @@ -1030,7 +1032,7 @@ nla_put_failure: return err; } -/* Handle IEEE 802.1Qaz GET commands. */ +/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { struct nlattr *ieee, *app; @@ -1067,6 +1069,32 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) } } + if (ops->ieee_getqcn) { + struct ieee_qcn qcn; + + memset(&qcn, 0, sizeof(qcn)); + err = ops->ieee_getqcn(netdev, &qcn); + if (!err) { + err = nla_put(skb, DCB_ATTR_IEEE_QCN, + sizeof(qcn), &qcn); + if (err) + return -EMSGSIZE; + } + } + + if (ops->ieee_getqcnstats) { + struct ieee_qcn_stats qcn_stats; + + memset(&qcn_stats, 0, sizeof(qcn_stats)); + err = ops->ieee_getqcnstats(netdev, &qcn_stats); + if (!err) { + err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS, + sizeof(qcn_stats), &qcn_stats); + if (err) + return -EMSGSIZE; + } + } + if (ops->ieee_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); @@ -1379,8 +1407,9 @@ int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, } EXPORT_SYMBOL(dcbnl_cee_notify); -/* Handle IEEE 802.1Qaz SET commands. If any requested operation can not - * be completed the entire msg is aborted and error value is returned. +/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands. + * If any requested operation can not be completed + * the entire msg is aborted and error value is returned. * No attempt is made to reconcile the case where only part of the * cmd can be completed. */ @@ -1417,6 +1446,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, goto err; } + if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) { + struct ieee_qcn *qcn = + nla_data(ieee[DCB_ATTR_IEEE_QCN]); + + err = ops->ieee_setqcn(netdev, qcn); + if (err) + goto err; + } + if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); -- cgit v1.2.3 From c78ba6d64c78634a875d1e316676667cabfea256 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Wed, 11 Mar 2015 15:39:21 +0100 Subject: ipv6: expose RFC4191 route preference via rtnetlink This makes it possible to retain the route preference when RAs are handled in userspace. Signed-off-by: Lubomir Rintel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + net/ipv6/route.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index c3722b024e73..bea910f924dd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -305,6 +305,7 @@ enum rtattr_type_t { RTA_MFC_STATS, RTA_VIA, RTA_NEWDST, + RTA_PREF, __RTA_MAX }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 06fa819c43c9..58c0e6a4d15d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2398,6 +2398,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, + [RTA_PREF] = { .type = NLA_U8 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2405,6 +2406,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; + unsigned int pref; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); @@ -2480,6 +2482,14 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); } + if (tb[RTA_PREF]) { + pref = nla_get_u8(tb[RTA_PREF]); + if (pref != ICMPV6_ROUTER_PREF_LOW && + pref != ICMPV6_ROUTER_PREF_HIGH) + pref = ICMPV6_ROUTER_PREF_MEDIUM; + cfg->fc_flags |= RTF_PREF(pref); + } + err = 0; errout: return err; @@ -2583,7 +2593,8 @@ static inline size_t rt6_nlmsg_size(void) + nla_total_size(4) /* RTA_PRIORITY */ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) - + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ + + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + + nla_total_size(1); /* RTA_PREF */ } static int rt6_fill_node(struct net *net, @@ -2724,6 +2735,9 @@ static int rt6_fill_node(struct net *net, if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; + if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 03e69b508b6f7c51743055c9f61d1dfeadf4b635 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:16 +0100 Subject: ebpf: add prandom helper for packet sampling This work is similar to commit 4cd3675ebf74 ("filter: added BPF random opcode") and adds a possibility for packet sampling in eBPF. Currently, this is only possible in classic BPF and useful to combine sampling with f.e. packet sockets, possible also with tc. Example function proto-type looks like: u32 (*prandom_u32)(void) = (void *)BPF_FUNC_get_prandom_u32; Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/core.c | 2 ++ kernel/bpf/helpers.c | 12 ++++++++++++ net/core/filter.c | 2 ++ 5 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80f2e0fc3d02..50bf95e29a96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -154,4 +154,6 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_get_prandom_u32_proto; + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3fa1af8a58d7..1c2ca2b477c8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 50603aec766a..c1dbbb5d289b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -661,6 +661,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a3c7701a8b5e..95eb59a045ea 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,7 @@ */ #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -87,3 +88,14 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; + +static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +const struct bpf_func_proto bpf_get_prandom_u32_proto = { + .func = bpf_get_prandom_u32, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 7a4eb7030dba..4344db39af2e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1139,6 +1139,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; default: return NULL; } -- cgit v1.2.3 From c04167ce2ca0ecaeaafef006cb0d65cf01b68e42 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:17 +0100 Subject: ebpf: add helper for obtaining current processor id This patch adds the possibility to obtain raw_smp_processor_id() in eBPF. Currently, this is only possible in classic BPF where commit da2033c28226 ("filter: add SKF_AD_RXHASH and SKF_AD_CPU") has added facilities for this. Perhaps most importantly, this would also allow us to track per CPU statistics with eBPF maps, or to implement a poor-man's per CPU data structure through eBPF maps. Example function proto-type looks like: u32 (*smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ net/core/filter.c | 2 ++ 5 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50bf95e29a96..30bfd331882a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,5 +155,6 @@ extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; +extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1c2ca2b477c8..de1f63668daf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ + BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1dbbb5d289b..4139a0f8b558 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -662,6 +662,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; +const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 95eb59a045ea..bd7f5988ed9c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -99,3 +100,14 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +const struct bpf_func_proto bpf_get_smp_processor_id_proto = { + .func = bpf_get_smp_processor_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 4344db39af2e..33310eee6134 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1141,6 +1141,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return NULL; } -- cgit v1.2.3 From 9bac3d6d548e5cc925570b263f35b70a00a00ffd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 13 Mar 2015 11:57:42 -0700 Subject: bpf: allow extended BPF programs access skb fields introduce user accessible mirror of in-kernel 'struct sk_buff': struct __sk_buff { __u32 len; __u32 pkt_type; __u32 mark; __u32 queue_mapping; }; bpf programs can do: int bpf_prog(struct __sk_buff *skb) { __u32 var = skb->pkt_type; which will be compiled to bpf assembler as: dst_reg = *(u32 *)(src_reg + 4) // 4 == offsetof(struct __sk_buff, pkt_type) bpf verifier will check validity of access and will convert it to: dst_reg = *(u8 *)(src_reg + offsetof(struct sk_buff, __pkt_type_offset)) dst_reg &= 7 since skb->pkt_type is a bitfield. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 5 +- include/uapi/linux/bpf.h | 10 ++++ kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 152 ++++++++++++++++++++++++++++++++++++++++++----- net/core/filter.c | 100 +++++++++++++++++++++++++------ 5 files changed, 234 insertions(+), 35 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 30bfd331882a..280a315de8d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -103,6 +103,9 @@ struct bpf_verifier_ops { * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type); + + u32 (*convert_ctx_access)(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn); }; struct bpf_prog_type_list { @@ -133,7 +136,7 @@ struct bpf_map *bpf_map_get(struct fd f); void bpf_map_put(struct bpf_map *map); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index de1f63668daf..929545a27546 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,4 +170,14 @@ enum bpf_func_id { __BPF_FUNC_MAX_ID, }; +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 669719ccc9ee..ea75c654af1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -519,7 +519,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(prog, attr); + err = bpf_check(&prog, attr); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6b522496250..c22ebd36fa4b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1620,11 +1620,10 @@ static int do_check(struct verifier_env *env) return err; } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_LDX uses reserved fields\n"); - return -EINVAL; - } + enum bpf_reg_type src_reg_type; + + /* check for reserved fields is already done */ + /* check src operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1643,6 +1642,29 @@ static int do_check(struct verifier_env *env) if (err) return err; + src_reg_type = regs[insn->src_reg].type; + + if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { + /* saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * use reserved 'imm' field to mark this insn + */ + insn->imm = src_reg_type; + + } else if (src_reg_type != insn->imm && + (src_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + /* ABuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. + */ + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -1790,6 +1812,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) int i, j; for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0)) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1881,6 +1910,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } +static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +{ + struct bpf_insn *insn = prog->insnsi; + int insn_cnt = prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) != BPF_JMP || + BPF_OP(insn->code) == BPF_CALL || + BPF_OP(insn->code) == BPF_EXIT) + continue; + + /* adjust offset of jmps if necessary */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos && i + insn->off + 1 < pos) + insn->off -= delta; + } +} + +/* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +static int convert_ctx_accesses(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + u32 cnt; + int i; + + if (!env->prog->aux->ops->convert_ctx_access) + return 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + continue; + + if (insn->imm != PTR_TO_CTX) { + /* clear internal mark */ + insn->imm = 0; + continue; + } + + cnt = env->prog->aux->ops-> + convert_ctx_access(insn->dst_reg, insn->src_reg, + insn->off, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (cnt == 1) { + memcpy(insn, insn_buf, sizeof(*insn)); + continue; + } + + /* several new insns need to be inserted. Make room for them */ + insn_cnt += cnt - 1; + new_prog = bpf_prog_realloc(env->prog, + bpf_prog_size(insn_cnt), + GFP_USER); + if (!new_prog) + return -ENOMEM; + + new_prog->len = insn_cnt; + + memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, + sizeof(*insn) * (insn_cnt - i - cnt)); + + /* copy substitute insns in place of load instruction */ + memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + + /* adjust branches in the whole program */ + adjust_branches(new_prog, i, cnt - 1); + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + cnt - 1; + i += cnt - 1; + } + + return 0; +} + static void free_states(struct verifier_env *env) { struct verifier_state_list *sl, *sln; @@ -1903,13 +2018,13 @@ static void free_states(struct verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; struct verifier_env *env; int ret = -EINVAL; - if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; /* 'struct verifier_env' can be global, but since it's not small, @@ -1919,7 +2034,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; - env->prog = prog; + env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -1951,7 +2066,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - env->explored_states = kcalloc(prog->len, + env->explored_states = kcalloc(env->prog->len, sizeof(struct verifier_state_list *), GFP_USER); ret = -ENOMEM; @@ -1968,6 +2083,10 @@ skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -1983,18 +2102,18 @@ skip_full_check: if (ret == 0 && env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ - prog->aux->used_maps = kmalloc_array(env->used_map_cnt, - sizeof(env->used_maps[0]), - GFP_KERNEL); + env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); - if (!prog->aux->used_maps) { + if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto free_log_buf; } - memcpy(prog->aux->used_maps, env->used_maps, + memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); - prog->aux->used_map_cnt = env->used_map_cnt; + env->prog->aux->used_map_cnt = env->used_map_cnt; /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions @@ -2006,11 +2125,12 @@ free_log_buf: if (log_level) vfree(log_buf); free_env: - if (!prog->aux->used_maps) + if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); + *prog = env->prog; kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; diff --git a/net/core/filter.c b/net/core/filter.c index 33310eee6134..4e9dd0ad0d5b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -150,10 +150,43 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return prandom_u32(); } +static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (skb_field) { + case SKF_AD_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_PKTTYPE: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); +#endif + break; + + case SKF_AD_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, queue_mapping)); + break; + } + + return insn - insn_buf; +} + static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; + u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: @@ -167,13 +200,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - PKT_TYPE_OFFSET()); - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); -#ifdef __BIG_ENDIAN_BITFIELD - insn++; - *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5); -#endif + cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: @@ -197,10 +225,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_MARK: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - - *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, mark)); + cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: @@ -211,10 +237,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_QUEUE: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - - *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, queue_mapping)); + cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: @@ -1151,13 +1175,55 @@ sk_filter_func_proto(enum bpf_func_id func_id) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { - /* skb fields cannot be accessed yet */ - return false; + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* check bounds */ + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + /* all __sk_buff fields are __u32 */ + if (size != 4) + return false; + + return true; +} + +static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, len): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, len)); + break; + + case offsetof(struct __sk_buff, mark): + return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, pkt_type): + return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, queue_mapping): + return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + } + + return insn - insn_buf; } static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { -- cgit v1.2.3 From c24973957975403521ca76a776c2dfd12fbe9add Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 16 Mar 2015 18:06:02 -0700 Subject: bpf: allow BPF programs access 'protocol' and 'vlan_tci' fields as a follow on to patch 70006af95515 ("bpf: allow eBPF access skb fields") this patch allows 'protocol' and 'vlan_tci' fields to be accessible from extended BPF programs. The usage of 'protocol', 'vlan_present' and 'vlan_tci' fields is the same as corresponding SKF_AD_PROTOCOL, SKF_AD_VLAN_TAG_PRESENT and SKF_AD_VLAN_TAG accesses in classic BPF. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 ++ net/core/filter.c | 72 +++++++++++++++++++++++++++++++-------------- samples/bpf/test_verifier.c | 9 ++++++ 3 files changed, 62 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 929545a27546..1623047af463 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -178,6 +178,9 @@ struct __sk_buff { __u32 pkt_type; __u32 mark; __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index 4e9dd0ad0d5b..b95ae7fe7e4f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -177,6 +177,35 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, queue_mapping)); break; + + case SKF_AD_PROTOCOL: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + + /* dst_reg = *(u16 *) (src_reg + offsetof(protocol)) */ + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, protocol)); + /* dst_reg = ntohs(dst_reg) [emitting a nop or swap16] */ + *insn++ = BPF_ENDIAN(BPF_FROM_BE, dst_reg, 16); + break; + + case SKF_AD_VLAN_TAG: + case SKF_AD_VLAN_TAG_PRESENT: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_tci)); + if (skb_field == SKF_AD_VLAN_TAG) { + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, + ~VLAN_TAG_PRESENT); + } else { + /* dst_reg >>= 12 */ + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); + /* dst_reg &= 1 */ + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); + } + break; } return insn - insn_buf; @@ -190,13 +219,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - - /* A = *(u16 *) (CTX + offsetof(protocol)) */ - *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, protocol)); - /* A = ntohs(A) [emitting a nop or swap16] */ - *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + cnt = convert_skb_access(SKF_AD_PROTOCOL, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_PKTTYPE: @@ -242,22 +266,15 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: - case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + cnt = convert_skb_access(SKF_AD_VLAN_TAG, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; - /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */ - *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, vlan_tci)); - if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, - ~VLAN_TAG_PRESENT); - } else { - /* A >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); - /* A &= 1 */ - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1); - } + case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: + cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: @@ -1215,6 +1232,17 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, case offsetof(struct __sk_buff, queue_mapping): return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, protocol): + return convert_skb_access(SKF_AD_PROTOCOL, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, vlan_present): + return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, vlan_tci): + return convert_skb_access(SKF_AD_VLAN_TAG, + dst_reg, src_reg, insn); } return insn - insn_buf; diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index df6dbb6576f6..75d561f9fd6a 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -658,6 +658,15 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, queue_mapping)), BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, protocol)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, vlan_present)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, vlan_tci)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), BPF_EXIT_INSN(), }, .result = ACCEPT, -- cgit v1.2.3 From db24a9044ee191c397dcd1c6574f56d67d7c8df5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Mar 2015 20:23:15 -0600 Subject: net: add support for phys_port_name Similar to port id allow netdevices to specify port names and export the name via sysfs. Drivers can implement the netdevice operation to assist udev in having sane default names for the devices using the rule: $ cat /etc/udev/rules.d/80-net-setup-link.rules SUBSYSTEM=="net", ACTION=="add", ATTR{phys_port_name}!="", NAME="$attr{phys_port_name}" Use of phys_name versus phys_id was suggested-by Jiri Pirko. Signed-off-by: David Ahern Acked-by: Jiri Pirko Acked-by: Scott Feldman Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 8 ++++++++ include/linux/netdevice.h | 4 ++++ include/uapi/linux/if_link.h | 1 + net/core/dev.c | 18 ++++++++++++++++++ net/core/net-sysfs.c | 23 +++++++++++++++++++++++ net/core/rtnetlink.c | 21 +++++++++++++++++++++ 6 files changed, 75 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index beb8ec4dabbc..5ecfd72ba684 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -188,6 +188,14 @@ Description: Indicates the interface unique physical port identifier within the NIC, as a string. +What: /sys/class/net//phys_port_name +Date: March 2015 +KernelVersion: 4.0 +Contact: netdev@vger.kernel.org +Description: + Indicates the interface physical port name within the NIC, + as a string. + What: /sys/class/net//speed Date: October 2009 KernelVersion: 2.6.33 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 76c5de4978a8..ec8f9b5f6500 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1164,6 +1164,8 @@ struct net_device_ops { bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); + int (*ndo_get_phys_port_name)(struct net_device *dev, + char *name, size_t len); void (*ndo_add_vxlan_port)(struct net_device *dev, sa_family_t sa_family, __be16 port); @@ -2947,6 +2949,8 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 756436e1ce89..7158fd00a109 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -147,6 +147,7 @@ enum { IFLA_CARRIER_CHANGES, IFLA_PHYS_SWITCH_ID, IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 39fe369b46ad..a1f24151db5b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5911,6 +5911,24 @@ int dev_get_phys_port_id(struct net_device *dev, } EXPORT_SYMBOL(dev_get_phys_port_id); +/** + * dev_get_phys_port_name - Get device physical port name + * @dev: device + * @name: port name + * + * Get device physical port name + */ +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_name) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_name(dev, name, len); +} +EXPORT_SYMBOL(dev_get_phys_port_name); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7e58bd7ec232..cc5cf689809c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -418,6 +418,28 @@ static ssize_t phys_port_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_port_id); +static ssize_t phys_port_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + char name[IFNAMSIZ]; + + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sprintf(buf, "%s\n", name); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_name); + static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -465,6 +487,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, NULL, }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 25b4b5d23485..6abe634c666c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -982,6 +982,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) +{ + char name[IFNAMSIZ]; + int err; + + err = dev_get_phys_port_name(dev, name, sizeof(name)); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; @@ -1072,6 +1090,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; + if (rtnl_phys_port_name_fill(skb, dev)) + goto nla_put_failure; + if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; -- cgit v1.2.3 From af615762e972be0c66cf1d156ca4fac13b93c0b0 Mon Sep 17 00:00:00 2001 From: Jörg Thalheim Date: Wed, 18 Mar 2015 10:06:58 +0100 Subject: bridge: add ageing_time, stp_state, priority over netlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jörg Thalheim Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 3 +++ net/bridge/br_netlink.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7158fd00a109..f5f5edd5ae5f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -225,6 +225,9 @@ enum { IFLA_BR_FORWARD_DELAY, IFLA_BR_HELLO_TIME, IFLA_BR_MAX_AGE, + IFLA_BR_AGEING_TIME, + IFLA_BR_STP_STATE, + IFLA_BR_PRIORITY, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8bc6b67457dc..e1115a224a95 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -733,6 +733,9 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 }, [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 }, [IFLA_BR_MAX_AGE] = { .type = NLA_U32 }, + [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, + [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, + [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -762,6 +765,24 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } + if (data[IFLA_BR_AGEING_TIME]) { + u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]); + + br->ageing_time = clock_t_to_jiffies(ageing_time); + } + + if (data[IFLA_BR_STP_STATE]) { + u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); + + br_stp_set_enabled(br, stp_enabled); + } + + if (data[IFLA_BR_PRIORITY]) { + u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]); + + br_stp_set_bridge_priority(br, priority); + } + return 0; } @@ -770,6 +791,9 @@ static size_t br_get_size(const struct net_device *brdev) return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ 0; } @@ -779,10 +803,16 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); + u32 ageing_time = jiffies_to_clock_t(br->ageing_time); + u32 stp_enabled = br->stp_enabled; + u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || - nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time)) + nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || + nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || + nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || + nla_put_u16(skb, IFLA_BR_PRIORITY, priority)) return -EMSGSIZE; return 0; -- cgit v1.2.3 From 94caee8c312d96522bcdae88791aaa9ebcd5f22c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 20 Mar 2015 15:11:11 +0100 Subject: ebpf: add sched_act_type and map it to sk_filter's verifier ops In order to prepare eBPF support for tc action, we need to add sched_act_type, so that the eBPF verifier is aware of what helper function act_bpf may use, that it can load skb data and read out currently available skb fields. This is bascially analogous to 96be4325f443 ("ebpf: add sched_cls_type and map it to sk_filter's verifier ops"). BPF_PROG_TYPE_SCHED_CLS and BPF_PROG_TYPE_SCHED_ACT need to be separate since both will have a different set of functionality in future (classifier vs action), thus we won't run into ABI troubles when the point in time comes to diverge functionality from the classifier. The future plan for act_bpf would be that it will be able to write into skb->data and alter selected fields mirrored in struct __sk_buff. For an initial support, it's sufficient to map it to sk_filter_ops. Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Reviewed-by: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 1 + net/core/filter.c | 6 ++++++ 3 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1623047af463..3dd314a45d0d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c22ebd36fa4b..0e714f799ec0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1180,6 +1180,7 @@ static bool may_access_skb(enum bpf_prog_type type) switch (type) { case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: return true; default: return false; diff --git a/net/core/filter.c b/net/core/filter.c index bdaac5895def..084eacc4d1d4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1263,10 +1263,16 @@ static struct bpf_prog_type_list sched_cls_type __read_mostly = { .type = BPF_PROG_TYPE_SCHED_CLS, }; +static struct bpf_prog_type_list sched_act_type __read_mostly = { + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SCHED_ACT, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); + bpf_register_prog_type(&sched_act_type); return 0; } -- cgit v1.2.3 From a8cb5f556b567974d75ea29c15181c445c541b1f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 20 Mar 2015 15:11:12 +0100 Subject: act_bpf: add initial eBPF support for actions This work extends the "classic" BPF programmable tc action by extending its scope also to native eBPF code! Together with commit e2e9b6541dd4 ("cls_bpf: add initial eBPF support for programmable classifiers") this adds the facility to implement fully flexible classifier and actions for tc that can be implemented in a C subset in user space, "safely" loaded into the kernel, and being run in native speed when JITed. Also, since eBPF maps can be shared between eBPF programs, it offers the possibility that cls_bpf and act_bpf can share data 1) between themselves and 2) between user space applications. That means that, f.e. customized runtime statistics can be collected in user space, but also more importantly classifier and action behaviour could be altered based on map input from the user space application. For the remaining details on the workflow and integration, see the cls_bpf commit e2e9b6541dd4. Preliminary iproute2 part can be found under [1]. [1] http://git.breakpoint.cc/cgit/dborkman/iproute2.git/log/?h=ebpf-act Signed-off-by: Daniel Borkmann Cc: Jamal Hadi Salim Cc: Jiri Pirko Acked-by: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/tc_act/tc_bpf.h | 6 +- include/uapi/linux/tc_act/tc_bpf.h | 2 + net/sched/act_bpf.c | 295 ++++++++++++++++++++++++++----------- 3 files changed, 220 insertions(+), 83 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tc_act/tc_bpf.h b/include/net/tc_act/tc_bpf.h index 86a070ffc930..a152e9858b2c 100644 --- a/include/net/tc_act/tc_bpf.h +++ b/include/net/tc_act/tc_bpf.h @@ -16,8 +16,12 @@ struct tcf_bpf { struct tcf_common common; struct bpf_prog *filter; + union { + u32 bpf_fd; + u16 bpf_num_ops; + }; struct sock_filter *bpf_ops; - u16 bpf_num_ops; + const char *bpf_name; }; #define to_bpf(a) \ container_of(a->priv, struct tcf_bpf, common) diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 5288bd77e63b..07f17cc70bb3 100644 --- a/include/uapi/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h @@ -24,6 +24,8 @@ enum { TCA_ACT_BPF_PARMS, TCA_ACT_BPF_OPS_LEN, TCA_ACT_BPF_OPS, + TCA_ACT_BPF_FD, + TCA_ACT_BPF_NAME, __TCA_ACT_BPF_MAX, }; #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 5f6288fa3f12..4d2cede17468 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -13,26 +13,40 @@ #include #include #include +#include + #include #include #include #include -#define BPF_TAB_MASK 15 +#define BPF_TAB_MASK 15 +#define ACT_BPF_NAME_LEN 256 + +struct tcf_bpf_cfg { + struct bpf_prog *filter; + struct sock_filter *bpf_ops; + char *bpf_name; + u32 bpf_fd; + u16 bpf_num_ops; +}; -static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a, +static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { - struct tcf_bpf *b = a->priv; + struct tcf_bpf *prog = act->priv; int action, filter_res; - spin_lock(&b->tcf_lock); + spin_lock(&prog->tcf_lock); - b->tcf_tm.lastuse = jiffies; - bstats_update(&b->tcf_bstats, skb); + prog->tcf_tm.lastuse = jiffies; + bstats_update(&prog->tcf_bstats, skb); - filter_res = BPF_PROG_RUN(b->filter, skb); + /* Needed here for accessing maps. */ + rcu_read_lock(); + filter_res = BPF_PROG_RUN(prog->filter, skb); + rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. * Similarly as in cls_bpf, if filter_res == -1 we use the @@ -52,52 +66,87 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a, break; case TC_ACT_SHOT: action = filter_res; - b->tcf_qstats.drops++; + prog->tcf_qstats.drops++; break; case TC_ACT_UNSPEC: - action = b->tcf_action; + action = prog->tcf_action; break; default: action = TC_ACT_UNSPEC; break; } - spin_unlock(&b->tcf_lock); + spin_unlock(&prog->tcf_lock); return action; } -static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *a, +static bool tcf_bpf_is_ebpf(const struct tcf_bpf *prog) +{ + return !prog->bpf_ops; +} + +static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, + struct sk_buff *skb) +{ + struct nlattr *nla; + + if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, prog->bpf_num_ops)) + return -EMSGSIZE; + + nla = nla_reserve(skb, TCA_ACT_BPF_OPS, prog->bpf_num_ops * + sizeof(struct sock_filter)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + return 0; +} + +static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd)) + return -EMSGSIZE; + + if (prog->bpf_name && + nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) + return -EMSGSIZE; + + return 0; +} + +static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { unsigned char *tp = skb_tail_pointer(skb); - struct tcf_bpf *b = a->priv; + struct tcf_bpf *prog = act->priv; struct tc_act_bpf opt = { - .index = b->tcf_index, - .refcnt = b->tcf_refcnt - ref, - .bindcnt = b->tcf_bindcnt - bind, - .action = b->tcf_action, + .index = prog->tcf_index, + .refcnt = prog->tcf_refcnt - ref, + .bindcnt = prog->tcf_bindcnt - bind, + .action = prog->tcf_action, }; - struct tcf_t t; - struct nlattr *nla; + struct tcf_t tm; + int ret; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, b->bpf_num_ops)) - goto nla_put_failure; - - nla = nla_reserve(skb, TCA_ACT_BPF_OPS, b->bpf_num_ops * - sizeof(struct sock_filter)); - if (!nla) + if (tcf_bpf_is_ebpf(prog)) + ret = tcf_bpf_dump_ebpf_info(prog, skb); + else + ret = tcf_bpf_dump_bpf_info(prog, skb); + if (ret) goto nla_put_failure; - memcpy(nla_data(nla), b->bpf_ops, nla_len(nla)); + tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install); + tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); + tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); - t.install = jiffies_to_clock_t(jiffies - b->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - b->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(b->tcf_tm.expires); - if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(t), &t)) + if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm)) goto nla_put_failure; + return skb->len; nla_put_failure: @@ -107,36 +156,21 @@ nla_put_failure: static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = { [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) }, + [TCA_ACT_BPF_FD] = { .type = NLA_U32 }, + [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN }, [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; -static int tcf_bpf_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, - int ovr, int bind) +static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg) { - struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; - struct tc_act_bpf *parm; - struct tcf_bpf *b; - u16 bpf_size, bpf_num_ops; struct sock_filter *bpf_ops; - struct sock_fprog_kern tmp; + struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; + u16 bpf_size, bpf_num_ops; int ret; - if (!nla) - return -EINVAL; - - ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy); - if (ret < 0) - return ret; - - if (!tb[TCA_ACT_BPF_PARMS] || - !tb[TCA_ACT_BPF_OPS_LEN] || !tb[TCA_ACT_BPF_OPS]) - return -EINVAL; - parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]); if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) return -EINVAL; @@ -146,68 +180,165 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return -EINVAL; bpf_ops = kzalloc(bpf_size, GFP_KERNEL); - if (!bpf_ops) + if (bpf_ops == NULL) return -ENOMEM; memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size); - tmp.len = bpf_num_ops; - tmp.filter = bpf_ops; + fprog_tmp.len = bpf_num_ops; + fprog_tmp.filter = bpf_ops; - ret = bpf_prog_create(&fp, &tmp); - if (ret) - goto free_bpf_ops; + ret = bpf_prog_create(&fp, &fprog_tmp); + if (ret < 0) { + kfree(bpf_ops); + return ret; + } - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*b), bind); - if (ret) + cfg->bpf_ops = bpf_ops; + cfg->bpf_num_ops = bpf_num_ops; + cfg->filter = fp; + + return 0; +} + +static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) +{ + struct bpf_prog *fp; + char *name = NULL; + u32 bpf_fd; + + bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); + + fp = bpf_prog_get(bpf_fd); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + if (fp->type != BPF_PROG_TYPE_SCHED_ACT) { + bpf_prog_put(fp); + return -EINVAL; + } + + if (tb[TCA_ACT_BPF_NAME]) { + name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]), + nla_len(tb[TCA_ACT_BPF_NAME]), + GFP_KERNEL); + if (!name) { + bpf_prog_put(fp); + return -ENOMEM; + } + } + + cfg->bpf_fd = bpf_fd; + cfg->bpf_name = name; + cfg->filter = fp; + + return 0; +} + +static int tcf_bpf_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *act, + int replace, int bind) +{ + struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; + struct tc_act_bpf *parm; + struct tcf_bpf *prog; + struct tcf_bpf_cfg cfg; + bool is_bpf, is_ebpf; + int ret; + + if (!nla) + return -EINVAL; + + ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy); + if (ret < 0) + return ret; + + is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; + is_ebpf = tb[TCA_ACT_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || + !tb[TCA_ACT_BPF_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_ACT_BPF_PARMS]); + + memset(&cfg, 0, sizeof(cfg)); + + ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : + tcf_bpf_init_from_efd(tb, &cfg); + if (ret < 0) + return ret; + + if (!tcf_hash_check(parm->index, act, bind)) { + ret = tcf_hash_create(parm->index, est, act, + sizeof(*prog), bind); + if (ret < 0) goto destroy_fp; ret = ACT_P_CREATED; } else { + /* Don't override defaults. */ if (bind) goto destroy_fp; - tcf_hash_release(a, bind); - if (!ovr) { + + tcf_hash_release(act, bind); + if (!replace) { ret = -EEXIST; goto destroy_fp; } } - b = to_bpf(a); - spin_lock_bh(&b->tcf_lock); - b->tcf_action = parm->action; - b->bpf_num_ops = bpf_num_ops; - b->bpf_ops = bpf_ops; - b->filter = fp; - spin_unlock_bh(&b->tcf_lock); + prog = to_bpf(act); + spin_lock_bh(&prog->tcf_lock); + + prog->bpf_ops = cfg.bpf_ops; + prog->bpf_name = cfg.bpf_name; + + if (cfg.bpf_num_ops) + prog->bpf_num_ops = cfg.bpf_num_ops; + if (cfg.bpf_fd) + prog->bpf_fd = cfg.bpf_fd; + + prog->tcf_action = parm->action; + prog->filter = cfg.filter; + + spin_unlock_bh(&prog->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(act); + return ret; destroy_fp: - bpf_prog_destroy(fp); -free_bpf_ops: - kfree(bpf_ops); + if (is_ebpf) + bpf_prog_put(cfg.filter); + else + bpf_prog_destroy(cfg.filter); + + kfree(cfg.bpf_ops); + kfree(cfg.bpf_name); + return ret; } -static void tcf_bpf_cleanup(struct tc_action *a, int bind) +static void tcf_bpf_cleanup(struct tc_action *act, int bind) { - struct tcf_bpf *b = a->priv; + const struct tcf_bpf *prog = act->priv; - bpf_prog_destroy(b->filter); + if (tcf_bpf_is_ebpf(prog)) + bpf_prog_put(prog->filter); + else + bpf_prog_destroy(prog->filter); } -static struct tc_action_ops act_bpf_ops = { - .kind = "bpf", - .type = TCA_ACT_BPF, - .owner = THIS_MODULE, - .act = tcf_bpf, - .dump = tcf_bpf_dump, - .cleanup = tcf_bpf_cleanup, - .init = tcf_bpf_init, +static struct tc_action_ops act_bpf_ops __read_mostly = { + .kind = "bpf", + .type = TCA_ACT_BPF, + .owner = THIS_MODULE, + .act = tcf_bpf, + .dump = tcf_bpf_dump, + .cleanup = tcf_bpf_cleanup, + .init = tcf_bpf_init, }; static int __init bpf_init_module(void) -- cgit v1.2.3 From 8da86466b83787df0d4b89ec81c310de072d101c Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki/吉藤英明 Date: Thu, 19 Mar 2015 22:41:46 +0900 Subject: net: neighbour: Add mcast_resolicit to configure the number of multicast resolicitations in PROBE state. We send unicast neighbor (ARP or NDP) solicitations ucast_probes times in PROBE state. Zhu Yanjun reported that some implementation does not reply against them and the entry will become FAILED, which is undesirable. We had been dealt with such nodes by sending multicast probes mcast_ solicit times after unicast probes in PROBE state. In 2003, I made a change not to send them to improve compatibility with IPv6 NDP. Let's introduce per-protocol per-interface sysctl knob "mcast_ reprobe" to configure the number of multicast (re)solicitation for reconfirmation in PROBE state. The default is 0, since we have been doing so for 10+ years. Reported-by: Zhu Yanjun CC: Ulf Samuelsson Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/neighbour.h | 1 + include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 15 +++++++++++---- 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e7bdf5170802..bd33e66f49aa 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -42,6 +42,7 @@ enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, + NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 3873a35509aa..2e35c61bbdd1 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -126,6 +126,7 @@ enum { NDTPA_PROXY_QLEN, /* u32 */ NDTPA_LOCKTIME, /* u64, msecs */ NDTPA_QUEUE_LENBYTES, /* u32 */ + NDTPA_MCAST_REPROBES, /* u32 */ __NDTPA_MAX }; #define NDTPA_MAX (__NDTPA_MAX - 1) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 0e8b32efc031..3de654256028 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -817,10 +817,9 @@ out: static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES); - if (!(n->nud_state & NUD_PROBE)) - max_probes += NEIGH_VAR(p, MCAST_PROBES); - return max_probes; + return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : + NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) @@ -1742,6 +1741,8 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || + nla_put_u32(skb, NDTPA_MCAST_REPROBES, + NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME)) || @@ -1901,6 +1902,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, @@ -2001,6 +2003,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; + case NDTPA_MCAST_REPROBES: + NEIGH_VAR_SET(p, MCAST_REPROBES, + nla_get_u32(tbp[i])); + break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); @@ -2987,6 +2993,7 @@ static struct neigh_sysctl_table { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), -- cgit v1.2.3 From 682f048bd49449f4ab978664a7f69a44a74e3caa Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Mon, 23 Mar 2015 09:11:13 +0300 Subject: af_packet: pass checksum validation status to the user Introduce TP_STATUS_CSUM_VALID tp_status flag to tell the af_packet user that at least the transport header checksum has been already validated. For now, the flag may be set for incoming packets only. Signed-off-by: Alexander Drozdov Cc: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/packet_mmap.txt | 13 ++++++++++--- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 9 +++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index a6d7cb91069e..daa015af16a0 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -440,9 +440,10 @@ and the following flags apply: +++ Capture process: from include/linux/if_packet.h - #define TP_STATUS_COPY 2 - #define TP_STATUS_LOSING 4 - #define TP_STATUS_CSUMNOTREADY 8 + #define TP_STATUS_COPY (1 << 1) + #define TP_STATUS_LOSING (1 << 2) + #define TP_STATUS_CSUMNOTREADY (1 << 3) + #define TP_STATUS_CSUM_VALID (1 << 7) TP_STATUS_COPY : This flag indicates that the frame (and associated meta information) has been truncated because it's @@ -466,6 +467,12 @@ TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which reading the packet we should not try to check the checksum. +TP_STATUS_CSUM_VALID : This flag indicates that at least the transport + header checksum of the packet has been already + validated on the kernel side. If the flag is not set + then we are free to check the checksum by ourselves + provided that TP_STATUS_CSUMNOTREADY is also not set. + for convenience there are also the following defines: #define TP_STATUS_KERNEL 0 diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index da2d668b8cf1..053bd102fbe0 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -99,6 +99,7 @@ struct tpacket_auxdata { #define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ #define TP_STATUS_BLK_TMO (1 << 5) #define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ +#define TP_STATUS_CSUM_VALID (1 << 7) /* Tx ring - header status */ #define TP_STATUS_AVAILABLE 0 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9d854c5ce0b5..5102c3cc4eec 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1924,6 +1924,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; + else if (skb->pkt_type != PACKET_OUTGOING && + (skb->ip_summed == CHECKSUM_COMPLETE || + skb_csum_unnecessary(skb))) + status |= TP_STATUS_CSUM_VALID; if (snaplen > res) snaplen = res; @@ -3031,6 +3035,11 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; + else if (skb->pkt_type != PACKET_OUTGOING && + (skb->ip_summed == CHECKSUM_COMPLETE || + skb_csum_unnecessary(skb))) + aux.tp_status |= TP_STATUS_CSUM_VALID; + aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; -- cgit v1.2.3 From 3d1bec99320d4e96897805440f8cf4f68eff226b Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 23 Mar 2015 23:36:00 +0100 Subject: ipv6: introduce secret_stable to ipv6_devconf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements the procfs logic for the stable_address knob: The secret is formatted as an ipv6 address and will be stored per interface and per namespace. We track initialized flag and return EIO errors until the secret is set. We don't inherit the secret to newly created namespaces. Cc: Erik Kline Cc: Fernando Gont Cc: Lorenzo Colitti Cc: YOSHIFUJI Hideaki/吉藤英明 Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/ipv6.h | 4 +++ include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 4d5169f5d7d1..82806c60aa42 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -53,6 +53,10 @@ struct ipv6_devconf { __s32 ndisc_notify; __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; + struct ipv6_stable_secret { + bool initialized; + struct in6_addr secret; + } stable_secret; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 437a6a4b125a..5efa54ae567c 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -170,6 +170,7 @@ enum { DEVCONF_ACCEPT_RA_FROM_LOCAL, DEVCONF_USE_OPTIMISTIC, DEVCONF_ACCEPT_RA_MTU, + DEVCONF_STABLE_SECRET, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 158378e73f0a..5b967c8a617a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,9 @@ #define INFINITY_LIFE_TIME 0xFFFFFFFF +#define IPV6_MAX_STRLEN \ + sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255") + static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; @@ -202,6 +206,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_dad = 1, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, + .stable_secret = { + .initialized = false, + } }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -240,6 +247,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_dad = 1, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, + .stable_secret = { + .initialized = false, + }, }; /* Check if a valid qdisc is available */ @@ -4430,6 +4440,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; + /* we omit DEVCONF_STABLE_SECRET for now */ } static inline size_t inet6_ifla6_size(void) @@ -5074,6 +5085,53 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, return ret; } +static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int err; + struct in6_addr addr; + char str[IPV6_MAX_STRLEN]; + struct ctl_table lctl = *ctl; + struct ipv6_stable_secret *secret = ctl->data; + + lctl.maxlen = IPV6_MAX_STRLEN; + lctl.data = str; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (!write && !secret->initialized) { + err = -EIO; + goto out; + } + + if (!write) { + err = snprintf(str, sizeof(str), "%pI6", + &secret->secret); + if (err >= sizeof(str)) { + err = -EIO; + goto out; + } + } + + err = proc_dostring(&lctl, write, buffer, lenp, ppos); + if (err || !write) + goto out; + + if (in6_pton(str, -1, addr.in6_u.u6_addr8, -1, NULL) != 1) { + err = -EIO; + goto out; + } + + secret->initialized = true; + secret->secret = addr; + +out: + rtnl_unlock(); + + return err; +} static struct addrconf_sysctl_table { @@ -5346,6 +5404,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "stable_secret", + .data = &ipv6_devconf.stable_secret, + .maxlen = IPV6_MAX_STRLEN, + .mode = 0600, + .proc_handler = addrconf_sysctl_stable_secret, + }, { /* sentinel */ } @@ -5442,6 +5507,9 @@ static int __net_init addrconf_init_net(struct net *net) dflt->autoconf = ipv6_defaults.autoconf; dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; + dflt->stable_secret.initialized = false; + all->stable_secret.initialized = false; + net->ipv6.devconf_all = all; net->ipv6.devconf_dflt = dflt; -- cgit v1.2.3 From 622c81d57b392cc9be836670eb464a4dfaa9adfe Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 23 Mar 2015 23:36:01 +0100 Subject: ipv6: generation of stable privacy addresses for link-local and autoconf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements the stable privacy address generation for link-local and autoconf addresses as specified in RFC7217. RID = F(Prefix, Net_Iface, Network_ID, DAD_Counter, secret_key) is the RID (random identifier). As the hash function F we chose one round of sha1. Prefix will be either the link-local prefix or the router advertised one. As Net_Iface we use the MAC address of the device. DAD_Counter and secret_key are implemented as specified. We don't use Network_ID, as it couples the code too closely to other subsystems. It is specified as optional in the RFC. As Net_Iface we only use the MAC address: we simply have no stable identifier in the kernel we could possibly use: because this code might run very early, we cannot depend on names, as they might be changed by user space early on during the boot process. A new address generation mode is introduced, IN6_ADDR_GEN_MODE_STABLE_PRIVACY. With iproute2 one can switch back to none or eui64 address configuration mode although the stable_secret is already set. We refuse writes to ipv6/conf/all/stable_secret but only allow ipv6/conf/default/stable_secret and the interface specific file to be written to. The default stable_secret is used as the parameter for the namespace, the interface specific can overwrite the secret, e.g. when switching a network configuration from one system to another while inheriting the secret. Cc: Erik Kline Cc: Fernando Gont Cc: Lorenzo Colitti Cc: YOSHIFUJI Hideaki/吉藤英明 Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/ipv6/addrconf.c | 130 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 127 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index f5f5edd5ae5f..7ffb18df01ca 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -216,6 +216,7 @@ enum { enum in6_addr_gen_mode { IN6_ADDR_GEN_MODE_EUI64, IN6_ADDR_GEN_MODE_NONE, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY, }; /* Bridge section */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5b967c8a617a..6813268ce8b8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -131,6 +131,9 @@ static void ipv6_regen_rndid(unsigned long data); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); static int ipv6_count_addresses(struct inet6_dev *idev); +static int ipv6_generate_stable_address(struct in6_addr *addr, + u8 dad_count, + const struct inet6_dev *idev); /* * Configured unicast address hash table @@ -2302,6 +2305,11 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) in6_dev->token.s6_addr + 8, 8); read_unlock_bh(&in6_dev->lock); tokenized = true; + } else if (in6_dev->addr_gen_mode == + IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + !ipv6_generate_stable_address(&addr, 0, + in6_dev)) { + goto ok; } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { in6_dev_put(in6_dev); @@ -2820,12 +2828,98 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr } } +static bool ipv6_reserved_interfaceid(struct in6_addr address) +{ + if ((address.s6_addr32[2] | address.s6_addr32[3]) == 0) + return true; + + if (address.s6_addr32[2] == htonl(0x02005eff) && + ((address.s6_addr32[3] & htonl(0xfe000000)) == htonl(0xfe000000))) + return true; + + if (address.s6_addr32[2] == htonl(0xfdffffff) && + ((address.s6_addr32[3] & htonl(0xffffff80)) == htonl(0xffffff80))) + return true; + + return false; +} + +static int ipv6_generate_stable_address(struct in6_addr *address, + u8 dad_count, + const struct inet6_dev *idev) +{ + static const int idgen_retries = 3; + + static DEFINE_SPINLOCK(lock); + static __u32 digest[SHA_DIGEST_WORDS]; + static __u32 workspace[SHA_WORKSPACE_WORDS]; + + static union { + char __data[SHA_MESSAGE_BYTES]; + struct { + struct in6_addr secret; + __be64 prefix; + unsigned char hwaddr[MAX_ADDR_LEN]; + u8 dad_count; + } __packed; + } data; + + struct in6_addr secret; + struct in6_addr temp; + struct net *net = dev_net(idev->dev); + + BUILD_BUG_ON(sizeof(data.__data) != sizeof(data)); + + if (idev->cnf.stable_secret.initialized) + secret = idev->cnf.stable_secret.secret; + else if (net->ipv6.devconf_dflt->stable_secret.initialized) + secret = net->ipv6.devconf_dflt->stable_secret.secret; + else + return -1; + +retry: + spin_lock_bh(&lock); + + sha_init(digest); + memset(&data, 0, sizeof(data)); + memset(workspace, 0, sizeof(workspace)); + memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len); + data.prefix = ((__be64)address->s6_addr32[0] << 32) | + (__be64)address->s6_addr32[1]; + data.secret = secret; + data.dad_count = dad_count; + + sha_transform(digest, data.__data, workspace); + + temp = *address; + temp.s6_addr32[2] = digest[0]; + temp.s6_addr32[3] = digest[1]; + + spin_unlock_bh(&lock); + + if (ipv6_reserved_interfaceid(temp)) { + dad_count++; + if (dad_count > idgen_retries) + return -1; + goto retry; + } + + *address = temp; + return 0; +} + static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) { - if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { - struct in6_addr addr; + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { + if (!ipv6_generate_stable_address(&addr, 0, idev)) + addrconf_add_linklocal(idev, &addr); + else if (prefix_route) + addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { /* addrconf_add_linklocal also adds a prefix_route and we * only need to care about prefix routes if ipv6_generate_eui64 * couldn't generate one. @@ -4675,8 +4769,15 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); if (mode != IN6_ADDR_GEN_MODE_EUI64 && - mode != IN6_ADDR_GEN_MODE_NONE) + mode != IN6_ADDR_GEN_MODE_NONE && + mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY) + return -EINVAL; + + if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + !idev->cnf.stable_secret.initialized && + !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized) return -EINVAL; + idev->addr_gen_mode = mode; err = 0; } @@ -5093,8 +5194,12 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, struct in6_addr addr; char str[IPV6_MAX_STRLEN]; struct ctl_table lctl = *ctl; + struct net *net = ctl->extra2; struct ipv6_stable_secret *secret = ctl->data; + if (&net->ipv6.devconf_all->stable_secret == ctl->data) + return -EIO; + lctl.maxlen = IPV6_MAX_STRLEN; lctl.data = str; @@ -5127,6 +5232,23 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, secret->initialized = true; secret->secret = addr; + if (&net->ipv6.devconf_dflt->stable_secret == ctl->data) { + struct net_device *dev; + + for_each_netdev(net, dev) { + struct inet6_dev *idev = __in6_dev_get(dev); + + if (idev) { + idev->addr_gen_mode = + IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + } + } + } else { + struct inet6_dev *idev = ctl->extra1; + + idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + } + out: rtnl_unlock(); -- cgit v1.2.3 From 64236f3f3d742469e4027b83a9515e84e9ab21b4 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 23 Mar 2015 23:36:02 +0100 Subject: ipv6: introduce IFA_F_STABLE_PRIVACY flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to mark appropriate addresses so we can do retries in case their DAD failed. Cc: Erik Kline Cc: Fernando Gont Cc: Lorenzo Colitti Cc: YOSHIFUJI Hideaki/吉藤英明 Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/if_addr.h | 1 + net/ipv6/addrconf.c | 14 ++++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index 40fdfea39714..4318ab1635ce 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -51,6 +51,7 @@ enum { #define IFA_F_MANAGETEMPADDR 0x100 #define IFA_F_NOPREFIXROUTE 0x200 #define IFA_F_MCAUTOJOIN 0x400 +#define IFA_F_STABLE_PRIVACY 0x800 struct ifa_cacheinfo { __u32 ifa_prefered; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6813268ce8b8..c2357b6f62dd 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2199,6 +2199,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) __u32 valid_lft; __u32 prefered_lft; int addr_type; + u32 addr_flags = 0; struct inet6_dev *in6_dev; struct net *net = dev_net(dev); @@ -2309,6 +2310,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) IN6_ADDR_GEN_MODE_STABLE_PRIVACY && !ipv6_generate_stable_address(&addr, 0, in6_dev)) { + addr_flags |= IFA_F_STABLE_PRIVACY; goto ok; } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { @@ -2328,7 +2330,6 @@ ok: if (ifp == NULL && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; - u32 addr_flags = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (in6_dev->cnf.optimistic_dad && @@ -2807,10 +2808,11 @@ static void init_loopback(struct net_device *dev) } } -static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr) +static void addrconf_add_linklocal(struct inet6_dev *idev, + const struct in6_addr *addr, u32 flags) { struct inet6_ifaddr *ifp; - u32 addr_flags = IFA_F_PERMANENT; + u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (idev->cnf.optimistic_dad && @@ -2818,7 +2820,6 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr addr_flags |= IFA_F_OPTIMISTIC; #endif - ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); if (!IS_ERR(ifp)) { @@ -2916,7 +2917,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { if (!ipv6_generate_stable_address(&addr, 0, idev)) - addrconf_add_linklocal(idev, &addr); + addrconf_add_linklocal(idev, &addr, + IFA_F_STABLE_PRIVACY); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { @@ -2925,7 +2927,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) * couldn't generate one. */ if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) - addrconf_add_linklocal(idev, &addr); + addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); } -- cgit v1.2.3 From 27cd5452476978283decb19e429e81fc6c71e74b Mon Sep 17 00:00:00 2001 From: Michal Sekletar Date: Tue, 24 Mar 2015 14:48:41 +0100 Subject: filter: introduce SKF_AD_VLAN_TPID BPF extension If vlan offloading takes place then vlan header is removed from frame and its contents, both vlan_tci and vlan_proto, is available to user space via TPACKET interface. However, only vlan_tci can be used in BPF filters. This commit introduces a new BPF extension. It makes possible to load the value of vlan_proto (vlan TPID) to register A. Support for classic BPF and eBPF is being added, analogous to skb->protocol. Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Jiri Pirko Signed-off-by: Michal Sekletar Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 3 ++- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 1 + include/uapi/linux/filter.h | 3 ++- net/core/filter.c | 17 +++++++++++++++++ tools/net/bpf_exp.l | 2 ++ tools/net/bpf_exp.y | 11 ++++++++++- 7 files changed, 35 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 9930ecfbb465..135581f015e1 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -280,7 +280,8 @@ Possible BPF extensions are shown in the following table: rxhash skb->hash cpu raw_smp_processor_id() vlan_tci skb_vlan_tag_get(skb) - vlan_pr skb_vlan_tag_present(skb) + vlan_avail skb_vlan_tag_present(skb) + vlan_tpid skb->vlan_proto rand prandom_u32() These extensions can also be prefixed with '#'. diff --git a/include/linux/filter.h b/include/linux/filter.h index 9ee8c67ea249..fa11b3a367be 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -454,6 +454,7 @@ static inline u16 bpf_anc_helper(const struct sock_filter *ftest) BPF_ANCILLARY(VLAN_TAG_PRESENT); BPF_ANCILLARY(PAY_OFFSET); BPF_ANCILLARY(RANDOM); + BPF_ANCILLARY(VLAN_TPID); } /* Fallthrough. */ default: diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3dd314a45d0d..27dc4ec58840 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -182,6 +182,7 @@ struct __sk_buff { __u32 protocol; __u32 vlan_present; __u32 vlan_tci; + __u32 vlan_proto; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h index 47785d5ecf17..34c7936ca114 100644 --- a/include/uapi/linux/filter.h +++ b/include/uapi/linux/filter.h @@ -77,7 +77,8 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ #define SKF_AD_VLAN_TAG_PRESENT 48 #define SKF_AD_PAY_OFFSET 52 #define SKF_AD_RANDOM 56 -#define SKF_AD_MAX 60 +#define SKF_AD_VLAN_TPID 60 +#define SKF_AD_MAX 64 #define SKF_NET_OFF (-0x100000) #define SKF_LL_OFF (-0x200000) diff --git a/net/core/filter.c b/net/core/filter.c index 084eacc4d1d4..32f43c59908c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -272,6 +272,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp, insn += cnt - 1; break; + case SKF_AD_OFF + SKF_AD_VLAN_TPID: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); + + /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, vlan_proto)); + /* A = ntohs(A) [emitting a nop or swap16] */ + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + break; + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: case SKF_AD_OFF + SKF_AD_NLATTR: case SKF_AD_OFF + SKF_AD_NLATTR_NEST: @@ -1226,6 +1236,13 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, offsetof(struct sk_buff, protocol)); break; + case offsetof(struct __sk_buff, vlan_proto): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_proto)); + break; + case offsetof(struct __sk_buff, mark): return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l index 833a96611da6..c83af3fb77de 100644 --- a/tools/net/bpf_exp.l +++ b/tools/net/bpf_exp.l @@ -92,6 +92,8 @@ extern void yyerror(const char *str); "#"?("cpu") { return K_CPU; } "#"?("vlan_tci") { return K_VLANT; } "#"?("vlan_pr") { return K_VLANP; } +"#"?("vlan_avail") { return K_VLANP; } +"#"?("vlan_tpid") { return K_VLANTPID; } "#"?("rand") { return K_RAND; } ":" { return ':'; } diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y index e6306c51c26f..f8332749b44c 100644 --- a/tools/net/bpf_exp.y +++ b/tools/net/bpf_exp.y @@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type); %token OP_LDXI %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE -%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND +%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_VLANTPID K_POFF K_RAND %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%' @@ -167,6 +167,9 @@ ldb | OP_LDB K_RAND { bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_RANDOM); } + | OP_LDB K_VLANTPID { + bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0, + SKF_AD_OFF + SKF_AD_VLAN_TPID); } ; ldh @@ -218,6 +221,9 @@ ldh | OP_LDH K_RAND { bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_RANDOM); } + | OP_LDH K_VLANTPID { + bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0, + SKF_AD_OFF + SKF_AD_VLAN_TPID); } ; ldi @@ -274,6 +280,9 @@ ld | OP_LD K_RAND { bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_RANDOM); } + | OP_LD K_VLANTPID { + bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0, + SKF_AD_OFF + SKF_AD_VLAN_TPID); } | OP_LD 'M' '[' number ']' { bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); } | OP_LD '[' 'x' '+' number ']' { -- cgit v1.2.3 From 608cd71a9c7c9db76e78a792c5a4101e12fea43f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 26 Mar 2015 19:53:57 -0700 Subject: tc: bpf: generalize pedit action existing TC action 'pedit' can munge any bits of the packet. Generalize it for use in bpf programs attached as cls_bpf and act_bpf via bpf_skb_store_bytes() helper function. Signed-off-by: Alexei Starovoitov Reviewed-by: Jiri Pirko Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 2 ++ net/core/filter.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 73 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 280a315de8d6..d5cda067115a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -59,6 +59,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ + ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 27dc4ec58840..74aab6e0d964 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,6 +168,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ + BPF_FUNC_skb_store_bytes, /* int skb_store_bytes(skb, offset, from, len) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e714f799ec0..630a7bac1e51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -773,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; + } else if (arg_type == ARG_PTR_TO_CTX) { + expected_type = PTR_TO_CTX; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; diff --git a/net/core/filter.c b/net/core/filter.c index 32f43c59908c..444a07e4f68d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1175,6 +1175,56 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } +static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + unsigned int offset = (unsigned int) r2; + void *from = (void *) (long) r3; + unsigned int len = (unsigned int) r4; + char buf[16]; + void *ptr; + + /* bpf verifier guarantees that: + * 'from' pointer points to bpf program stack + * 'len' bytes of it were initialized + * 'len' > 0 + * 'skb' is a valid pointer to 'struct sk_buff' + * + * so check for invalid 'offset' and too large 'len' + */ + if (offset > 0xffff || len > sizeof(buf)) + return -EFAULT; + + if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len)) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, len, buf); + if (unlikely(!ptr)) + return -EFAULT; + + skb_postpull_rcsum(skb, ptr, len); + + memcpy(ptr, from, len); + + if (ptr == buf) + /* skb_store_bits cannot return -EFAULT here */ + skb_store_bits(skb, offset, ptr, len); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); + return 0; +} + +const struct bpf_func_proto bpf_skb_store_bytes_proto = { + .func = bpf_skb_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1194,6 +1244,17 @@ sk_filter_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +tc_cls_act_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + default: + return sk_filter_func_proto(func_id); + } +} + static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { @@ -1270,18 +1331,24 @@ static const struct bpf_verifier_ops sk_filter_ops = { .convert_ctx_access = sk_filter_convert_ctx_access, }; +static const struct bpf_verifier_ops tc_cls_act_ops = { + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static struct bpf_prog_type_list sched_cls_type __read_mostly = { - .ops = &sk_filter_ops, + .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_CLS, }; static struct bpf_prog_type_list sched_act_type __read_mostly = { - .ops = &sk_filter_ops, + .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_ACT, }; -- cgit v1.2.3 From 3a323d4e17dd5a84f6ad036e6f985d263ca973ed Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Wed, 18 Mar 2015 10:47:02 +0200 Subject: nl80211: small clarification of the sched_scan delay attribute Just clarify that the delay is only before the first cycle. Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ae16ba9cb1e3..241220c43e86 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1754,8 +1754,9 @@ enum nl80211_commands { * should be contained in the result as the sum of the respective counters * over all channels. * - * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before a scheduled scan (or a - * WoWLAN net-detect scan) is started, u32 in seconds. + * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before the first cycle of a + * scheduled scan (or a WoWLAN net-detect scan) is started, u32 + * in seconds. * @NL80211_ATTR_REG_INDOOR: flag attribute, if set indicates that the device * is operating in an indoor environment. -- cgit v1.2.3 From 15e318bdc6dfb82914c82fb7ad00badaa8387d8e Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Sun, 29 Mar 2015 16:59:24 +0200 Subject: xfrm: simplify xfrm_address_t use In many places, the a6 field is typecasted to struct in6_addr. As the fields are in union anyway, just add in6_addr type to the union and get rid of the typecasting. Modifying the uapi header is okay, the union has still the same size. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/xfrm.h | 6 +++--- include/uapi/linux/xfrm.h | 2 ++ net/ipv6/xfrm6_mode_beet.c | 4 ++-- net/ipv6/xfrm6_policy.c | 4 +--- net/key/af_key.c | 2 +- net/xfrm/xfrm_state.c | 8 ++++---- 6 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d0ac7d7be8a7..461f83539493 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1025,7 +1025,7 @@ xfrm_addr_any(const xfrm_address_t *addr, unsigned short family) case AF_INET: return addr->a4 == 0; case AF_INET6: - return ipv6_addr_any((struct in6_addr *)&addr->a6); + return ipv6_addr_any(&addr->in6); } return 0; } @@ -1238,8 +1238,8 @@ void xfrm_flowi_addr_get(const struct flowi *fl, memcpy(&daddr->a4, &fl->u.ip4.daddr, sizeof(daddr->a4)); break; case AF_INET6: - *(struct in6_addr *)saddr->a6 = fl->u.ip6.saddr; - *(struct in6_addr *)daddr->a6 = fl->u.ip6.daddr; + saddr->in6 = fl->u.ip6.saddr; + daddr->in6 = fl->u.ip6.daddr; break; } } diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 02d5125a5ee8..2cd9e608d0d1 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -1,6 +1,7 @@ #ifndef _LINUX_XFRM_H #define _LINUX_XFRM_H +#include #include /* All of the structures in this file may not change size as they are @@ -13,6 +14,7 @@ typedef union { __be32 a4; __be32 a6[4]; + struct in6_addr in6; } xfrm_address_t; /* Ident of a specific xfrm_state. It is used on input to lookup diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index 9949a356d62c..1e205c3253ac 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -95,8 +95,8 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); - ip6h->daddr = *(struct in6_addr *)&x->sel.daddr.a6; - ip6h->saddr = *(struct in6_addr *)&x->sel.saddr.a6; + ip6h->daddr = x->sel.daddr.in6; + ip6h->saddr = x->sel.saddr.in6; err = 0; out: return err; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 91d934c22a2a..f337a908a76a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -61,9 +61,7 @@ static int xfrm6_get_saddr(struct net *net, return -EHOSTUNREACH; dev = ip6_dst_idev(dst)->dev; - ipv6_dev_get_saddr(dev_net(dev), dev, - (struct in6_addr *)&daddr->a6, 0, - (struct in6_addr *)&saddr->a6); + ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6); dst_release(dst); return 0; } diff --git a/net/key/af_key.c b/net/key/af_key.c index 9255fd9d94bc..f0d52d721b3a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -709,7 +709,7 @@ static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port sin6->sin6_family = AF_INET6; sin6->sin6_port = port; sin6->sin6_flowinfo = 0; - sin6->sin6_addr = *(struct in6_addr *)xaddr->a6; + sin6->sin6_addr = xaddr->in6; sin6->sin6_scope_id = 0; return 128; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index de971b6d38c5..f5e39e35d73a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1043,12 +1043,12 @@ static struct xfrm_state *__find_acq_core(struct net *net, break; case AF_INET6: - *(struct in6_addr *)x->sel.daddr.a6 = *(struct in6_addr *)daddr; - *(struct in6_addr *)x->sel.saddr.a6 = *(struct in6_addr *)saddr; + x->sel.daddr.in6 = daddr->in6; + x->sel.saddr.in6 = saddr->in6; x->sel.prefixlen_d = 128; x->sel.prefixlen_s = 128; - *(struct in6_addr *)x->props.saddr.a6 = *(struct in6_addr *)saddr; - *(struct in6_addr *)x->id.daddr.a6 = *(struct in6_addr *)daddr; + x->props.saddr.in6 = saddr->in6; + x->id.daddr.in6 = daddr->in6; break; } -- cgit v1.2.3 From 761da2935d6e18d178582dbdf315a3a458555505 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 26 Mar 2015 12:39:36 +0000 Subject: netfilter: nf_tables: add set timeout API support Add set timeout support to the netlink API. Sets with timeout support enabled can have a default timeout value and garbage collection interval specified. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 9 +++++++++ include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nf_tables_api.c | 30 ++++++++++++++++++++++++++++-- 3 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index b8cd60dcb4e1..8936803a2ad5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -258,6 +258,8 @@ void nft_unregister_set(struct nft_set_ops *ops); * @dtype: data type (verdict or numeric type defined by userspace) * @size: maximum set size * @nelems: number of elements + * @timeout: default timeout value in msecs + * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) * @ops: set ops * @pnet: network namespace @@ -274,6 +276,8 @@ struct nft_set { u32 dtype; u32 size; u32 nelems; + u64 timeout; + u32 gc_int; u16 policy; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; @@ -295,6 +299,11 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, struct nft_set *nf_tables_set_lookup_byid(const struct net *net, const struct nlattr *nla); +static inline unsigned long nft_set_gc_interval(const struct nft_set *set) +{ + return set->gc_int ? msecs_to_jiffies(set->gc_int) : HZ; +} + /** * struct nft_set_binding - nf_tables set binding * diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b9783931503b..971d245e7378 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -208,12 +208,14 @@ enum nft_rule_compat_attributes { * @NFT_SET_CONSTANT: set contents may not change while bound * @NFT_SET_INTERVAL: set contains intervals * @NFT_SET_MAP: set is used as a dictionary + * @NFT_SET_TIMEOUT: set uses timeouts */ enum nft_set_flags { NFT_SET_ANONYMOUS = 0x1, NFT_SET_CONSTANT = 0x2, NFT_SET_INTERVAL = 0x4, NFT_SET_MAP = 0x8, + NFT_SET_TIMEOUT = 0x10, }; /** @@ -252,6 +254,8 @@ enum nft_set_desc_attributes { * @NFTA_SET_POLICY: selection policy (NLA_U32) * @NFTA_SET_DESC: set description (NLA_NESTED) * @NFTA_SET_ID: uniquely identifies a set in a transaction (NLA_U32) + * @NFTA_SET_TIMEOUT: default timeout value (NLA_U64) + * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -265,6 +269,8 @@ enum nft_set_attributes { NFTA_SET_POLICY, NFTA_SET_DESC, NFTA_SET_ID, + NFTA_SET_TIMEOUT, + NFTA_SET_GC_INTERVAL, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5604c2df05d1..6320b64e773e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2216,6 +2216,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_POLICY] = { .type = NLA_U32 }, [NFTA_SET_DESC] = { .type = NLA_NESTED }, [NFTA_SET_ID] = { .type = NLA_U32 }, + [NFTA_SET_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -2366,6 +2368,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + if (set->timeout && + nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout))) + goto nla_put_failure; + if (set->gc_int && + nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int))) + goto nla_put_failure; + if (set->policy != NFT_SET_POL_PERFORMANCE) { if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy))) goto nla_put_failure; @@ -2578,7 +2587,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, char name[IFNAMSIZ]; unsigned int size; bool create; - u32 ktype, dtype, flags, policy; + u64 timeout; + u32 ktype, dtype, flags, policy, gc_int; struct nft_set_desc desc; int err; @@ -2605,7 +2615,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_FLAGS] != NULL) { flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | - NFT_SET_INTERVAL | NFT_SET_MAP)) + NFT_SET_INTERVAL | NFT_SET_MAP | + NFT_SET_TIMEOUT)) return -EINVAL; } @@ -2631,6 +2642,19 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, } else if (flags & NFT_SET_MAP) return -EINVAL; + timeout = 0; + if (nla[NFTA_SET_TIMEOUT] != NULL) { + if (!(flags & NFT_SET_TIMEOUT)) + return -EINVAL; + timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_TIMEOUT])); + } + gc_int = 0; + if (nla[NFTA_SET_GC_INTERVAL] != NULL) { + if (!(flags & NFT_SET_TIMEOUT)) + return -EINVAL; + gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); + } + policy = NFT_SET_POL_PERFORMANCE; if (nla[NFTA_SET_POLICY] != NULL) policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); @@ -2699,6 +2723,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, set->flags = flags; set->size = desc.size; set->policy = policy; + set->timeout = timeout; + set->gc_int = gc_int; err = ops->init(set, &desc, nla); if (err < 0) -- cgit v1.2.3 From c3e1b005ed1cc068fc9d454a6e745830d55d251d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 26 Mar 2015 12:39:37 +0000 Subject: netfilter: nf_tables: add set element timeout support Add API support for set element timeouts. Elements can have a individual timeout value specified, overriding the sets' default. Two new extension types are used for timeouts - the timeout value and the expiration time. The timeout value only exists if it differs from the default value. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 20 ++++++++++++ include/uapi/linux/netfilter/nf_tables.h | 4 +++ net/netfilter/nf_tables_api.c | 53 ++++++++++++++++++++++++++++++-- 3 files changed, 75 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 8936803a2ad5..f2726c537248 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -329,12 +329,16 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @NFT_SET_EXT_KEY: element key * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags + * @NFT_SET_EXT_TIMEOUT: element timeout + * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { NFT_SET_EXT_KEY, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, + NFT_SET_EXT_TIMEOUT, + NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_NUM }; @@ -431,6 +435,22 @@ static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } +static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); +} + +static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); +} + +static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) +{ + return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && + time_is_before_eq_jiffies(*nft_set_ext_expiration(ext)); +} + static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, void *elem) { diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 971d245e7378..83441cc4594b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -290,12 +290,16 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_KEY: key value (NLA_NESTED: nft_data) * @NFTA_SET_ELEM_DATA: data value of mapping (NLA_NESTED: nft_data_attributes) * @NFTA_SET_ELEM_FLAGS: bitmask of nft_set_elem_flags (NLA_U32) + * @NFTA_SET_ELEM_TIMEOUT: timeout value (NLA_U64) + * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, NFTA_SET_ELEM_KEY, NFTA_SET_ELEM_DATA, NFTA_SET_ELEM_FLAGS, + NFTA_SET_ELEM_TIMEOUT, + NFTA_SET_ELEM_EXPIRATION, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6320b64e773e..9e032dbc149c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2863,6 +2863,14 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .len = sizeof(u8), .align = __alignof__(u8), }, + [NFT_SET_EXT_TIMEOUT] = { + .len = sizeof(u64), + .align = __alignof__(u64), + }, + [NFT_SET_EXT_EXPIRATION] = { + .len = sizeof(unsigned long), + .align = __alignof__(unsigned long), + }, }; EXPORT_SYMBOL_GPL(nft_set_ext_types); @@ -2874,6 +2882,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, + [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -2935,6 +2944,25 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && + nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, + cpu_to_be64(*nft_set_ext_timeout(ext)))) + goto nla_put_failure; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + unsigned long expires, now = jiffies; + + expires = *nft_set_ext_expiration(ext); + if (time_before(now, expires)) + expires -= now; + else + expires = 0; + + if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, + cpu_to_be64(jiffies_to_msecs(expires)))) + goto nla_put_failure; + } + nla_nest_end(skb, nest); return 0; @@ -3158,7 +3186,7 @@ static void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const struct nft_data *key, const struct nft_data *data, - gfp_t gfp) + u64 timeout, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -3173,6 +3201,11 @@ static void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_key(ext), key, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) + *nft_set_ext_expiration(ext) = + jiffies + msecs_to_jiffies(timeout); + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) + *nft_set_ext_timeout(ext) = timeout; return elem; } @@ -3201,6 +3234,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; + u64 timeout; u32 flags; int err; @@ -3241,6 +3275,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return -EINVAL; } + timeout = 0; + if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_ELEM_TIMEOUT])); + } else if (set->flags & NFT_SET_TIMEOUT) { + timeout = set->timeout; + } + err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; @@ -3249,6 +3292,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err2; nft_set_ext_add(&tmpl, NFT_SET_EXT_KEY); + if (timeout > 0) { + nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); + if (timeout != set->timeout) + nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); + } if (nla[NFTA_SET_ELEM_DATA] != NULL) { err = nft_data_init(ctx, &data, &d2, nla[NFTA_SET_ELEM_DATA]); @@ -3277,7 +3325,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, &elem.key, &data, GFP_KERNEL); + elem.priv = nft_set_elem_init(set, &tmpl, &elem.key, &data, + timeout, GFP_KERNEL); if (elem.priv == NULL) goto err3; -- cgit v1.2.3 From a5581ef4c2eac6449188862e903eb46c7233582a Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 1 Apr 2015 07:50:29 +0200 Subject: can: introduce new raw socket option to join the given CAN filters The CAN_RAW socket can set multiple CAN identifier specific filters that lead to multiple filters in the af_can.c filter processing. These filters are indenpendent from each other which leads to logical OR'ed filters when applied. This socket option joines the given CAN filters in the way that only CAN frames are passed to user space that matched *all* given CAN filters. The semantic for the applied filters is therefore changed to a logical AND. This is useful especially when the filterset is a combination of filters where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or CAN ID ranges from the incoming traffic. As the raw_rcv() function is executed from NET_RX softirq the introduced variables are implemented as per-CPU variables to avoid extensive locking at CAN frame reception time. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- Documentation/networking/can.txt | 20 ++++++++++++++++++-- include/uapi/linux/can/raw.h | 1 + net/can/raw.c | 31 ++++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt index 0a2859a8ee7e..5abad1e921ca 100644 --- a/Documentation/networking/can.txt +++ b/Documentation/networking/can.txt @@ -22,7 +22,8 @@ This file contains 4.1.3 RAW socket option CAN_RAW_LOOPBACK 4.1.4 RAW socket option CAN_RAW_RECV_OWN_MSGS 4.1.5 RAW socket option CAN_RAW_FD_FRAMES - 4.1.6 RAW socket returned message flags + 4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS + 4.1.7 RAW socket returned message flags 4.2 Broadcast Manager protocol sockets (SOCK_DGRAM) 4.2.1 Broadcast Manager operations 4.2.2 Broadcast Manager message flags @@ -601,7 +602,22 @@ solution for a couple of reasons: CAN FD frames by checking if the device maximum transfer unit is CANFD_MTU. The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall. - 4.1.6 RAW socket returned message flags + 4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS + + The CAN_RAW socket can set multiple CAN identifier specific filters that + lead to multiple filters in the af_can.c filter processing. These filters + are indenpendent from each other which leads to logical OR'ed filters when + applied (see 4.1.1). + + This socket option joines the given CAN filters in the way that only CAN + frames are passed to user space that matched *all* given CAN filters. The + semantic for the applied filters is therefore changed to a logical AND. + + This is useful especially when the filterset is a combination of filters + where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or + CAN ID ranges from the incoming traffic. + + 4.1.7 RAW socket returned message flags When using recvmsg() call, the msg->msg_flags may contain following flags: diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h index 78ec76fd89a6..8735f1080385 100644 --- a/include/uapi/linux/can/raw.h +++ b/include/uapi/linux/can/raw.h @@ -57,6 +57,7 @@ enum { CAN_RAW_LOOPBACK, /* local loopback (default:on) */ CAN_RAW_RECV_OWN_MSGS, /* receive my own msgs (default:off) */ CAN_RAW_FD_FRAMES, /* allow CAN FD frames (default:off) */ + CAN_RAW_JOIN_FILTERS, /* all filters must match to trigger */ }; #endif /* !_UAPI_CAN_RAW_H */ diff --git a/net/can/raw.c b/net/can/raw.c index 0c8d537b59b8..31b9748cbb4e 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -77,6 +77,7 @@ MODULE_ALIAS("can-proto-1"); struct uniqframe { ktime_t tstamp; const struct sk_buff *skb; + unsigned int join_rx_count; }; struct raw_sock { @@ -87,6 +88,7 @@ struct raw_sock { int loopback; int recv_own_msgs; int fd_frames; + int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ @@ -132,10 +134,21 @@ static void raw_rcv(struct sk_buff *oskb, void *data) /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && ktime_equal(this_cpu_ptr(ro->uniq)->tstamp, oskb->tstamp)) { - return; + if (ro->join_filters) { + this_cpu_inc(ro->uniq->join_rx_count); + /* drop frame until all enabled filters matched */ + if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count) + return; + } else { + return; + } } else { this_cpu_ptr(ro->uniq)->skb = oskb; this_cpu_ptr(ro->uniq)->tstamp = oskb->tstamp; + this_cpu_ptr(ro->uniq)->join_rx_count = 1; + /* drop first frame to check all enabled filters? */ + if (ro->join_filters && ro->count > 1) + return; } /* clone the given skb to be able to enqueue it into the rcv queue */ @@ -311,6 +324,7 @@ static int raw_init(struct sock *sk) ro->loopback = 1; ro->recv_own_msgs = 0; ro->fd_frames = 0; + ro->join_filters = 0; /* alloc_percpu provides zero'ed memory */ ro->uniq = alloc_percpu(struct uniqframe); @@ -604,6 +618,15 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; + case CAN_RAW_JOIN_FILTERS: + if (optlen != sizeof(ro->join_filters)) + return -EINVAL; + + if (copy_from_user(&ro->join_filters, optval, optlen)) + return -EFAULT; + + break; + default: return -ENOPROTOOPT; } @@ -668,6 +691,12 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, val = &ro->fd_frames; break; + case CAN_RAW_JOIN_FILTERS: + if (len > sizeof(int)) + len = sizeof(int); + val = &ro->join_filters; + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3 From bcad57182425426dd4aa14deb27f97acb329f3cd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 3 Apr 2015 20:52:24 +0200 Subject: ebpf: add skb->priority to offset map for usage in {cls, act}_bpf This adds the ability to read out the skb->priority from an eBPF program, so that it can be taken into account from a tc filter or action for the use-case where the priority is not being used to directly override the filter classification in a qdisc, but to tag traffic otherwise for the classifier; the priority can be assigned from various places incl. user space, in future we may also mangle it from an eBPF program. Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 74aab6e0d964..0db8580f3cca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -184,6 +184,7 @@ struct __sk_buff { __u32 vlan_present; __u32 vlan_tci; __u32 vlan_proto; + __u32 priority; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index 444a07e4f68d..955a7d77decd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1304,6 +1304,13 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, offsetof(struct sk_buff, vlan_proto)); break; + case offsetof(struct __sk_buff, priority): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, priority)); + break; + case offsetof(struct __sk_buff, mark): return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); -- cgit v1.2.3 From 91bc4822c3d61b9bb7ef66d3b77948a4f9177954 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Apr 2015 17:12:13 -0700 Subject: tc: bpf: add checksum helpers Commit 608cd71a9c7c ("tc: bpf: generalize pedit action") has added the possibility to mangle packet data to BPF programs in the tc pipeline. This patch adds two helpers bpf_l3_csum_replace() and bpf_l4_csum_replace() for fixing up the protocol checksums after the packet mangling. It also adds 'flags' argument to bpf_skb_store_bytes() helper to avoid unnecessary checksum recomputations when BPF programs adjusting l3/l4 checksums and documents all three helpers in uapi header. Moreover, a sample program is added to show how BPF programs can make use of the mangle and csum helpers. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 38 +++++++++++++++- net/core/filter.c | 108 ++++++++++++++++++++++++++++++++++++++++++++-- samples/bpf/Makefile | 1 + samples/bpf/bpf_helpers.h | 7 +++ samples/bpf/tcbpf1_kern.c | 71 ++++++++++++++++++++++++++++++ 5 files changed, 220 insertions(+), 5 deletions(-) create mode 100644 samples/bpf/tcbpf1_kern.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0db8580f3cca..23df3e7f8e7d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,7 +168,43 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ - BPF_FUNC_skb_store_bytes, /* int skb_store_bytes(skb, offset, from, len) */ + + /** + * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet + * @skb: pointer to skb + * @offset: offset within packet from skb->data + * @from: pointer where to copy bytes from + * @len: number of bytes to store into packet + * @flags: bit 0 - if true, recompute skb->csum + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_skb_store_bytes, + + /** + * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum + * @skb: pointer to skb + * @offset: offset within packet where IP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_l3_csum_replace, + + /** + * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum + * @skb: pointer to skb + * @offset: offset within packet where TCP/UDP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * bit 4 - is pseudo header + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_l4_csum_replace, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 955a7d77decd..b669e75d2b36 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1175,7 +1175,9 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } -static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) + +static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; unsigned int offset = (unsigned int) r2; @@ -1192,7 +1194,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) * * so check for invalid 'offset' and too large 'len' */ - if (offset > 0xffff || len > sizeof(buf)) + if (unlikely(offset > 0xffff || len > sizeof(buf))) return -EFAULT; if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len)) @@ -1202,7 +1204,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) if (unlikely(!ptr)) return -EFAULT; - skb_postpull_rcsum(skb, ptr, len); + if (BPF_RECOMPUTE_CSUM(flags)) + skb_postpull_rcsum(skb, ptr, len); memcpy(ptr, from, len); @@ -1210,7 +1213,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); - if (skb->ip_summed == CHECKSUM_COMPLETE) + if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); return 0; } @@ -1223,6 +1226,99 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_STACK, .arg4_type = ARG_CONST_STACK_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) +#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) + +static u64 bpf_l3_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __sum16 sum, *ptr; + + if (unlikely(offset > 0xffff)) + return -EFAULT; + + if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); + if (unlikely(!ptr)) + return -EFAULT; + + switch (BPF_HEADER_FIELD_SIZE(flags)) { + case 2: + csum_replace2(ptr, from, to); + break; + case 4: + csum_replace4(ptr, from, to); + break; + default: + return -EINVAL; + } + + if (ptr == &sum) + /* skb_store_bits guaranteed to not return -EFAULT here */ + skb_store_bits(skb, offset, ptr, sizeof(sum)); + + return 0; +} + +const struct bpf_func_proto bpf_l3_csum_replace_proto = { + .func = bpf_l3_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static u64 bpf_l4_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); + __sum16 sum, *ptr; + + if (unlikely(offset > 0xffff)) + return -EFAULT; + + if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); + if (unlikely(!ptr)) + return -EFAULT; + + switch (BPF_HEADER_FIELD_SIZE(flags)) { + case 2: + inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); + break; + case 4: + inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); + break; + default: + return -EINVAL; + } + + if (ptr == &sum) + /* skb_store_bits guaranteed to not return -EFAULT here */ + skb_store_bits(skb, offset, ptr, sizeof(sum)); + + return 0; +} + +const struct bpf_func_proto bpf_l4_csum_replace_proto = { + .func = bpf_l4_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto * @@ -1250,6 +1346,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; default: return sk_filter_func_proto(func_id); } diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index b5b3600dcdf5..d24f51bca465 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -17,6 +17,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o always := $(hostprogs-y) always += sockex1_kern.o always += sockex2_kern.o +always += tcbpf1_kern.o HOSTCFLAGS += -I$(objtree)/usr/include diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index ca0333146006..72540ec1f003 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -37,4 +37,11 @@ struct bpf_map_def { unsigned int max_entries; }; +static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = + (void *) BPF_FUNC_skb_store_bytes; +static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l3_csum_replace; +static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l4_csum_replace; + #endif diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c new file mode 100644 index 000000000000..7cf3f42a6e39 --- /dev/null +++ b/samples/bpf/tcbpf1_kern.c @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +/* compiler workaround */ +#define _htonl __builtin_bswap32 + +static inline void set_dst_mac(struct __sk_buff *skb, char *mac) +{ + bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1); +} + +/* use 1 below for ingress qdisc and 0 for egress */ +#if 0 +#undef ETH_HLEN +#define ETH_HLEN 0 +#endif + +#define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check)) +#define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos)) + +static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) +{ + __u8 old_tos = load_byte(skb, TOS_OFF); + + bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); + bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); +} + +#define TCP_CSUM_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, check)) +#define IP_SRC_OFF (ETH_HLEN + offsetof(struct iphdr, saddr)) + +#define IS_PSEUDO 0x10 + +static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) +{ + __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); + + bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); + bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); + bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0); +} + +#define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) +static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) +{ + __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); + + bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); + bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); +} + +SEC("classifier") +int bpf_prog1(struct __sk_buff *skb) +{ + __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); + long *value; + + if (proto == IPPROTO_TCP) { + set_ip_tos(skb, 8); + set_tcp_ip_src(skb, 0xA010101); + set_tcp_dest_port(skb, 5001); + } + + return 0; +} +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 9a9634545c7051f567096117d417e9c3be24706d Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 7 Apr 2015 11:51:53 +0200 Subject: netns: notify netns id events With this patch, netns ids that are created and deleted are advertised into the group RTNLGRP_NSID. Because callers of rtnl_net_notifyid() already know the id of the peer, there is no need to call __peernet2id() in rtnl_net_fill(). Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 4 ++++ net/core/net_namespace.c | 52 +++++++++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index bea910f924dd..974db03f7b1a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -134,6 +134,8 @@ enum { RTM_NEWNSID = 88, #define RTM_NEWNSID RTM_NEWNSID + RTM_DELNSID = 89, +#define RTM_DELNSID RTM_DELNSID RTM_GETNSID = 90, #define RTM_GETNSID RTM_GETNSID @@ -635,6 +637,8 @@ enum rtnetlink_groups { #define RTNLGRP_MDB RTNLGRP_MDB RTNLGRP_MPLS_ROUTE, #define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE + RTNLGRP_NSID, +#define RTNLGRP_NSID RTNLGRP_NSID __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index be28afccfbbb..b3b5f22f0e90 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -148,9 +148,11 @@ static void ops_free_list(const struct pernet_operations *ops, } } +static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, + int id); static int alloc_netid(struct net *net, struct net *peer, int reqid) { - int min = 0, max = 0; + int min = 0, max = 0, id; ASSERT_RTNL(); @@ -159,7 +161,11 @@ static int alloc_netid(struct net *net, struct net *peer, int reqid) max = reqid + 1; } - return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); + id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); + if (id >= 0) + rtnl_net_notifyid(net, peer, RTM_NEWNSID, id); + + return id; } /* This function is used by idr_for_each(). If net is equal to peer, the @@ -359,8 +365,10 @@ static void cleanup_net(struct work_struct *work) for_each_net(tmp) { int id = __peernet2id(tmp, net, false); - if (id >= 0) + if (id >= 0) { + rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); idr_remove(&tmp->netns_ids, id); + } } idr_destroy(&net->netns_ids); @@ -531,7 +539,8 @@ static int rtnl_net_get_size(void) } static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, struct net *peer) + int cmd, struct net *net, struct net *peer, + int nsid) { struct nlmsghdr *nlh; struct rtgenmsg *rth; @@ -546,9 +555,13 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - id = __peernet2id(net, peer, false); - if (id < 0) - id = NETNSA_NSID_NOT_ASSIGNED; + if (nsid >= 0) { + id = nsid; + } else { + id = __peernet2id(net, peer, false); + if (id < 0) + id = NETNSA_NSID_NOT_ASSIGNED; + } if (nla_put_s32(skb, NETNSA_NSID, id)) goto nla_put_failure; @@ -589,7 +602,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) } err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_GETNSID, net, peer); + RTM_GETNSID, net, peer, -1); if (err < 0) goto err_out; @@ -603,6 +616,29 @@ out: return err; } +static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, + int id) +{ + struct sk_buff *msg; + int err = -ENOMEM; + + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); + if (!msg) + goto out; + + err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id); + if (err < 0) + goto err_out; + + rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0); + return; + +err_out: + nlmsg_free(msg); +out: + rtnl_set_sk_err(net, RTNLGRP_NSID, err); +} + static int __init net_ns_init(void) { struct net_generic *ng; -- cgit v1.2.3 From 22fe54d5fefcfa98c58cc2f4607dd26d9648b3f5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 5 Apr 2015 14:41:08 +0200 Subject: netfilter: nf_tables: add support for dynamic set updates Add a new "dynset" expression for dynamic set updates. A new set op ->update() is added which, for non existant elements, invokes an initialization callback and inserts the new element. For both new or existing elements the extenstion pointer is returned to the caller to optionally perform timer updates or other actions. Element removal is not supported so far, however that seems to be a rather exotic need and can be added later on. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 17 +++ include/net/netfilter/nf_tables_core.h | 3 + include/uapi/linux/netfilter/nf_tables.h | 27 ++++ net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_api.c | 10 +- net/netfilter/nf_tables_core.c | 7 + net/netfilter/nft_dynset.c | 218 +++++++++++++++++++++++++++++++ net/netfilter/nft_hash.c | 37 ++++++ 8 files changed, 315 insertions(+), 6 deletions(-) create mode 100644 net/netfilter/nft_dynset.c (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e7e6365c248f..38c3496f7bf2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -196,6 +196,7 @@ struct nft_set_estimate { }; struct nft_set_ext; +struct nft_expr; /** * struct nft_set_ops - nf_tables set operations @@ -218,6 +219,15 @@ struct nft_set_ops { bool (*lookup)(const struct nft_set *set, const struct nft_data *key, const struct nft_set_ext **ext); + bool (*update)(struct nft_set *set, + const struct nft_data *key, + void *(*new)(struct nft_set *, + const struct nft_expr *, + struct nft_data []), + const struct nft_expr *expr, + struct nft_data data[], + const struct nft_set_ext **ext); + int (*insert)(const struct nft_set *set, const struct nft_set_elem *elem); void (*activate)(const struct nft_set *set, @@ -466,6 +476,11 @@ static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, return elem + set->ops->elemsize; } +void *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const struct nft_data *key, + const struct nft_data *data, + u64 timeout, gfp_t gfp); void nft_set_elem_destroy(const struct nft_set *set, void *elem); /** @@ -845,6 +860,8 @@ static inline u8 nft_genmask_cur(const struct net *net) return 1 << ACCESS_ONCE(net->nft.gencursor); } +#define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) + /* * Set element transaction helpers */ diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index a75fc8e27cd6..c6f400cfaac8 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -31,6 +31,9 @@ void nft_cmp_module_exit(void); int nft_lookup_module_init(void); void nft_lookup_module_exit(void); +int nft_dynset_module_init(void); +void nft_dynset_module_exit(void); + int nft_bitwise_module_init(void); void nft_bitwise_module_exit(void); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 83441cc4594b..0b87b2f67fe3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -515,6 +515,33 @@ enum nft_lookup_attributes { }; #define NFTA_LOOKUP_MAX (__NFTA_LOOKUP_MAX - 1) +enum nft_dynset_ops { + NFT_DYNSET_OP_ADD, + NFT_DYNSET_OP_UPDATE, +}; + +/** + * enum nft_dynset_attributes - dynset expression attributes + * + * @NFTA_DYNSET_SET_NAME: name of set the to add data to (NLA_STRING) + * @NFTA_DYNSET_SET_ID: uniquely identifier of the set in the transaction (NLA_U32) + * @NFTA_DYNSET_OP: operation (NLA_U32) + * @NFTA_DYNSET_SREG_KEY: source register of the key (NLA_U32) + * @NFTA_DYNSET_SREG_DATA: source register of the data (NLA_U32) + * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64) + */ +enum nft_dynset_attributes { + NFTA_DYNSET_UNSPEC, + NFTA_DYNSET_SET_NAME, + NFTA_DYNSET_SET_ID, + NFTA_DYNSET_OP, + NFTA_DYNSET_SREG_KEY, + NFTA_DYNSET_SREG_DATA, + NFTA_DYNSET_TIMEOUT, + __NFTA_DYNSET_MAX, +}; +#define NFTA_DYNSET_MAX (__NFTA_DYNSET_MAX - 1) + /** * enum nft_payload_bases - nf_tables payload expression offset bases * diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 89f73a9e9874..a87d8b8ec730 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -70,7 +70,7 @@ obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o # nf_tables nf_tables-objs += nf_tables_core.o nf_tables_api.o -nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o obj-$(CONFIG_NF_TABLES) += nf_tables.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 90b898491da7..598e53eb64b3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3183,11 +3183,11 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, return trans; } -static void *nft_set_elem_init(const struct nft_set *set, - const struct nft_set_ext_tmpl *tmpl, - const struct nft_data *key, - const struct nft_data *data, - u64 timeout, gfp_t gfp) +void *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const struct nft_data *key, + const struct nft_data *data, + u64 timeout, gfp_t gfp) { struct nft_set_ext *ext; void *elem; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index ef4dfcbaf149..7caf08a9225d 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -239,8 +239,14 @@ int __init nf_tables_core_module_init(void) if (err < 0) goto err6; + err = nft_dynset_module_init(); + if (err < 0) + goto err7; + return 0; +err7: + nft_payload_module_exit(); err6: nft_byteorder_module_exit(); err5: @@ -257,6 +263,7 @@ err1: void nf_tables_core_module_exit(void) { + nft_dynset_module_exit(); nft_payload_module_exit(); nft_byteorder_module_exit(); nft_bitwise_module_exit(); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c new file mode 100644 index 000000000000..eeb72dee78ef --- /dev/null +++ b/net/netfilter/nft_dynset.c @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2015 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_dynset { + struct nft_set *set; + struct nft_set_ext_tmpl tmpl; + enum nft_dynset_ops op:8; + enum nft_registers sreg_key:8; + enum nft_registers sreg_data:8; + u64 timeout; + struct nft_set_binding binding; +}; + +static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1]) +{ + const struct nft_dynset *priv = nft_expr_priv(expr); + u64 timeout; + void *elem; + + if (set->size && !atomic_add_unless(&set->nelems, 1, set->size)) + return NULL; + + timeout = priv->timeout ? : set->timeout; + elem = nft_set_elem_init(set, &priv->tmpl, + &data[priv->sreg_key], &data[priv->sreg_data], + timeout, GFP_ATOMIC); + if (elem == NULL) { + if (set->size) + atomic_dec(&set->nelems); + } + return elem; +} + +static void nft_dynset_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_dynset *priv = nft_expr_priv(expr); + struct nft_set *set = priv->set; + const struct nft_set_ext *ext; + u64 timeout; + + if (set->ops->update(set, &data[priv->sreg_key], nft_dynset_new, + expr, data, &ext)) { + if (priv->op == NFT_DYNSET_OP_UPDATE && + nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + timeout = priv->timeout ? : set->timeout; + *nft_set_ext_expiration(ext) = jiffies + timeout; + return; + } + } + + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { + [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING }, + [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, + [NFTA_DYNSET_OP] = { .type = NLA_U32 }, + [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, + [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, + [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, +}; + +static int nft_dynset_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + struct nft_set *set; + u64 timeout; + int err; + + if (tb[NFTA_DYNSET_SET_NAME] == NULL || + tb[NFTA_DYNSET_OP] == NULL || + tb[NFTA_DYNSET_SREG_KEY] == NULL) + return -EINVAL; + + set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME]); + if (IS_ERR(set)) { + if (tb[NFTA_DYNSET_SET_ID]) + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_DYNSET_SET_ID]); + if (IS_ERR(set)) + return PTR_ERR(set); + } + + if (set->flags & NFT_SET_CONSTANT) + return -EBUSY; + + priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); + switch (priv->op) { + case NFT_DYNSET_OP_ADD: + break; + case NFT_DYNSET_OP_UPDATE: + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; + break; + default: + return -EOPNOTSUPP; + } + + timeout = 0; + if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + timeout = be64_to_cpu(nla_get_be64(tb[NFTA_DYNSET_TIMEOUT])); + } + + priv->sreg_key = ntohl(nla_get_be32(tb[NFTA_DYNSET_SREG_KEY])); + err = nft_validate_input_register(priv->sreg_key); + if (err < 0) + return err; + + if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { + if (!(set->flags & NFT_SET_MAP)) + return -EINVAL; + if (set->dtype == NFT_DATA_VERDICT) + return -EOPNOTSUPP; + + priv->sreg_data = ntohl(nla_get_be32(tb[NFTA_DYNSET_SREG_DATA])); + err = nft_validate_input_register(priv->sreg_data); + if (err < 0) + return err; + } else if (set->flags & NFT_SET_MAP) + return -EINVAL; + + nft_set_ext_prepare(&priv->tmpl); + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); + if (set->flags & NFT_SET_MAP) + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen); + if (set->flags & NFT_SET_TIMEOUT) { + if (timeout || set->timeout) + nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); + } + + priv->timeout = timeout; + + err = nf_tables_bind_set(ctx, set, &priv->binding); + if (err < 0) + return err; + + priv->set = set; + return 0; +} + +static void nft_dynset_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + +static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_dynset *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_DYNSET_SREG_KEY, htonl(priv->sreg_key))) + goto nla_put_failure; + if (priv->set->flags & NFT_SET_MAP && + nla_put_be32(skb, NFTA_DYNSET_SREG_DATA, htonl(priv->sreg_data))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_DYNSET_OP, htonl(priv->op))) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name)) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dynset_type; +static const struct nft_expr_ops nft_dynset_ops = { + .type = &nft_dynset_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dynset)), + .eval = nft_dynset_eval, + .init = nft_dynset_init, + .destroy = nft_dynset_destroy, + .dump = nft_dynset_dump, +}; + +static struct nft_expr_type nft_dynset_type __read_mostly = { + .name = "dynset", + .ops = &nft_dynset_ops, + .policy = nft_dynset_policy, + .maxattr = NFTA_DYNSET_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_dynset_module_init(void) +{ + return nft_register_expr(&nft_dynset_type); +} + +void nft_dynset_module_exit(void) +{ + nft_unregister_expr(&nft_dynset_type); +} diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index c74e2bf1a1e4..bc23806b7fbe 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -90,6 +90,42 @@ static bool nft_hash_lookup(const struct nft_set *set, return !!he; } +static bool nft_hash_update(struct nft_set *set, const struct nft_data *key, + void *(*new)(struct nft_set *, + const struct nft_expr *, + struct nft_data []), + const struct nft_expr *expr, + struct nft_data data[], + const struct nft_set_ext **ext) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he; + struct nft_hash_cmp_arg arg = { + .genmask = NFT_GENMASK_ANY, + .set = set, + .key = key, + }; + + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + if (he != NULL) + goto out; + + he = new(set, expr, data); + if (he == NULL) + goto err1; + if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params)) + goto err2; +out: + *ext = &he->ext; + return true; + +err2: + nft_set_elem_destroy(set, he); +err1: + return false; +} + static int nft_hash_insert(const struct nft_set *set, const struct nft_set_elem *elem) { @@ -335,6 +371,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .deactivate = nft_hash_deactivate, .remove = nft_hash_remove, .lookup = nft_hash_lookup, + .update = nft_hash_update, .walk = nft_hash_walk, .features = NFT_SET_MAP | NFT_SET_TIMEOUT, .owner = THIS_MODULE, -- cgit v1.2.3 From 68e942e88add0ac8576fc8397e86495edf3dcea7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 5 Apr 2015 14:43:38 +0200 Subject: netfilter: nf_tables: support optional userdata for set elements Add an userdata set extension and allow the user to attach arbitrary data to set elements. This is intended to hold TLV encoded data like comments or DNS annotations that have no meaning to the kernel. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +++++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 38c3496f7bf2..63c44bdfdd3b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -350,6 +350,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time + * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { @@ -358,6 +359,7 @@ enum nft_set_extensions { NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, + NFT_SET_EXT_USERDATA, NFT_SET_EXT_NUM }; @@ -464,6 +466,11 @@ static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ex return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } +static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_USERDATA); +} + static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0b87b2f67fe3..05ee1e0804a3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -292,6 +292,7 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_FLAGS: bitmask of nft_set_elem_flags (NLA_U32) * @NFTA_SET_ELEM_TIMEOUT: timeout value (NLA_U64) * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64) + * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, @@ -300,6 +301,7 @@ enum nft_set_elem_attributes { NFTA_SET_ELEM_FLAGS, NFTA_SET_ELEM_TIMEOUT, NFTA_SET_ELEM_EXPIRATION, + NFTA_SET_ELEM_USERDATA, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 598e53eb64b3..0b96fa0d64b2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2872,6 +2872,10 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, + [NFT_SET_EXT_USERDATA] = { + .len = sizeof(struct nft_userdata), + .align = __alignof__(struct nft_userdata), + }, }; EXPORT_SYMBOL_GPL(nft_set_ext_types); @@ -2884,6 +2888,8 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -2964,6 +2970,15 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, goto nla_put_failure; } + if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { + struct nft_userdata *udata; + + udata = nft_set_ext_userdata(ext); + if (nla_put(skb, NFTA_SET_ELEM_USERDATA, + udata->len + 1, udata->data)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); return 0; @@ -3232,11 +3247,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_ext *ext; struct nft_set_elem elem; struct nft_set_binding *binding; + struct nft_userdata *udata; struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; u64 timeout; u32 flags; + u8 ulen; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -3325,6 +3342,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_DATA); } + /* The full maximum length of userdata can exceed the maximum + * offset value (U8_MAX) for following extensions, therefor it + * must be the last extension added. + */ + ulen = 0; + if (nla[NFTA_SET_ELEM_USERDATA] != NULL) { + ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]); + if (ulen > 0) + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA, + ulen); + } + err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, &elem.key, &data, timeout, GFP_KERNEL); @@ -3334,6 +3363,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; + if (ulen > 0) { + udata = nft_set_ext_userdata(ext); + udata->len = ulen - 1; + nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); + } trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) -- cgit v1.2.3 From 01a3d796813d6302af9f828f34b73d21a4b96c9a Mon Sep 17 00:00:00 2001 From: Vlad Zolotarov Date: Mon, 30 Mar 2015 21:35:23 +0300 Subject: if_link: Add an additional parameter to ifla_vf_info for RSS querying Add configuration setting for drivers to allow/block an RSS Redirection Table and a Hash Key querying for discrete VFs. On some devices VF share the mentioned above information with PF and querying it may adduce a theoretical security risk. We want to let a system administrator to decide if he/she wants to take this risk or not. Signed-off-by: Vlad Zolotarov Tested-by: Phil Schmitt Signed-off-by: Jeff Kirsher --- include/linux/if_link.h | 1 + include/linux/netdevice.h | 8 ++++++++ include/uapi/linux/if_link.h | 8 ++++++++ net/core/rtnetlink.c | 32 ++++++++++++++++++++++++++------ 4 files changed, 43 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 119130e9298b..da4929927f69 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -14,5 +14,6 @@ struct ifla_vf_info { __u32 linkstate; __u32 min_tx_rate; __u32 max_tx_rate; + __u32 rss_query_en; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf6d9df34d7b..13acb3d8ecdd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -878,6 +878,11 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); + * + * Enable or disable the VF ability to query its RSS Redirection Table and + * Hash Key. This is needed since on some devices VF share this information + * with PF and querying it may adduce a theoretical security risk. + * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); * int (*ndo_setup_tc)(struct net_device *dev, u8 tc) * Called to setup 'tc' number of traffic classes in the net device. This @@ -1099,6 +1104,9 @@ struct net_device_ops { struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); + int (*ndo_set_vf_rss_query_en)( + struct net_device *dev, + int vf, bool setting); int (*ndo_setup_tc)(struct net_device *dev, u8 tc); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7ffb18df01ca..d9cd19214b98 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -465,6 +465,9 @@ enum { IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ + IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query + * on/off switch + */ __IFLA_VF_MAX, }; @@ -509,6 +512,11 @@ struct ifla_vf_link_state { __u32 link_state; }; +struct ifla_vf_rss_query_en { + __u32 vf; + __u32 setting; +}; + /* VF ports management section * * Nested layout of set/get msg is: diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7a836152359b..358d52a38533 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -818,7 +818,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + - nla_total_size(sizeof(struct ifla_vf_link_state))); + nla_total_size(sizeof(struct ifla_vf_link_state)) + + nla_total_size(sizeof(struct ifla_vf_rss_query_en))); return size; } else return 0; @@ -1132,14 +1133,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_rss_query_en vf_rss_query_en; /* * Not all SR-IOV capable drivers support the - * spoofcheck query. Preset to -1 so the user - * space tool can detect that the driver didn't - * report anything. + * spoofcheck and "RSS query enable" query. Preset to + * -1 so the user space tool can detect that the driver + * didn't report anything. */ ivi.spoofchk = -1; + ivi.rss_query_en = -1; memset(ivi.mac, 0, sizeof(ivi.mac)); /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero @@ -1152,7 +1155,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = - vf_linkstate.vf = ivi.vf; + vf_linkstate.vf = + vf_rss_query_en.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; @@ -1162,6 +1166,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; + vf_rss_query_en.setting = ivi.rss_query_en; vf = nla_nest_start(skb, IFLA_VF_INFO); if (!vf) { nla_nest_cancel(skb, vfinfo); @@ -1176,7 +1181,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), - &vf_linkstate)) + &vf_linkstate) || + nla_put(skb, IFLA_VF_RSS_QUERY_EN, + sizeof(vf_rss_query_en), + &vf_rss_query_en)) goto nla_put_failure; nla_nest_end(skb, vf); } @@ -1290,6 +1298,7 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, + [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1500,6 +1509,17 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) ivl->link_state); break; } + case IFLA_VF_RSS_QUERY_EN: { + struct ifla_vf_rss_query_en *ivrssq_en; + + ivrssq_en = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_rss_query_en) + err = ops->ndo_set_vf_rss_query_en(dev, + ivrssq_en->vf, + ivrssq_en->setting); + break; + } default: err = -EINVAL; break; -- cgit v1.2.3 From 7a6c8c34e5b71ac50e39588e20b39494a9e1d8e5 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 10 Apr 2015 12:00:30 -0700 Subject: fou: implement FOU_CMD_GET Cc: Tom Herbert Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/fou.h | 1 + net/ipv4/fou.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index c303588bb767..d2947c52dc67 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -25,6 +25,7 @@ enum { FOU_CMD_UNSPEC, FOU_CMD_ADD, FOU_CMD_DEL, + FOU_CMD_GET, __FOU_CMD_MAX, }; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index c244b1a65787..263710259774 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -21,6 +21,7 @@ struct fou { u8 protocol; u8 flags; __be16 port; + u16 type; struct udp_offload udp_offloads; struct list_head list; }; @@ -487,6 +488,8 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, goto error; } + fou->type = cfg->type; + udp_sk(sk)->encap_type = 1; udp_encap_enable(); @@ -617,6 +620,106 @@ static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) return fou_destroy(net, &cfg); } +static int fou_fill_info(struct fou *fou, struct sk_buff *msg) +{ + if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) || + nla_put_be16(msg, FOU_ATTR_PORT, fou->port) || + nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) || + nla_put_u8(msg, FOU_ATTR_TYPE, fou->type)) + return -1; + + if (fou->flags & FOU_F_REMCSUM_NOPARTIAL) + if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL)) + return -1; + return 0; +} + +static int fou_dump_info(struct fou *fou, u32 portid, u32 seq, + u32 flags, struct sk_buff *skb, u8 cmd) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + if (fou_fill_info(fou, skb) < 0) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct fou_net *fn = net_generic(net, fou_net_id); + struct sk_buff *msg; + struct fou_cfg cfg; + struct fou *fout; + __be16 port; + int ret; + + ret = parse_nl_config(info, &cfg); + if (ret) + return ret; + port = cfg.udp_config.local_udp_port; + if (port == 0) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + ret = -ESRCH; + mutex_lock(&fn->fou_lock); + list_for_each_entry(fout, &fn->fou_list, list) { + if (port == fout->port) { + ret = fou_dump_info(fout, info->snd_portid, + info->snd_seq, 0, msg, + info->genlhdr->cmd); + break; + } + } + mutex_unlock(&fn->fou_lock); + if (ret < 0) + goto out_free; + + return genlmsg_reply(msg, info); + +out_free: + nlmsg_free(msg); + return ret; +} + +static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct fou_net *fn = net_generic(net, fou_net_id); + struct fou *fout; + int idx = 0, ret; + + mutex_lock(&fn->fou_lock); + list_for_each_entry(fout, &fn->fou_list, list) { + if (idx++ < cb->args[0]) + continue; + ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, FOU_CMD_GET); + if (ret) + goto done; + } + mutex_unlock(&fn->fou_lock); + +done: + cb->args[0] = idx; + return skb->len; +} + static const struct genl_ops fou_nl_ops[] = { { .cmd = FOU_CMD_ADD, @@ -630,6 +733,12 @@ static const struct genl_ops fou_nl_ops[] = { .policy = fou_nl_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = FOU_CMD_GET, + .doit = fou_nl_cmd_get_port, + .dumpit = fou_nl_dump, + .policy = fou_nl_policy, + }, }; size_t fou_encap_hlen(struct ip_tunnel_encap *e) -- cgit v1.2.3 From 49499c3e6e18b7677a63316f3ff54a16533dc28f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 02:27:37 +0100 Subject: netfilter: nf_tables: switch registers to 32 bit addressing Switch the nf_tables registers from 128 bit addressing to 32 bit addressing to support so called concatenations, where multiple values can be concatenated over multiple registers for O(1) exact matches of multiple dimensions using sets. The old register values are mapped to areas of 128 bits for compatibility. When dumping register numbers, values are expressed using the old values if they refer to the beginning of a 128 bit area for compatibility. To support concatenations, register loads of less than a full 32 bit value need to be padded. This mainly affects the payload and exthdr expressions, which both unconditionally zero the last word before copying the data. Userspace fully passes the testsuite using both old and new register addressing. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 +++----- include/uapi/linux/netfilter/nf_tables.h | 31 +++++++++++++++++- net/bridge/netfilter/nft_meta_bridge.c | 2 +- net/ipv4/netfilter/nft_redir_ipv4.c | 4 +-- net/ipv6/netfilter/nft_redir_ipv6.c | 4 +-- net/netfilter/nf_tables_api.c | 54 +++++++++++++++++++++++++------- net/netfilter/nf_tables_core.c | 5 +-- net/netfilter/nft_bitwise.c | 4 +-- net/netfilter/nft_byteorder.c | 4 +-- net/netfilter/nft_ct.c | 4 +-- net/netfilter/nft_exthdr.c | 3 +- net/netfilter/nft_immediate.c | 2 +- net/netfilter/nft_lookup.c | 2 +- net/netfilter/nft_meta.c | 7 +++-- net/netfilter/nft_nat.c | 16 +++++----- net/netfilter/nft_payload.c | 3 +- 16 files changed, 110 insertions(+), 48 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f8f27a48bbe9..1f9b848c778c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -64,17 +64,15 @@ struct nft_data { */ struct nft_regs { union { - struct nft_data data[NFT_REG_MAX + 1]; + u32 data[20]; struct nft_verdict verdict; }; }; -static inline void nft_data_copy(struct nft_data *dst, - const struct nft_data *src) +static inline void nft_data_copy(u32 *dst, const struct nft_data *src, + unsigned int len) { - BUILD_BUG_ON(__alignof__(*dst) != __alignof__(u64)); - *(u64 *)&dst->data[0] = *(u64 *)&src->data[0]; - *(u64 *)&dst->data[2] = *(u64 *)&src->data[2]; + memcpy(dst, src, len); } static inline void nft_data_debug(const struct nft_data *data) @@ -502,8 +500,7 @@ static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const struct nft_data *key, - const struct nft_data *data, + const u32 *key, const u32 *data, u64 timeout, gfp_t gfp); void nft_set_elem_destroy(const struct nft_set *set, void *elem); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 05ee1e0804a3..4221a6c3a8a5 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -5,16 +5,45 @@ #define NFT_CHAIN_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 +/** + * enum nft_registers - nf_tables registers + * + * nf_tables used to have five registers: a verdict register and four data + * registers of size 16. The data registers have been changed to 16 registers + * of size 4. For compatibility reasons, the NFT_REG_[1-4] registers still + * map to areas of size 16, the 4 byte registers are addressed using + * NFT_REG32_00 - NFT_REG32_15. + */ enum nft_registers { NFT_REG_VERDICT, NFT_REG_1, NFT_REG_2, NFT_REG_3, NFT_REG_4, - __NFT_REG_MAX + __NFT_REG_MAX, + + NFT_REG32_00 = 8, + MFT_REG32_01, + NFT_REG32_02, + NFT_REG32_03, + NFT_REG32_04, + NFT_REG32_05, + NFT_REG32_06, + NFT_REG32_07, + NFT_REG32_08, + NFT_REG32_09, + NFT_REG32_10, + NFT_REG32_11, + NFT_REG32_12, + NFT_REG32_13, + NFT_REG32_14, + NFT_REG32_15, }; #define NFT_REG_MAX (__NFT_REG_MAX - 1) +#define NFT_REG_SIZE 16 +#define NFT_REG32_SIZE 4 + /** * enum nft_verdicts - nf_tables internal verdicts * diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 99dab70ecae0..a21269b83f16 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -24,7 +24,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, { const struct nft_meta *priv = nft_expr_priv(expr); const struct net_device *in = pkt->in, *out = pkt->out; - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; const struct net_bridge_port *p; switch (priv->key) { diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index 312cf6f3b6dc..d8d795df9c13 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -27,9 +27,9 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, memset(&mr, 0, sizeof(mr)); if (priv->sreg_proto_min) { mr.range[0].min.all = - *(__be16 *)®s->data[priv->sreg_proto_min].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_min]; mr.range[0].max.all = - *(__be16 *)®s->data[priv->sreg_proto_max].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_max]; mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index 0eed774815cf..effd393bd517 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -27,9 +27,9 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_min], range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_max], range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a25fd19453e7..03faf76ce3b8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3201,8 +3201,7 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const struct nft_data *key, - const struct nft_data *data, + const u32 *key, const u32 *data, u64 timeout, gfp_t gfp) { struct nft_set_ext *ext; @@ -3357,7 +3356,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, &elem.key, &data, + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.data, data.data, timeout, GFP_KERNEL); if (elem.priv == NULL) goto err3; @@ -4122,14 +4121,47 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, return 0; } +/** + * nft_parse_register - parse a register value from a netlink attribute + * + * @attr: netlink attribute + * + * Parse and translate a register value from a netlink attribute. + * Registers used to be 128 bit wide, these register numbers will be + * mapped to the corresponding 32 bit register numbers. + */ unsigned int nft_parse_register(const struct nlattr *attr) { - return ntohl(nla_get_be32(attr)); + unsigned int reg; + + reg = ntohl(nla_get_be32(attr)); + switch (reg) { + case NFT_REG_VERDICT...NFT_REG_4: + return reg * NFT_REG_SIZE / NFT_REG32_SIZE; + default: + return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; + } } EXPORT_SYMBOL_GPL(nft_parse_register); +/** + * nft_dump_register - dump a register value to a netlink attribute + * + * @skb: socket buffer + * @attr: attribute number + * @reg: register number + * + * Construct a netlink attribute containing the register number. For + * compatibility reasons, register numbers being a multiple of 4 are + * translated to the corresponding 128 bit register numbers. + */ int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg) { + if (reg % (NFT_REG_SIZE / NFT_REG32_SIZE) == 0) + reg = reg / (NFT_REG_SIZE / NFT_REG32_SIZE); + else + reg = reg - NFT_REG_SIZE / NFT_REG32_SIZE + NFT_REG32_00; + return nla_put_be32(skb, attr, htonl(reg)); } EXPORT_SYMBOL_GPL(nft_dump_register); @@ -4145,14 +4177,13 @@ EXPORT_SYMBOL_GPL(nft_dump_register); */ int nft_validate_register_load(enum nft_registers reg, unsigned int len) { - if (reg <= NFT_REG_VERDICT) + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; - if (reg > NFT_REG_MAX) - return -ERANGE; if (len == 0) return -EINVAL; - if (len > FIELD_SIZEOF(struct nft_data, data)) + if (reg * NFT_REG32_SIZE + len > FIELD_SIZEOF(struct nft_regs, data)) return -ERANGE; + return 0; } EXPORT_SYMBOL_GPL(nft_validate_register_load); @@ -4200,13 +4231,12 @@ int nft_validate_register_store(const struct nft_ctx *ctx, return 0; default: - if (reg < NFT_REG_1) + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; - if (reg > NFT_REG_MAX) - return -ERANGE; if (len == 0) return -EINVAL; - if (len > FIELD_SIZEOF(struct nft_data, data)) + if (reg * NFT_REG32_SIZE + len > + FIELD_SIZEOF(struct nft_regs, data)) return -ERANGE; if (data != NULL && type != NFT_DATA_VALUE) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 5ef07d17b358..f153b07073af 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -70,7 +70,7 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr, const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); u32 mask = nft_cmp_fast_mask(priv->len); - if ((regs->data[priv->sreg].data[0] & mask) == priv->data) + if ((regs->data[priv->sreg] & mask) == priv->data) return; regs->verdict.code = NFT_BREAK; } @@ -81,7 +81,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; unsigned char *ptr; if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) @@ -94,6 +94,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (unlikely(ptr + priv->len >= skb_tail_pointer(skb))) return false; + *dest = 0; if (priv->len == 2) *(u16 *)dest = *(u16 *)ptr; else if (priv->len == 4) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index aa1147032ace..f1a9be2aecd1 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -30,8 +30,8 @@ static void nft_bitwise_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); - const u32 *src = ®s->data[priv->sreg].data[0]; - u32 *dst = ®s->data[priv->dreg].data[0]; + const u32 *src = ®s->data[priv->sreg]; + u32 *dst = ®s->data[priv->dreg]; unsigned int i; for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 2ee3e57ad814..fde5145f2e36 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -30,8 +30,8 @@ static void nft_byteorder_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_byteorder *priv = nft_expr_priv(expr); - u32 *src = ®s->data[priv->sreg].data[0]; - u32 *dst = ®s->data[priv->dreg].data[0]; + u32 *src = ®s->data[priv->sreg]; + u32 *dst = ®s->data[priv->dreg]; union { u32 u32; u16 u16; } *s, *d; unsigned int i; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index fab8e754b18a..8cbca3432f90 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -35,7 +35,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; const struct nf_conn_help *help; @@ -156,7 +156,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; #ifdef CONFIG_NF_CONNTRACK_MARK - u32 value = regs->data[priv->sreg].data[0]; + u32 value = regs->data[priv->sreg]; #endif enum ip_conntrack_info ctinfo; struct nf_conn *ct; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 098ffee793d7..ba7aed13e174 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -30,7 +30,7 @@ static void nft_exthdr_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; unsigned int offset = 0; int err; @@ -39,6 +39,7 @@ static void nft_exthdr_eval(const struct nft_expr *expr, goto err; offset += priv->offset; + dest[priv->len / NFT_REG32_SIZE] = 0; if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) goto err; return; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 0682f600c7a5..1e8e412eadae 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -29,7 +29,7 @@ static void nft_immediate_eval(const struct nft_expr *expr, { const struct nft_immediate_expr *priv = nft_expr_priv(expr); - nft_data_copy(®s->data[priv->dreg], &priv->data); + nft_data_copy(®s->data[priv->dreg], &priv->data, priv->dlen); } static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index fc7afff81566..ba1466209f2a 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -36,7 +36,7 @@ static void nft_lookup_eval(const struct nft_expr *expr, if (set->ops->lookup(set, ®s->data[priv->sreg], &ext)) { if (set->flags & NFT_SET_MAP) nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext)); + nft_set_ext_data(ext), set->dlen); return; } regs->verdict.code = NFT_BREAK; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 5f744eb61de5..52561e1c31e2 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -31,13 +31,14 @@ void nft_meta_get_eval(const struct nft_expr *expr, const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = pkt->in, *out = pkt->out; - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; switch (priv->key) { case NFT_META_LEN: *dest = skb->len; break; case NFT_META_PROTOCOL: + *dest = 0; *(__be16 *)dest = skb->protocol; break; case NFT_META_NFPROTO: @@ -75,11 +76,13 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_IIFTYPE: if (in == NULL) goto err; + *dest = 0; *(u16 *)dest = in->type; break; case NFT_META_OIFTYPE: if (out == NULL) goto err; + *dest = 0; *(u16 *)dest = out->type; break; case NFT_META_SKUID: @@ -185,7 +188,7 @@ void nft_meta_set_eval(const struct nft_expr *expr, { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; - u32 value = regs->data[meta->sreg].data[0]; + u32 value = regs->data[meta->sreg]; switch (meta->key) { case NFT_META_MARK: diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 065cbda63b0a..ee2d71753746 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -49,26 +49,26 @@ static void nft_nat_eval(const struct nft_expr *expr, if (priv->sreg_addr_min) { if (priv->family == AF_INET) { range.min_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_min].data[0]; + regs->data[priv->sreg_addr_min]; range.max_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_max].data[0]; + regs->data[priv->sreg_addr_max]; } else { memcpy(range.min_addr.ip6, - ®s->data[priv->sreg_addr_min].data, - sizeof(struct nft_data)); + ®s->data[priv->sreg_addr_min], + sizeof(range.min_addr.ip6)); memcpy(range.max_addr.ip6, - ®s->data[priv->sreg_addr_max].data, - sizeof(struct nft_data)); + ®s->data[priv->sreg_addr_max], + sizeof(range.max_addr.ip6)); } range.flags |= NF_NAT_RANGE_MAP_IPS; } if (priv->sreg_proto_min) { range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_min]; range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_max]; range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 5fa997346a23..94fb3b27a2c5 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -23,7 +23,7 @@ static void nft_payload_eval(const struct nft_expr *expr, { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; int offset; switch (priv->base) { @@ -43,6 +43,7 @@ static void nft_payload_eval(const struct nft_expr *expr, } offset += priv->offset; + dest[priv->len / NFT_REG32_SIZE] = 0; if (skb_copy_bits(skb, offset, dest, priv->len) < 0) goto err; return; -- cgit v1.2.3 From 7d7402642eaf385aef0772eff5a35e34fc4995d7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 02:27:39 +0100 Subject: netfilter: nf_tables: variable sized set element keys / data This patch changes sets to support variable sized set element keys / data up to 64 bytes each by using variable sized set extensions. This allows to use concatenations with bigger data items suchs as IPv6 addresses. As a side effect, small keys/data now don't require the full 16 bytes of struct nft_data anymore but just the space they need. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 ++++- include/uapi/linux/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 27 ++++++++++++--------------- net/netfilter/nft_hash.c | 4 ++-- net/netfilter/nft_rbtree.c | 3 ++- 5 files changed, 23 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 160577bf0f0a..cb42da1011ef 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -158,7 +158,10 @@ struct nft_userdata { * @priv: element private data and extensions */ struct nft_set_elem { - struct nft_data key; + union { + u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; + struct nft_data val; + } key; void *priv; }; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 4221a6c3a8a5..be8584c95297 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -388,6 +388,9 @@ enum nft_data_attributes { }; #define NFTA_DATA_MAX (__NFTA_DATA_MAX - 1) +/* Maximum length of a value */ +#define NFT_DATA_VALUE_MAXLEN 64 + /** * enum nft_verdict_attributes - nf_tables verdict netlink attributes * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2b3f88f4c70f..ed0e70ea2bc5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2608,7 +2608,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, } desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); - if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data)) + if (desc.klen == 0 || desc.klen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; flags = 0; @@ -2634,11 +2634,10 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); - if (desc.dlen == 0 || - desc.dlen > FIELD_SIZEOF(struct nft_data, data)) + if (desc.dlen == 0 || desc.dlen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; } else - desc.dlen = sizeof(struct nft_data); + desc.dlen = sizeof(struct nft_verdict); } else if (flags & NFT_SET_MAP) return -EINVAL; @@ -2854,12 +2853,10 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_KEY] = { - .len = sizeof(struct nft_data), - .align = __alignof__(struct nft_data), + .align = __alignof__(u32), }, [NFT_SET_EXT_DATA] = { - .len = sizeof(struct nft_data), - .align = __alignof__(struct nft_data), + .align = __alignof__(u32), }, [NFT_SET_EXT_FLAGS] = { .len = sizeof(u8), @@ -3299,7 +3296,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, timeout = set->timeout; } - err = nft_data_init(ctx, &elem.key, sizeof(elem.key), &d1, + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; @@ -3307,7 +3304,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) goto err2; - nft_set_ext_add(&tmpl, NFT_SET_EXT_KEY); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len); if (timeout > 0) { nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); if (timeout != set->timeout) @@ -3342,7 +3339,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err3; } - nft_set_ext_add(&tmpl, NFT_SET_EXT_DATA); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); } /* The full maximum length of userdata can exceed the maximum @@ -3358,7 +3355,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.data, data.data, + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, timeout, GFP_KERNEL); if (elem.priv == NULL) goto err3; @@ -3393,7 +3390,7 @@ err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_uninit(&data, d2.type); err2: - nft_data_uninit(&elem.key, d1.type); + nft_data_uninit(&elem.key.val, d1.type); err1: return err; } @@ -3460,7 +3457,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_KEY] == NULL) goto err1; - err = nft_data_init(ctx, &elem.key, sizeof(elem.key), &desc, + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; @@ -3488,7 +3485,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err3: kfree(trans); err2: - nft_data_uninit(&elem.key, desc.type); + nft_data_uninit(&elem.key.val, desc.type); err1: return err; } diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 767df41d28ea..3f9d45d3d9b7 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -133,7 +133,7 @@ static int nft_hash_insert(const struct nft_set *set, struct nft_hash_cmp_arg arg = { .genmask = nft_genmask_next(read_pnet(&set->pnet)), .set = set, - .key = elem->key.data, + .key = elem->key.val.data, }; return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, @@ -157,7 +157,7 @@ static void *nft_hash_deactivate(const struct nft_set *set, struct nft_hash_cmp_arg arg = { .genmask = nft_genmask_next(read_pnet(&set->pnet)), .set = set, - .key = elem->key.data, + .key = elem->key.val.data, }; rcu_read_lock(); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index b888e0cdf1e2..1c30f41cff5b 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -152,7 +152,8 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val, + set->klen); if (d < 0) parent = parent->rb_left; else if (d > 0) -- cgit v1.2.3 From 24477e57412a7a7dea62637ac990bc5c1cff0665 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 8 Apr 2015 19:41:40 +0200 Subject: uapi: ebtables: don't include linux/if.h linux/if.h creates conflicts in userspace with net/if.h By using it here we force userspace to use linux/if.h while net/if.h may be needed. Note that: include/linux/netfilter_ipv4/ip_tables.h and include/linux/netfilter_ipv6/ip6_tables.h don't include linux/if.h and they also refer to IFNAMSIZ, so they are expecting userspace to include use net/if.h from the client program. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 3 ++- include/uapi/linux/netfilter_bridge/ebtables.h | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 34e7a2b7f867..f1bd3962e6b6 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -12,9 +12,10 @@ #ifndef __LINUX_BRIDGE_EFF_H #define __LINUX_BRIDGE_EFF_H +#include +#include #include - /* return values for match() functions */ #define EBT_MATCH 0 #define EBT_NOMATCH 1 diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index ba993360dbe9..773dfe8924c7 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -12,9 +12,7 @@ #ifndef _UAPI__LINUX_BRIDGE_EFF_H #define _UAPI__LINUX_BRIDGE_EFF_H -#include #include -#include #define EBT_TABLE_MAXNAMELEN 32 #define EBT_CHAIN_MAXNAMELEN EBT_TABLE_MAXNAMELEN -- cgit v1.2.3 From f25ad2e907f110378159fe5e088aa13176faaa5b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 10:46:39 +0100 Subject: netfilter: nf_tables: prepare for expressions associated to set elements Preparation to attach expressions to set elements: add a set extension type to hold an expression and dump the expression information with the set element. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +++++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 9 +++++++++ 3 files changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e21623cb7b20..d45a871b3da6 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -371,6 +371,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element + * @NFT_SET_EXT_EXPR: expression assiociated with the element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { @@ -380,6 +381,7 @@ enum nft_set_extensions { NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, + NFT_SET_EXT_EXPR, NFT_SET_EXT_NUM }; @@ -491,6 +493,11 @@ static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } +static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_EXPR); +} + static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index be8584c95297..f9c5af22a6af 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -322,6 +322,7 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_TIMEOUT: timeout value (NLA_U64) * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64) * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY) + * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, @@ -331,6 +332,7 @@ enum nft_set_elem_attributes { NFTA_SET_ELEM_TIMEOUT, NFTA_SET_ELEM_EXPIRATION, NFTA_SET_ELEM_USERDATA, + NFTA_SET_ELEM_EXPR, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e97bee59fe08..8830811550ec 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2904,6 +2904,9 @@ const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_DATA] = { .align = __alignof__(u32), }, + [NFT_SET_EXT_EXPR] = { + .align = __alignof__(struct nft_expr), + }, [NFT_SET_EXT_FLAGS] = { .len = sizeof(u8), .align = __alignof__(u8), @@ -2990,6 +2993,10 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, set->dlen) < 0) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) && + nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0) + goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(*nft_set_ext_flags(ext)))) @@ -3276,6 +3283,8 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem) nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_uninit(nft_set_ext_data(ext), set->dtype); + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); kfree(elem); } -- cgit v1.2.3 From 7c6c6e95a12e46f499749bdd496e53d03950f377 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 10:46:41 +0100 Subject: netfilter: nf_tables: add flag to indicate set contains expressions Add a set flag to indicate that the set is used as a state table and contains expressions for evaluation. This operation is mutually exclusive with the mapping operation, so sets specifying both are rejected. The lookup expression also rejects binding to state tables since it only deals with loopup and map operations. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 8 ++++++-- net/netfilter/nft_lookup.c | 3 +++ 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index f9c5af22a6af..48942381d02f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -238,6 +238,7 @@ enum nft_rule_compat_attributes { * @NFT_SET_INTERVAL: set contains intervals * @NFT_SET_MAP: set is used as a dictionary * @NFT_SET_TIMEOUT: set uses timeouts + * @NFT_SET_EVAL: set contains expressions for evaluation */ enum nft_set_flags { NFT_SET_ANONYMOUS = 0x1, @@ -245,6 +246,7 @@ enum nft_set_flags { NFT_SET_INTERVAL = 0x4, NFT_SET_MAP = 0x8, NFT_SET_TIMEOUT = 0x10, + NFT_SET_EVAL = 0x20, }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8830811550ec..78af83bc9c8e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2661,9 +2661,13 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_FLAGS] != NULL) { flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | - NFT_SET_INTERVAL | NFT_SET_MAP | - NFT_SET_TIMEOUT)) + NFT_SET_INTERVAL | NFT_SET_TIMEOUT | + NFT_SET_MAP | NFT_SET_EVAL)) return -EINVAL; + /* Only one of both operations is supported */ + if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) == + (NFT_SET_MAP | NFT_SET_EVAL)) + return -EOPNOTSUPP; } dtype = 0; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index ba1466209f2a..b3c31ef8015d 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -71,6 +71,9 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return PTR_ERR(set); } + if (set->flags & NFT_SET_EVAL) + return -EOPNOTSUPP; + priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); err = nft_validate_register_load(priv->sreg, set->klen); if (err < 0) -- cgit v1.2.3 From 3e135cd499bfbec15684fe9c756162d58df4dc77 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 10:46:42 +0100 Subject: netfilter: nft_dynset: dynamic stateful expression instantiation Support instantiating stateful expressions based on a template that are associated with dynamically created set entries. The expressions are evaluated when adding or updating the set element. This allows to maintain per flow state using the existing set infrastructure and expression types, with arbitrary definitions of a flow. Usage is currently restricted to anonymous sets, meaning only a single binding can exist, since the desired semantics of multiple independant bindings haven't been defined so far. Examples (userspace syntax is still WIP): 1. Limit the rate of new SSH connections per host, similar to iptables hashlimit: flow ip saddr timeout 60s \ limit 10/second \ accept 2. Account network traffic between each set of /24 networks: flow ip saddr & 255.255.255.0 . ip daddr & 255.255.255.0 \ counter 3. Account traffic to each host per user: flow skuid . ip daddr \ counter 4. Account traffic for each combination of source address and TCP flags: flow ip saddr . tcp flags \ counter The resulting set content after a Xmas-scan look like this: { 192.168.122.1 . fin | psh | urg : counter packets 1001 bytes 40040, 192.168.122.1 . ack : counter packets 74 bytes 3848, 192.168.122.1 . psh | ack : counter packets 35 bytes 3144 } Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_dynset.c | 54 +++++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 48942381d02f..5fa1cd04762e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -567,6 +567,7 @@ enum nft_dynset_ops { * @NFTA_DYNSET_SREG_KEY: source register of the key (NLA_U32) * @NFTA_DYNSET_SREG_DATA: source register of the data (NLA_U32) * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64) + * @NFTA_DYNSET_EXPR: expression (NLA_NESTED: nft_expr_attributes) */ enum nft_dynset_attributes { NFTA_DYNSET_UNSPEC, @@ -576,6 +577,7 @@ enum nft_dynset_attributes { NFTA_DYNSET_SREG_KEY, NFTA_DYNSET_SREG_DATA, NFTA_DYNSET_TIMEOUT, + NFTA_DYNSET_EXPR, __NFTA_DYNSET_MAX, }; #define NFTA_DYNSET_MAX (__NFTA_DYNSET_MAX - 1) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 03699d5c0b4b..513a8ef60a59 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -23,6 +23,7 @@ struct nft_dynset { enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; u64 timeout; + struct nft_expr *expr; struct nft_set_binding binding; }; @@ -30,6 +31,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, struct nft_regs *regs) { const struct nft_dynset *priv = nft_expr_priv(expr); + struct nft_set_ext *ext; u64 timeout; void *elem; @@ -44,7 +46,13 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, if (elem == NULL) { if (set->size) atomic_dec(&set->nelems); + return NULL; } + + ext = nft_set_elem_ext(set, elem); + if (priv->expr != NULL) + nft_expr_clone(nft_set_ext_expr(ext), priv->expr); + return elem; } @@ -55,18 +63,27 @@ static void nft_dynset_eval(const struct nft_expr *expr, const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set *set = priv->set; const struct nft_set_ext *ext; + const struct nft_expr *sexpr; u64 timeout; if (set->ops->update(set, ®s->data[priv->sreg_key], nft_dynset_new, expr, regs, &ext)) { + sexpr = NULL; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + sexpr = nft_set_ext_expr(ext); + if (priv->op == NFT_DYNSET_OP_UPDATE && nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? : set->timeout; *nft_set_ext_expiration(ext) = jiffies + timeout; - return; - } - } + } else if (sexpr == NULL) + goto out; + if (sexpr != NULL) + sexpr->ops->eval(sexpr, regs, pkt); + return; + } +out: regs->verdict.code = NFT_BREAK; } @@ -77,6 +94,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, }; static int nft_dynset_init(const struct nft_ctx *ctx, @@ -142,10 +160,29 @@ static int nft_dynset_init(const struct nft_ctx *ctx, } else if (set->flags & NFT_SET_MAP) return -EINVAL; + if (tb[NFTA_DYNSET_EXPR] != NULL) { + if (!(set->flags & NFT_SET_EVAL)) + return -EINVAL; + if (!(set->flags & NFT_SET_ANONYMOUS)) + return -EOPNOTSUPP; + + priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]); + if (IS_ERR(priv->expr)) + return PTR_ERR(priv->expr); + + err = -EOPNOTSUPP; + if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL)) + goto err1; + } else if (set->flags & NFT_SET_EVAL) + return -EINVAL; + nft_set_ext_prepare(&priv->tmpl); nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); if (set->flags & NFT_SET_MAP) nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen); + if (priv->expr != NULL) + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR, + priv->expr->ops->size); if (set->flags & NFT_SET_TIMEOUT) { if (timeout || set->timeout) nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); @@ -155,10 +192,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx, err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) - return err; + goto err1; priv->set = set; return 0; + +err1: + if (priv->expr != NULL) + nft_expr_destroy(ctx, priv->expr); + return err; } static void nft_dynset_destroy(const struct nft_ctx *ctx, @@ -167,6 +209,8 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, struct nft_dynset *priv = nft_expr_priv(expr); nf_tables_unbind_set(ctx, priv->set, &priv->binding); + if (priv->expr != NULL) + nft_expr_destroy(ctx, priv->expr); } static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -184,6 +228,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout))) goto nla_put_failure; + if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) + goto nla_put_failure; return 0; nla_put_failure: -- cgit v1.2.3