diff options
Diffstat (limited to 'net')
35 files changed, 1055 insertions, 501 deletions
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 7a7fd672ccf2..9019f326fe81 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -30,7 +30,8 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && - nbp_switchdev_allowed_egress(p, skb); + nbp_switchdev_allowed_egress(p, skb) && + !br_skb_isolated(p, skb); } int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7f98a7d25866..72074276c088 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -114,6 +114,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb goto drop; BR_INPUT_SKB_CB(skb)->brdev = br->dev; + BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); if (IS_ENABLED(CONFIG_INET) && (skb->protocol == htons(ETH_P_ARP) || diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 015f465c514b..9f5eb05b0373 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -139,6 +139,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -213,7 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, BR_VLAN_TUNNEL)) || nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, - !!(p->flags & BR_NEIGH_SUPPRESS))) + !!(p->flags & BR_NEIGH_SUPPRESS)) || + nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -660,6 +662,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 }, [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, + [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -810,6 +813,10 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) if (err) return err; + err = br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); + if (err) + return err; + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 742f40aefdaf..11520ed528b0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -423,6 +423,7 @@ struct br_input_skb_cb { #endif bool proxyarp_replied; + bool src_port_isolated; #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; @@ -574,6 +575,14 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig); +/* return true if both source port and dest port are isolated */ +static inline bool br_skb_isolated(const struct net_bridge_port *to, + const struct sk_buff *skb) +{ + return BR_INPUT_SKB_CB(skb)->src_port_isolated && + (to->flags & BR_ISOLATED); +} + /* br_if.c */ void br_port_carrier_check(struct net_bridge_port *p, bool *notified); int br_add_bridge(struct net *net, const char *name); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index fd31ad83ec7b..f99c5bf5c906 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -192,6 +192,7 @@ BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); +BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -243,6 +244,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_broadcast_flood, &brport_attr_group_fwd_mask, &brport_attr_neigh_suppress, + &brport_attr_isolated, NULL }; diff --git a/net/core/filter.c b/net/core/filter.c index 51ea7ddb2d8d..acf1f4fb99d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -64,6 +64,10 @@ #include <net/ip_fib.h> #include <net/flow.h> #include <net/arp.h> +#include <net/ipv6.h> +#include <linux/seg6_local.h> +#include <net/seg6.h> +#include <net/seg6_local.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3042,7 +3046,7 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int err; + int sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; @@ -3052,9 +3056,9 @@ static int __bpf_tx_xdp(struct net_device *dev, if (unlikely(!xdpf)) return -EOVERFLOW; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) - return err; + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf); + if (sent <= 0) + return sent; dev->netdev_ops->ndo_xdp_flush(dev); return 0; } @@ -3068,20 +3072,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: { - struct net_device *dev = fwd; - struct xdp_frame *xdpf; + struct bpf_dtab_netdev *dst = fwd; - if (!dev->netdev_ops->ndo_xdp_xmit) - return -EOPNOTSUPP; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - - /* TODO: move to inside map code instead, for bulk support - * err = dev_map_enqueue(dev, xdp); - */ - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; __dev_map_insert_ctx(map, index); @@ -3370,28 +3363,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == bpf_skb_change_tail || - func == bpf_skb_adjust_room || - func == bpf_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_xdp_adjust_tail) - return true; - - return false; -} - static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { @@ -4096,7 +4067,7 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, - u32 flags) + u32 flags, bool check_mtu) { struct in_device *in_dev; struct neighbour *neigh; @@ -4105,6 +4076,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct fib_nh *nh; struct flowi4 fl4; int err; + u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4156,6 +4128,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); + if (check_mtu) { + mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); + if (params->tot_len > mtu) + return 0; + } + nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ @@ -4184,7 +4162,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, - u32 flags) + u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; @@ -4195,6 +4173,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct flowi6 fl6; int strict = 0; int oif; + u32 mtu; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) @@ -4257,6 +4236,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl6.flowi6_oif, NULL, strict); + if (check_mtu) { + mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); + if (params->tot_len > mtu) + return 0; + } + if (f6i->fib6_nh.nh_lwtstate) return 0; @@ -4289,12 +4274,12 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, - flags); + flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, - flags); + flags, true); #endif } return 0; @@ -4313,20 +4298,34 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { + struct net *net = dev_net(skb->dev); + int index = 0; + if (plen < sizeof(*params)) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); + index = bpf_ipv4_fib_lookup(net, params, flags, false); + break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); + index = bpf_ipv6_fib_lookup(net, params, flags, false); + break; #endif } - return -ENOTSUPP; + + if (index > 0) { + struct net_device *dev; + + dev = dev_get_by_index_rcu(net, index); + if (!is_skb_forwardable(dev, skb)) + index = 0; + } + + return index; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { @@ -4339,6 +4338,264 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .arg4_type = ARG_ANYTHING, }; +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) +{ + int err; + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; + + if (!seg6_validate_srh(srh, len)) + return -EINVAL; + + switch (type) { + case BPF_LWT_ENCAP_SEG6_INLINE: + if (skb->protocol != htons(ETH_P_IPV6)) + return -EBADMSG; + + err = seg6_do_srh_inline(skb, srh); + break; + case BPF_LWT_ENCAP_SEG6: + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); + break; + default: + return -EINVAL; + } + + bpf_compute_data_pointers(skb); + if (err) + return err; + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + return seg6_lookup_nexthop(skb, NULL, 0); +} +#endif /* CONFIG_IPV6_SEG6_BPF */ + +BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, + u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + case BPF_LWT_ENCAP_SEG6: + case BPF_LWT_ENCAP_SEG6_INLINE: + return bpf_push_seg6_encap(skb, type, hdr, len); +#endif + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_push_encap_proto = { + .func = bpf_lwt_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_tlvs, *srh_end, *ptr; + struct ipv6_sr_hdr *srh; + int srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); + + ptr = skb->data + offset; + if (ptr >= srh_tlvs && ptr + len <= srh_end) + srh_state->valid = 0; + else if (ptr < (void *)&srh->flags || + ptr + len > (void *)&srh->segments) + return -EFAULT; + + if (unlikely(bpf_try_make_writable(skb, offset + len))) + return -EFAULT; + + memcpy(skb->data + offset, from, len); + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { + .func = bpf_lwt_seg6_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, + u32, action, void *, param, u32, param_len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh; + int srhoff = 0; + int err; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + if (!srh_state->valid) { + if (unlikely((srh_state->hdrlen & 7) != 0)) + return -EBADMSG; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); + if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + return -EBADMSG; + + srh_state->valid = 1; + } + + switch (action) { + case SEG6_LOCAL_ACTION_END_X: + if (param_len != sizeof(struct in6_addr)) + return -EINVAL; + return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); + case SEG6_LOCAL_ACTION_END_T: + if (param_len != sizeof(int)) + return -EINVAL; + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_B6: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + case SEG6_LOCAL_ACTION_END_B6_ENCAP: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + default: + return -EINVAL; + } +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { + .func = bpf_lwt_seg6_action, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, + s32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_end, *srh_tlvs, *ptr; + struct ipv6_sr_hdr *srh; + struct ipv6hdr *hdr; + int srhoff = 0; + int ret; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + + srh_state->hdrlen); + ptr = skb->data + offset; + + if (unlikely(ptr < srh_tlvs || ptr > srh_end)) + return -EFAULT; + if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) + return -EFAULT; + + if (len > 0) { + ret = skb_cow_head(skb, len); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, offset, len); + } else { + ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); + } + + bpf_compute_data_pointers(skb); + if (unlikely(ret < 0)) + return ret; + + hdr = (struct ipv6hdr *)skb->data; + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + srh_state->hdrlen += len; + srh_state->valid = 0; + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { + .func = bpf_lwt_seg6_adjust_srh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) +{ + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_head || + func == bpf_skb_change_tail || + func == bpf_skb_adjust_room || + func == bpf_skb_pull_data || + func == bpf_clone_redirect || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data || + func == bpf_xdp_adjust_tail || + func == bpf_lwt_push_encap || + func == bpf_lwt_seg6_store_bytes || + func == bpf_lwt_seg6_adjust_srh || + func == bpf_lwt_seg6_action + ) + return true; + + return false; +} + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -4523,33 +4780,6 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_skb_load_bytes: - return &bpf_skb_load_bytes_proto; - case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; - case BPF_FUNC_csum_diff: - return &bpf_csum_diff_proto; - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_proto; - case BPF_FUNC_get_route_realm: - return &bpf_get_route_realm_proto; - case BPF_FUNC_get_hash_recalc: - return &bpf_get_hash_recalc_proto; - case BPF_FUNC_perf_event_output: - return &bpf_skb_event_output_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_skb_under_cgroup: - return &bpf_skb_under_cgroup_proto; - default: - return bpf_base_func_proto(func_id); - } -} - -static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -4615,6 +4845,44 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } static const struct bpf_func_proto * +lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_push_encap_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -4645,7 +4913,22 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id, prog); + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_seg6_store_bytes: + return &bpf_lwt_seg6_store_bytes_proto; + case BPF_FUNC_lwt_seg6_action: + return &bpf_lwt_seg6_action_proto; + case BPF_FUNC_lwt_seg6_adjust_srh: + return &bpf_lwt_seg6_adjust_srh_proto; + default: + return lwt_out_func_proto(func_id, prog); } } @@ -4753,7 +5036,6 @@ static bool lwt_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } - /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, @@ -5155,18 +5437,23 @@ static bool sk_msg_is_valid_access(int off, int size, switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; + if (size != sizeof(__u64)) + return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; + if (size != sizeof(__u64)) + return false; break; + default: + if (size != sizeof(__u32)) + return false; } if (off < 0 || off >= sizeof(struct sk_msg_md)) return false; if (off % size != 0) return false; - if (size != sizeof(__u64)) - return false; return true; } @@ -5842,7 +6129,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, local_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -6159,6 +6447,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct sk_msg_md, data): @@ -6171,6 +6460,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, data_end)); break; + case offsetof(struct sk_msg_md, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct sk_msg_md, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct sk_msg_md, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct sk_msg_md, remote_ip6[0]) ... + offsetof(struct sk_msg_md, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, local_ip6[0]) ... + offsetof(struct sk_msg_md, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct sk_msg_md, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; } return insn - insn_buf; @@ -6219,13 +6609,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_verifier_ops = { - .get_func_proto = lwt_inout_func_proto, +const struct bpf_verifier_ops lwt_in_verifier_ops = { + .get_func_proto = lwt_in_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_prog_ops lwt_inout_prog_ops = { +const struct bpf_prog_ops lwt_in_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_out_verifier_ops = { + .get_func_proto = lwt_out_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -6240,6 +6640,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; +const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { + .get_func_proto = lwt_seg6local_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_seg6local_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 380934580fa1..419af6dfe29f 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -35,10 +35,6 @@ #include <trace/events/tcp.h> #include <trace/events/fib.h> #include <trace/events/qdisc.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <trace/events/fib6.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); -#endif #if IS_ENABLED(CONFIG_BRIDGE) #include <trace/events/bridge.h> EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); diff --git a/net/core/xdp.c b/net/core/xdp.c index bf6758f74339..cb8c4e061a5a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,7 +308,13 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -static void xdp_return(void *data, struct xdp_mem_info *mem) +/* XDP RX runs under NAPI protection, and in different delivery error + * scenarios (e.g. queue full), it is possible to return the xdp_frame + * while still leveraging this protection. The @napi_direct boolian + * is used for those calls sites. Thus, allowing for faster recycling + * of xdp_frames/pages in those cases. + */ +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); if (xa) - page_pool_put_page(xa->page_pool, page); + page_pool_put_page(xa->page_pool, page, napi_direct); else put_page(page); rcu_read_unlock(); @@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) void xdp_return_frame(struct xdp_frame *xdpf) { - xdp_return(xdpf->data, &xdpf->mem); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) +{ + __xdp_return(xdpf->data, &xdpf->mem, true); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); + void xdp_return_buff(struct xdp_buff *xdp) { - xdp_return(xdp->data, &xdp->rxq->mem); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 897ae92dff0f..045c43a27c12 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -354,8 +354,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.fl4_dport = 0; } - trace_fib_validate_source(dev, &fl4); - if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3dcffd3ce98c..65c340f230ae 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1326,14 +1326,14 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, unsigned long index; t_key cindex; - trace_fib_table_lookup(tb->tb_id, flp); - pn = t->kv; cindex = 0; n = get_child_rcu(pn, cindex); - if (!n) + if (!n) { + trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; + } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->gets); @@ -1416,8 +1416,11 @@ backtrace: * nothing for us to do as we do not have any * further nodes to parse. */ - if (IS_TRIE(pn)) + if (IS_TRIE(pn)) { + trace_fib_table_lookup(tb->tb_id, flp, + NULL, -EAGAIN); return -EAGAIN; + } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif @@ -1459,6 +1462,7 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif + trace_fib_table_lookup(tb->tb_id, flp, NULL, err); return err; } if (fi->fib_flags & RTNH_F_DEAD) @@ -1494,7 +1498,7 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif - trace_fib_table_lookup_nh(nh); + trace_fib_table_lookup(tb->tb_id, flp, nh, err); return err; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0e401dc4e1bd..45ad2585eb28 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1352,6 +1352,37 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) return NULL; } +/* MTU selection: + * 1. mtu on route is locked - use it + * 2. mtu from nexthop exception + * 3. mtu from egress device + */ + +u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) +{ + struct fib_info *fi = res->fi; + struct fib_nh *nh = &fi->fib_nh[res->nh_sel]; + struct net_device *dev = nh->nh_dev; + u32 mtu = 0; + + if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || + fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) + mtu = fi->fib_mtu; + + if (likely(!mtu)) { + struct fib_nh_exception *fnhe; + + fnhe = find_exception(nh, daddr); + if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) + mtu = fnhe->fnhe_pmtu; + } + + if (likely(!mtu)) + mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); + + return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu); +} + static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr, const bool do_cache) { diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 11e4e80cf7e9..0eff75525da1 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -329,4 +329,9 @@ config IPV6_SEG6_HMAC If unsure, say N. +config IPV6_SEG6_BPF + def_bool y + depends on IPV6_SEG6_LWTUNNEL + depends on IPV6 = y + endif # IPV6 diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 2fe754fd4f5e..5cd0029d930e 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -161,12 +161,20 @@ eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i, return f6i; } +static u32 +eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + return 0; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, .fib6_lookup = eafnosupport_fib6_lookup, .fib6_multipath_select = eafnosupport_fib6_multipath_select, + .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 50de8b0d4f70..9ed0eae91758 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -894,6 +894,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .fib6_table_lookup = fib6_table_lookup, .fib6_lookup = fib6_lookup, .fib6_multipath_select = fib6_multipath_select, + .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 038d661d5ffc..22c4de2317d0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -64,14 +64,19 @@ #include <net/ip_tunnels.h> #include <net/l3mdev.h> #include <net/ip.h> -#include <trace/events/fib6.h> - #include <linux/uaccess.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif +static int ip6_rt_type_to_error(u8 fib6_type); + +#define CREATE_TRACE_POINTS +#include <trace/events/fib6.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); +#undef CREATE_TRACE_POINTS + enum rt6_nud_state { RT6_NUD_FAIL_HARD = -3, RT6_NUD_FAIL_PROBE = -2, @@ -2604,6 +2609,54 @@ out: return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } +/* MTU selection: + * 1. mtu on route is locked - use it + * 2. mtu from nexthop exception + * 3. mtu from egress device + * + * based on ip6_dst_mtu_forward and exception logic of + * rt6_find_cached_rt; called with rcu_read_lock + */ +u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct in6_addr *src_key; + struct inet6_dev *idev; + u32 mtu = 0; + + if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { + mtu = f6i->fib6_pmtu; + if (mtu) + goto out; + } + + src_key = NULL; +#ifdef CONFIG_IPV6_SUBTREES + if (f6i->fib6_src.plen) + src_key = saddr; +#endif + + bucket = rcu_dereference(f6i->rt6i_exception_bucket); + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) + mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); + + if (likely(!mtu)) { + struct net_device *dev = fib6_info_nh_dev(f6i); + + mtu = IPV6_MIN_MTU; + idev = __in6_dev_get(dev); + if (idev && idev->cnf.mtu6 > mtu) + mtu = idev->cnf.mtu6; + } + + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); +out: + return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); +} + struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 45722327375a..cd6e4cab63f6 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1,8 +1,9 @@ /* * SR-IPv6 implementation * - * Author: + * Authors: * David Lebrun <david.lebrun@uclouvain.be> + * eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com> * * * This program is free software; you can redistribute it and/or @@ -30,7 +31,9 @@ #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif +#include <net/seg6_local.h> #include <linux/etherdevice.h> +#include <linux/bpf.h> struct seg6_local_lwt; @@ -41,6 +44,11 @@ struct seg6_action_desc { int static_headroom; }; +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + struct seg6_local_lwt { int action; struct ipv6_sr_hdr *srh; @@ -49,6 +57,7 @@ struct seg6_local_lwt { struct in6_addr nh6; int iif; int oif; + struct bpf_lwt_prog bpf; int headroom; struct seg6_action_desc *desc; @@ -140,8 +149,8 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr) *daddr = *addr; } -static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, - u32 tbl_id) +int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id) { struct net *net = dev_net(skb->dev); struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -187,6 +196,7 @@ out: skb_dst_drop(skb); skb_dst_set(skb, dst); + return dst->error; } /* regular endpoint function */ @@ -200,7 +210,7 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - lookup_nexthop(skb, NULL, 0); + seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); @@ -220,7 +230,7 @@ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - lookup_nexthop(skb, &slwt->nh6, 0); + seg6_lookup_nexthop(skb, &slwt->nh6, 0); return dst_input(skb); @@ -239,7 +249,7 @@ static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt) advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - lookup_nexthop(skb, NULL, slwt->table); + seg6_lookup_nexthop(skb, NULL, slwt->table); return dst_input(skb); @@ -331,7 +341,7 @@ static int input_action_end_dx6(struct sk_buff *skb, if (!ipv6_addr_any(&slwt->nh6)) nhaddr = &slwt->nh6; - lookup_nexthop(skb, nhaddr, 0); + seg6_lookup_nexthop(skb, nhaddr, 0); return dst_input(skb); drop: @@ -380,7 +390,7 @@ static int input_action_end_dt6(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - lookup_nexthop(skb, NULL, slwt->table); + seg6_lookup_nexthop(skb, NULL, slwt->table); return dst_input(skb); @@ -406,7 +416,7 @@ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt) ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - lookup_nexthop(skb, NULL, 0); + seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); @@ -438,7 +448,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb, ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - lookup_nexthop(skb, NULL, 0); + seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); @@ -447,6 +457,71 @@ drop: return err; } +DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); + +static int input_action_end_bpf(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct seg6_bpf_srh_state local_srh_state; + struct ipv6_sr_hdr *srh; + int srhoff = 0; + int ret; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + /* preempt_disable is needed to protect the per-CPU buffer srh_state, + * which is also accessed by the bpf_lwt_seg6_* helpers + */ + preempt_disable(); + srh_state->hdrlen = srh->hdrlen << 3; + srh_state->valid = 1; + + rcu_read_lock(); + bpf_compute_data_pointers(skb); + ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); + rcu_read_unlock(); + + local_srh_state = *srh_state; + preempt_enable(); + + switch (ret) { + case BPF_OK: + case BPF_REDIRECT: + break; + case BPF_DROP: + goto drop; + default: + pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret); + goto drop; + } + + if (unlikely((local_srh_state.hdrlen & 7) != 0)) + goto drop; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + goto drop; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3); + + if (!local_srh_state.valid && + unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + goto drop; + + if (ret != BPF_REDIRECT) + seg6_lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + static struct seg6_action_desc seg6_action_table[] = { { .action = SEG6_LOCAL_ACTION_END, @@ -493,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = { .attrs = (1 << SEG6_LOCAL_SRH), .input = input_action_end_b6_encap, .static_headroom = sizeof(struct ipv6hdr), - } + }, + { + .action = SEG6_LOCAL_ACTION_END_BPF, + .attrs = (1 << SEG6_LOCAL_BPF), + .input = input_action_end_bpf, + }, + }; static struct seg6_action_desc *__get_action_desc(int action) @@ -538,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { .len = sizeof(struct in6_addr) }, [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, + [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, }; static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) @@ -715,6 +797,75 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return 0; } +#define MAX_PROG_NAME 256 +static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = { + [SEG6_LOCAL_BPF_PROG] = { .type = NLA_U32, }, + [SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX, + attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL); + if (ret < 0) + return ret; + + if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME]) + return -EINVAL; + + slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL); + if (!slwt->bpf.name) + return -ENOMEM; + + fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]); + p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL); + if (IS_ERR(p)) { + kfree(slwt->bpf.name); + return PTR_ERR(p); + } + + slwt->bpf.prog = p; + return 0; +} + +static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct nlattr *nest; + + if (!slwt->bpf.prog) + return 0; + + nest = nla_nest_start(skb, SEG6_LOCAL_BPF); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, slwt->bpf.prog->aux->id)) + return -EMSGSIZE; + + if (slwt->bpf.name && + nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, slwt->bpf.name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + if (!a->bpf.name && !b->bpf.name) + return 0; + + if (!a->bpf.name || !b->bpf.name) + return 1; + + return strcmp(a->bpf.name, b->bpf.name); +} + struct seg6_action_param { int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); @@ -745,6 +896,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif, .put = put_nla_oif, .cmp = cmp_nla_oif }, + + [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf, + .put = put_nla_bpf, + .cmp = cmp_nla_bpf }, + }; static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) @@ -830,6 +986,13 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt) struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); kfree(slwt->srh); + + if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) { + kfree(slwt->bpf.name); + bpf_prog_put(slwt->bpf.prog); + } + + return; } static int seg6_local_fill_encap(struct sk_buff *skb, @@ -882,6 +1045,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) if (attrs & (1 << SEG6_LOCAL_OIF)) nlsize += nla_total_size(4); + if (attrs & (1 << SEG6_LOCAL_BPF)) + nlsize += nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + + nla_total_size(4); + return nlsize; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 963e4bf0aab8..a4a5ace834c3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -103,9 +103,10 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) } EXPORT_SYMBOL(unregister_tcf_proto_ops); -bool tcf_queue_work(struct work_struct *work) +bool tcf_queue_work(struct rcu_work *rwork, work_func_t func) { - return queue_work(tc_filter_wq, work); + INIT_RCU_WORK(rwork, func); + return queue_rcu_work(tc_filter_wq, rwork); } EXPORT_SYMBOL(tcf_queue_work); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 6b7ab3512f5b..95367f37098d 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -35,10 +35,7 @@ struct basic_filter { struct tcf_result res; struct tcf_proto *tp; struct list_head link; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -97,21 +94,14 @@ static void __basic_delete_filter(struct basic_filter *f) static void basic_delete_filter_work(struct work_struct *work) { - struct basic_filter *f = container_of(work, struct basic_filter, work); - + struct basic_filter *f = container_of(to_rcu_work(work), + struct basic_filter, + rwork); rtnl_lock(); __basic_delete_filter(f); rtnl_unlock(); } -static void basic_delete_filter(struct rcu_head *head) -{ - struct basic_filter *f = container_of(head, struct basic_filter, rcu); - - INIT_WORK(&f->work, basic_delete_filter_work); - tcf_queue_work(&f->work); -} - static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct basic_head *head = rtnl_dereference(tp->root); @@ -122,7 +112,7 @@ static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) tcf_unbind_filter(tp, &f->res); idr_remove(&head->handle_idr, f->handle); if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, basic_delete_filter); + tcf_queue_work(&f->rwork, basic_delete_filter_work); else __basic_delete_filter(f); } @@ -140,7 +130,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last, tcf_unbind_filter(tp, &f->res); idr_remove(&head->handle_idr, f->handle); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, basic_delete_filter); + tcf_queue_work(&f->rwork, basic_delete_filter_work); *last = list_empty(&head->flist); return 0; } @@ -234,7 +224,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, list_replace_rcu(&fold->link, &fnew->link); tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); - call_rcu(&fold->rcu, basic_delete_filter); + tcf_queue_work(&fold->rwork, basic_delete_filter_work); } else { list_add_rcu(&fnew->link, &head->flist); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index b07c1fa8bc0d..1aa7f6511065 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -49,10 +49,7 @@ struct cls_bpf_prog { struct sock_filter *bpf_ops; const char *bpf_name; struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { @@ -275,21 +272,14 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) static void cls_bpf_delete_prog_work(struct work_struct *work) { - struct cls_bpf_prog *prog = container_of(work, struct cls_bpf_prog, work); - + struct cls_bpf_prog *prog = container_of(to_rcu_work(work), + struct cls_bpf_prog, + rwork); rtnl_lock(); __cls_bpf_delete_prog(prog); rtnl_unlock(); } -static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) -{ - struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu); - - INIT_WORK(&prog->work, cls_bpf_delete_prog_work); - tcf_queue_work(&prog->work); -} - static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct netlink_ext_ack *extack) { @@ -300,7 +290,7 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog, list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); if (tcf_exts_get_net(&prog->exts)) - call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu); + tcf_queue_work(&prog->rwork, cls_bpf_delete_prog_work); else __cls_bpf_delete_prog(prog); } @@ -526,7 +516,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); tcf_exts_get_net(&oldprog->exts); - call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu); + tcf_queue_work(&oldprog->rwork, cls_bpf_delete_prog_work); } else { list_add_rcu(&prog->link, &head->plist); } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 762da5c0cf5e..3bc01bdde165 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -23,10 +23,7 @@ struct cls_cgroup_head { struct tcf_exts exts; struct tcf_ematch_tree ematches; struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -70,24 +67,14 @@ static void __cls_cgroup_destroy(struct cls_cgroup_head *head) static void cls_cgroup_destroy_work(struct work_struct *work) { - struct cls_cgroup_head *head = container_of(work, + struct cls_cgroup_head *head = container_of(to_rcu_work(work), struct cls_cgroup_head, - work); + rwork); rtnl_lock(); __cls_cgroup_destroy(head); rtnl_unlock(); } -static void cls_cgroup_destroy_rcu(struct rcu_head *root) -{ - struct cls_cgroup_head *head = container_of(root, - struct cls_cgroup_head, - rcu); - - INIT_WORK(&head->work, cls_cgroup_destroy_work); - tcf_queue_work(&head->work); -} - static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -134,7 +121,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, rcu_assign_pointer(tp->root, new); if (head) { tcf_exts_get_net(&head->exts); - call_rcu(&head->rcu, cls_cgroup_destroy_rcu); + tcf_queue_work(&head->rwork, cls_cgroup_destroy_work); } return 0; errout: @@ -151,7 +138,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp, /* Head can still be NULL due to cls_cgroup_init(). */ if (head) { if (tcf_exts_get_net(&head->exts)) - call_rcu(&head->rcu, cls_cgroup_destroy_rcu); + tcf_queue_work(&head->rwork, cls_cgroup_destroy_work); else __cls_cgroup_destroy(head); } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index cd5fe383afdd..2bb043cd436b 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -57,10 +57,7 @@ struct flow_filter { u32 divisor; u32 baseclass; u32 hashrnd; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static inline u32 addr_fold(void *addr) @@ -383,21 +380,14 @@ static void __flow_destroy_filter(struct flow_filter *f) static void flow_destroy_filter_work(struct work_struct *work) { - struct flow_filter *f = container_of(work, struct flow_filter, work); - + struct flow_filter *f = container_of(to_rcu_work(work), + struct flow_filter, + rwork); rtnl_lock(); __flow_destroy_filter(f); rtnl_unlock(); } -static void flow_destroy_filter(struct rcu_head *head) -{ - struct flow_filter *f = container_of(head, struct flow_filter, rcu); - - INIT_WORK(&f->work, flow_destroy_filter_work); - tcf_queue_work(&f->work); -} - static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -563,7 +553,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (fold) { tcf_exts_get_net(&fold->exts); - call_rcu(&fold->rcu, flow_destroy_filter); + tcf_queue_work(&fold->rwork, flow_destroy_filter_work); } return 0; @@ -583,7 +573,7 @@ static int flow_delete(struct tcf_proto *tp, void *arg, bool *last, list_del_rcu(&f->list); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, flow_destroy_filter); + tcf_queue_work(&f->rwork, flow_destroy_filter_work); *last = list_empty(&head->filters); return 0; } @@ -608,7 +598,7 @@ static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) list_for_each_entry_safe(f, next, &head->filters, list) { list_del_rcu(&f->list); if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, flow_destroy_filter); + tcf_queue_work(&f->rwork, flow_destroy_filter_work); else __flow_destroy_filter(f); } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index eacaaf803914..4e74508515f4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -73,10 +73,7 @@ struct fl_flow_mask { struct cls_fl_head { struct rhashtable ht; struct list_head masks; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; struct idr handle_idr; }; @@ -90,10 +87,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; struct net_device *hw_dev; }; @@ -235,21 +229,14 @@ static void __fl_destroy_filter(struct cls_fl_filter *f) static void fl_destroy_filter_work(struct work_struct *work) { - struct cls_fl_filter *f = container_of(work, struct cls_fl_filter, work); + struct cls_fl_filter *f = container_of(to_rcu_work(work), + struct cls_fl_filter, rwork); rtnl_lock(); __fl_destroy_filter(f); rtnl_unlock(); } -static void fl_destroy_filter(struct rcu_head *head) -{ - struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); - - INIT_WORK(&f->work, fl_destroy_filter_work); - tcf_queue_work(&f->work); -} - static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, struct netlink_ext_ack *extack) { @@ -327,7 +314,7 @@ static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, fl_hw_destroy_filter(tp, f, extack); tcf_unbind_filter(tp, &f->res); if (async) - call_rcu(&f->rcu, fl_destroy_filter); + tcf_queue_work(&f->rwork, fl_destroy_filter_work); else __fl_destroy_filter(f); @@ -336,20 +323,13 @@ static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, static void fl_destroy_sleepable(struct work_struct *work) { - struct cls_fl_head *head = container_of(work, struct cls_fl_head, - work); + struct cls_fl_head *head = container_of(to_rcu_work(work), + struct cls_fl_head, + rwork); kfree(head); module_put(THIS_MODULE); } -static void fl_destroy_rcu(struct rcu_head *rcu) -{ - struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu); - - INIT_WORK(&head->work, fl_destroy_sleepable); - schedule_work(&head->work); -} - static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); @@ -365,7 +345,7 @@ static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) idr_destroy(&head->handle_idr); __module_get(THIS_MODULE); - call_rcu(&head->rcu, fl_destroy_rcu); + tcf_queue_work(&head->rwork, fl_destroy_sleepable); } static void *fl_get(struct tcf_proto *tp, u32 handle) @@ -1036,7 +1016,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, list_replace_rcu(&fold->list, &fnew->list); tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); - call_rcu(&fold->rcu, fl_destroy_filter); + tcf_queue_work(&fold->rwork, fl_destroy_filter_work); } else { list_add_tail_rcu(&fnew->list, &fnew->mask->filters); } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 8b207723fbc2..29eeeaf3ea44 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -47,10 +47,7 @@ struct fw_filter { #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static u32 fw_hash(u32 handle) @@ -134,21 +131,14 @@ static void __fw_delete_filter(struct fw_filter *f) static void fw_delete_filter_work(struct work_struct *work) { - struct fw_filter *f = container_of(work, struct fw_filter, work); - + struct fw_filter *f = container_of(to_rcu_work(work), + struct fw_filter, + rwork); rtnl_lock(); __fw_delete_filter(f); rtnl_unlock(); } -static void fw_delete_filter(struct rcu_head *head) -{ - struct fw_filter *f = container_of(head, struct fw_filter, rcu); - - INIT_WORK(&f->work, fw_delete_filter_work); - tcf_queue_work(&f->work); -} - static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); @@ -164,7 +154,7 @@ static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, fw_delete_filter); + tcf_queue_work(&f->rwork, fw_delete_filter_work); else __fw_delete_filter(f); } @@ -193,7 +183,7 @@ static int fw_delete(struct tcf_proto *tp, void *arg, bool *last, RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, fw_delete_filter); + tcf_queue_work(&f->rwork, fw_delete_filter_work); ret = 0; break; } @@ -316,7 +306,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, rcu_assign_pointer(*fp, fnew); tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, fw_delete_filter); + tcf_queue_work(&f->rwork, fw_delete_filter_work); *arg = fnew; return err; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 2ba721a590a7..47b207ef7762 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,10 +21,7 @@ struct cls_mall_head { struct tcf_result res; u32 handle; u32 flags; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -53,22 +50,14 @@ static void __mall_destroy(struct cls_mall_head *head) static void mall_destroy_work(struct work_struct *work) { - struct cls_mall_head *head = container_of(work, struct cls_mall_head, - work); + struct cls_mall_head *head = container_of(to_rcu_work(work), + struct cls_mall_head, + rwork); rtnl_lock(); __mall_destroy(head); rtnl_unlock(); } -static void mall_destroy_rcu(struct rcu_head *rcu) -{ - struct cls_mall_head *head = container_of(rcu, struct cls_mall_head, - rcu); - - INIT_WORK(&head->work, mall_destroy_work); - tcf_queue_work(&head->work); -} - static void mall_destroy_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, unsigned long cookie, @@ -126,7 +115,7 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) mall_destroy_hw_filter(tp, head, (unsigned long) head, extack); if (tcf_exts_get_net(&head->exts)) - call_rcu(&head->rcu, mall_destroy_rcu); + tcf_queue_work(&head->rwork, mall_destroy_work); else __mall_destroy(head); } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 21a03a8ee029..0404aa5fa7cb 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -57,10 +57,7 @@ struct route4_filter { u32 handle; struct route4_bucket *bkt; struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) @@ -266,19 +263,17 @@ static void __route4_delete_filter(struct route4_filter *f) static void route4_delete_filter_work(struct work_struct *work) { - struct route4_filter *f = container_of(work, struct route4_filter, work); - + struct route4_filter *f = container_of(to_rcu_work(work), + struct route4_filter, + rwork); rtnl_lock(); __route4_delete_filter(f); rtnl_unlock(); } -static void route4_delete_filter(struct rcu_head *head) +static void route4_queue_work(struct route4_filter *f) { - struct route4_filter *f = container_of(head, struct route4_filter, rcu); - - INIT_WORK(&f->work, route4_delete_filter_work); - tcf_queue_work(&f->work); + tcf_queue_work(&f->rwork, route4_delete_filter_work); } static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) @@ -304,7 +299,7 @@ static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) RCU_INIT_POINTER(b->ht[h2], next); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, route4_delete_filter); + route4_queue_work(f); else __route4_delete_filter(f); } @@ -349,7 +344,7 @@ static int route4_delete(struct tcf_proto *tp, void *arg, bool *last, /* Delete it */ tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, route4_delete_filter); + tcf_queue_work(&f->rwork, route4_delete_filter_work); /* Strip RTNL protected tree */ for (i = 0; i <= 32; i++) { @@ -554,7 +549,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (fold) { tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); - call_rcu(&fold->rcu, route4_delete_filter); + tcf_queue_work(&fold->rwork, route4_delete_filter_work); } return 0; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 4f1297657c27..e9ccf7daea7d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -97,10 +97,7 @@ struct rsvp_filter { u32 handle; struct rsvp_session *sess; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) @@ -294,21 +291,14 @@ static void __rsvp_delete_filter(struct rsvp_filter *f) static void rsvp_delete_filter_work(struct work_struct *work) { - struct rsvp_filter *f = container_of(work, struct rsvp_filter, work); - + struct rsvp_filter *f = container_of(to_rcu_work(work), + struct rsvp_filter, + rwork); rtnl_lock(); __rsvp_delete_filter(f); rtnl_unlock(); } -static void rsvp_delete_filter_rcu(struct rcu_head *head) -{ - struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu); - - INIT_WORK(&f->work, rsvp_delete_filter_work); - tcf_queue_work(&f->work); -} - static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) { tcf_unbind_filter(tp, &f->res); @@ -317,7 +307,7 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) * in cleanup() callback */ if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, rsvp_delete_filter_rcu); + tcf_queue_work(&f->rwork, rsvp_delete_filter_work); else __rsvp_delete_filter(f); } diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index b49cc990a000..32f4bbd82f35 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -28,20 +28,14 @@ struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; struct tcindex_filter { u16 key; struct tcindex_filter_result result; struct tcindex_filter __rcu *next; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; @@ -152,21 +146,14 @@ static void tcindex_destroy_rexts_work(struct work_struct *work) { struct tcindex_filter_result *r; - r = container_of(work, struct tcindex_filter_result, work); + r = container_of(to_rcu_work(work), + struct tcindex_filter_result, + rwork); rtnl_lock(); __tcindex_destroy_rexts(r); rtnl_unlock(); } -static void tcindex_destroy_rexts(struct rcu_head *head) -{ - struct tcindex_filter_result *r; - - r = container_of(head, struct tcindex_filter_result, rcu); - INIT_WORK(&r->work, tcindex_destroy_rexts_work); - tcf_queue_work(&r->work); -} - static void __tcindex_destroy_fexts(struct tcindex_filter *f) { tcf_exts_destroy(&f->result.exts); @@ -176,23 +163,15 @@ static void __tcindex_destroy_fexts(struct tcindex_filter *f) static void tcindex_destroy_fexts_work(struct work_struct *work) { - struct tcindex_filter *f = container_of(work, struct tcindex_filter, - work); + struct tcindex_filter *f = container_of(to_rcu_work(work), + struct tcindex_filter, + rwork); rtnl_lock(); __tcindex_destroy_fexts(f); rtnl_unlock(); } -static void tcindex_destroy_fexts(struct rcu_head *head) -{ - struct tcindex_filter *f = container_of(head, struct tcindex_filter, - rcu); - - INIT_WORK(&f->work, tcindex_destroy_fexts_work); - tcf_queue_work(&f->work); -} - static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, struct netlink_ext_ack *extack) { @@ -228,12 +207,12 @@ found: */ if (f) { if (tcf_exts_get_net(&f->result.exts)) - call_rcu(&f->rcu, tcindex_destroy_fexts); + tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work); else __tcindex_destroy_fexts(f); } else { if (tcf_exts_get_net(&r->exts)) - call_rcu(&r->rcu, tcindex_destroy_rexts); + tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); else __tcindex_destroy_rexts(r); } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index bac47b5d18fd..fb861f90fde6 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -68,10 +68,7 @@ struct tc_u_knode { u32 __percpu *pcpu_success; #endif struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; /* The 'sel' field MUST be the last field in structure to allow for * tc_u32_keys allocated at end of structure. */ @@ -436,21 +433,14 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, */ static void u32_delete_key_work(struct work_struct *work) { - struct tc_u_knode *key = container_of(work, struct tc_u_knode, work); - + struct tc_u_knode *key = container_of(to_rcu_work(work), + struct tc_u_knode, + rwork); rtnl_lock(); u32_destroy_key(key->tp, key, false); rtnl_unlock(); } -static void u32_delete_key_rcu(struct rcu_head *rcu) -{ - struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); - - INIT_WORK(&key->work, u32_delete_key_work); - tcf_queue_work(&key->work); -} - /* u32_delete_key_freepf_rcu is the rcu callback variant * that free's the entire structure including the statistics * percpu variables. Only use this if the key is not a copy @@ -460,21 +450,14 @@ static void u32_delete_key_rcu(struct rcu_head *rcu) */ static void u32_delete_key_freepf_work(struct work_struct *work) { - struct tc_u_knode *key = container_of(work, struct tc_u_knode, work); - + struct tc_u_knode *key = container_of(to_rcu_work(work), + struct tc_u_knode, + rwork); rtnl_lock(); u32_destroy_key(key->tp, key, true); rtnl_unlock(); } -static void u32_delete_key_freepf_rcu(struct rcu_head *rcu) -{ - struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); - - INIT_WORK(&key->work, u32_delete_key_freepf_work); - tcf_queue_work(&key->work); -} - static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { struct tc_u_knode __rcu **kp; @@ -491,7 +474,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) tcf_unbind_filter(tp, &key->res); idr_remove(&ht->handle_idr, key->handle); tcf_exts_get_net(&key->exts); - call_rcu(&key->rcu, u32_delete_key_freepf_rcu); + tcf_queue_work(&key->rwork, u32_delete_key_freepf_work); return 0; } } @@ -611,7 +594,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, u32_remove_hw_knode(tp, n, extack); idr_remove(&ht->handle_idr, n->handle); if (tcf_exts_get_net(&n->exts)) - call_rcu(&n->rcu, u32_delete_key_freepf_rcu); + tcf_queue_work(&n->rwork, u32_delete_key_freepf_work); else u32_destroy_key(n->tp, n, true); } @@ -995,7 +978,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); tcf_exts_get_net(&n->exts); - call_rcu(&n->rcu, u32_delete_key_rcu); + tcf_queue_work(&n->rwork, u32_delete_key_work); return 0; } diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 074fb2b2d51c..04f073146256 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,2 +1 @@ obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o - diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 2b47a1dd7c6c..87998818116f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/init.h> @@ -25,39 +16,25 @@ #define XDP_UMEM_MIN_FRAME_SIZE 2048 -int xdp_umem_create(struct xdp_umem **umem) -{ - *umem = kzalloc(sizeof(**umem), GFP_KERNEL); - - if (!(*umem)) - return -ENOMEM; - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unsigned int i; - if (umem->pgs) { - for (i = 0; i < umem->npgs; i++) { - struct page *page = umem->pgs[i]; - - set_page_dirty_lock(page); - put_page(page); - } + for (i = 0; i < umem->npgs; i++) { + struct page *page = umem->pgs[i]; - kfree(umem->pgs); - umem->pgs = NULL; + set_page_dirty_lock(page); + put_page(page); } + + kfree(umem->pgs); + umem->pgs = NULL; } static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { - if (umem->user) { - atomic_long_sub(umem->npgs, &umem->user->locked_vm); - free_uid(umem->user); - } + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); } static void xdp_umem_release(struct xdp_umem *umem) @@ -75,22 +52,18 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - if (umem->pgs) { - xdp_umem_unpin_pages(umem); - - task = get_pid_task(umem->pid, PIDTYPE_PID); - put_pid(umem->pid); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; + xdp_umem_unpin_pages(umem); - mmput(mm); - umem->pgs = NULL; - } + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + mmput(mm); xdp_umem_unaccount_pages(umem); out: kfree(umem); @@ -105,7 +78,7 @@ static void xdp_umem_release_deferred(struct work_struct *work) void xdp_get_umem(struct xdp_umem *umem) { - atomic_inc(&umem->users); + refcount_inc(&umem->users); } void xdp_put_umem(struct xdp_umem *umem) @@ -113,7 +86,7 @@ void xdp_put_umem(struct xdp_umem *umem) if (!umem) return; - if (atomic_dec_and_test(&umem->users)) { + if (refcount_dec_and_test(&umem->users)) { INIT_WORK(&umem->work, xdp_umem_release_deferred); schedule_work(&umem->work); } @@ -176,16 +149,13 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) return 0; } -int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; u64 addr = mr->addr, size = mr->len; unsigned int nframes, nfpp; int size_chk, err; - if (!umem) - return -EINVAL; - if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* @@ -236,7 +206,7 @@ int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->frame_size_log2 = ilog2(frame_size); umem->nfpp_mask = nfpp - 1; umem->nfpplog2 = ilog2(nfpp); - atomic_set(&umem->users, 1); + refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); if (err) @@ -254,7 +224,25 @@ out: return err; } +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) +{ + struct xdp_umem *umem; + int err; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + err = xdp_umem_reg(umem, mr); + if (err) { + kfree(umem); + return ERR_PTR(err); + } + + return umem; +} + bool xdp_umem_validate_queues(struct xdp_umem *umem) { - return (umem->fq && umem->cq); + return umem->fq && umem->cq; } diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 7e0b2fab8522..0881cf456230 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space packet buffer +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef XDP_UMEM_H_ @@ -36,7 +27,7 @@ struct xdp_umem { struct pid *pid; unsigned long address; size_t size; - atomic_t users; + refcount_t users; struct work_struct work; }; @@ -59,9 +50,8 @@ static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, } bool xdp_umem_validate_queues(struct xdp_umem *umem); -int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); -int xdp_umem_create(struct xdp_umem **umem); +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h index 77fb5daf29f3..2cf8ec485fd2 100644 --- a/net/xdp/xdp_umem_props.h +++ b/net/xdp/xdp_umem_props.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space packet buffer +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef XDP_UMEM_PROPS_H_ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 009c5af5bba5..cce0e4f8a536 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -5,15 +5,6 @@ * applications. * Copyright(c) 2018 Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Author(s): Björn Töpel <bjorn.topel@intel.com> * Magnus Karlsson <magnus.karlsson@intel.com> */ @@ -151,6 +142,11 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } + if (xs->queue_id >= xs->dev->real_num_tx_queues) { + err = -ENXIO; + goto out; + } + skb = sock_alloc_send_skb(sk, len, !need_wait, &err); if (unlikely(!skb)) { err = -EAGAIN; @@ -232,18 +228,12 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, if (!q) return -ENOMEM; + /* Make sure queue is ready before it can be seen by others */ + smp_wmb(); *queue = q; return 0; } -static void __xsk_release(struct xdp_sock *xs) -{ - /* Wait for driver to stop using the xdp socket. */ - synchronize_net(); - - dev_put(xs->dev); -} - static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -260,7 +250,9 @@ static int xsk_release(struct socket *sock) local_bh_enable(); if (xs->dev) { - __xsk_release(xs); + /* Wait for driver to stop using the xdp socket. */ + synchronize_net(); + dev_put(xs->dev); xs->dev = NULL; } @@ -294,9 +286,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; - struct net_device *dev, *dev_curr; struct xdp_sock *xs = xdp_sk(sk); - struct xdp_umem *old_umem = NULL; + struct net_device *dev; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) @@ -305,7 +296,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; mutex_lock(&xs->mutex); - dev_curr = xs->dev; + if (xs->dev) { + err = -EBUSY; + goto out_release; + } + dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); if (!dev) { err = -ENODEV; @@ -317,7 +312,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { + if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) || + (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) { err = -EINVAL; goto out_unlock; } @@ -352,7 +348,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } xdp_get_umem(umem_xs->umem); - old_umem = xs->umem; xs->umem = umem_xs->umem; sockfd_put(sock); } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { @@ -364,14 +359,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xskq_set_umem(xs->umem->cq, &xs->umem->props); } - /* Rebind? */ - if (dev_curr && (dev_curr != dev || - xs->queue_id != sxdp->sxdp_queue_id)) { - __xsk_release(xs); - if (old_umem) - xdp_put_umem(old_umem); - } - xs->dev = dev; xs->queue_id = sxdp->sxdp_queue_id; @@ -419,25 +406,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xdp_umem_reg mr; struct xdp_umem *umem; - if (xs->umem) - return -EBUSY; - if (copy_from_user(&mr, optval, sizeof(mr))) return -EFAULT; mutex_lock(&xs->mutex); - err = xdp_umem_create(&umem); + if (xs->umem) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } - err = xdp_umem_reg(umem, &mr); - if (err) { - kfree(umem); + umem = xdp_umem_create(&mr); + if (IS_ERR(umem)) { mutex_unlock(&xs->mutex); - return err; + return PTR_ERR(umem); } /* Make sure umem is ready before it can be seen by others */ smp_wmb(); - xs->umem = umem; mutex_unlock(&xs->mutex); return 0; @@ -448,13 +433,15 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; - if (!xs->umem) - return -EINVAL; - if (copy_from_user(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); + if (!xs->umem) { + mutex_unlock(&xs->mutex); + return -EINVAL; + } + q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); @@ -504,6 +491,35 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, return 0; } + case XDP_MMAP_OFFSETS: + { + struct xdp_mmap_offsets off; + + if (len < sizeof(off)) + return -EINVAL; + + off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + off.rx.desc = offsetof(struct xdp_rxtx_ring, desc); + off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + off.tx.desc = offsetof(struct xdp_rxtx_ring, desc); + + off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); + off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + off.fr.desc = offsetof(struct xdp_umem_ring, desc); + off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); + off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + off.cr.desc = offsetof(struct xdp_umem_ring, desc); + + len = sizeof(off); + if (copy_to_user(optval, &off, len)) + return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; + + return 0; + } default: break; } @@ -518,21 +534,23 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; + struct xdp_umem *umem; unsigned long pfn; struct page *qpg; if (offset == XDP_PGOFF_RX_RING) { - q = xs->rx; + q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { - q = xs->tx; + q = READ_ONCE(xs->tx); } else { - if (!xs->umem) + umem = READ_ONCE(xs->umem); + if (!umem) return -EINVAL; if (offset == XDP_UMEM_PGOFF_FILL_RING) - q = xs->umem->fq; + q = READ_ONCE(umem->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) - q = xs->umem->cq; + q = READ_ONCE(umem->cq); } if (!q) @@ -554,24 +572,24 @@ static struct proto xsk_proto = { }; static const struct proto_ops xsk_proto_ops = { - .family = PF_XDP, - .owner = THIS_MODULE, - .release = xsk_release, - .bind = xsk_bind, - .connect = sock_no_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = sock_no_getname, - .poll = xsk_poll, - .ioctl = sock_no_ioctl, - .listen = sock_no_listen, - .shutdown = sock_no_shutdown, - .setsockopt = xsk_setsockopt, - .getsockopt = xsk_getsockopt, - .sendmsg = xsk_sendmsg, - .recvmsg = sock_no_recvmsg, - .mmap = xsk_mmap, - .sendpage = sock_no_sendpage, + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = xsk_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = xsk_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = xsk_getsockopt, + .sendmsg = xsk_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = xsk_mmap, + .sendpage = sock_no_sendpage, }; static void xsk_destruct(struct sock *sk) diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index d012e5e23591..ebe85e59507e 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/slab.h> @@ -31,8 +22,7 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q) static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) { - return (sizeof(struct xdp_ring) + - q->nentries * sizeof(struct xdp_desc)); + return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 7aa9a535db0e..cb8e5be35110 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space ring structure +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_XSK_QUEUE_H @@ -22,6 +13,23 @@ #define RX_BATCH_SIZE 16 +struct xdp_ring { + u32 producer ____cacheline_aligned_in_smp; + u32 consumer ____cacheline_aligned_in_smp; +}; + +/* Used for the RX and TX queues for packets */ +struct xdp_rxtx_ring { + struct xdp_ring ptrs; + struct xdp_desc desc[0] ____cacheline_aligned_in_smp; +}; + +/* Used for the fill and completion queues for buffers */ +struct xdp_umem_ring { + struct xdp_ring ptrs; + u32 desc[0] ____cacheline_aligned_in_smp; +}; + struct xsk_queue { struct xdp_umem_props umem_props; u32 ring_mask; @@ -232,12 +240,12 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) static inline bool xskq_full_desc(struct xsk_queue *q) { - return (xskq_nb_avail(q, q->nentries) == q->nentries); + return xskq_nb_avail(q, q->nentries) == q->nentries; } static inline bool xskq_empty_desc(struct xsk_queue *q) { - return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); + return xskq_nb_free(q, q->prod_tail, 1) == q->nentries; } void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); |