From d0522f1cd25edb796548f91e04766fa3cbc3b6df Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 6 Nov 2018 12:51:14 -0800 Subject: net: Add extack argument to rtnl_create_link Add extack arg to rtnl_create_link and add messages for invalid number of Tx or Rx queues. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 33d9227a8b80..f787b7640d49 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2885,9 +2885,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *net, - const char *ifname, unsigned char name_assign_type, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; @@ -2903,11 +2905,15 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); - if (num_tx_queues < 1 || num_tx_queues > 4096) + if (num_tx_queues < 1 || num_tx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); + } - if (num_rx_queues < 1 || num_rx_queues > 4096) + if (num_rx_queues < 1 || num_rx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); + } dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); @@ -3163,7 +3169,7 @@ replay: } dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb); + name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; -- cgit v1.2.1 From 68d57f3b1d1a87b4022a0a7463126d7ea172bda1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 6 Nov 2018 12:51:16 -0800 Subject: rtnetlink: Add more extack messages to rtnl_newlink Add extack arg to the nla_parse_nested calls in rtnl_newlink, and add messages for unknown device type and link network namespace id. In particular, it improves the failure message when the wrong link type is used. From $ ip li add bond1 type bonding RTNETLINK answers: Operation not supported to $ ip li add bond1 type bonding Error: Unknown device type. (The module name is bonding but the link type is bond.) Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f787b7640d49..86f2d9cbdae3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3054,7 +3054,7 @@ replay: if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { err = nla_parse_nested(attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], - ops->policy, NULL); + ops->policy, extack); if (err < 0) return err; data = attr; @@ -3076,7 +3076,7 @@ replay: m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, - NULL); + extack); if (err < 0) return err; slave_data = slave_attr; @@ -3140,6 +3140,7 @@ replay: goto replay; } #endif + NL_SET_ERR_MSG(extack, "Unknown device type"); return -EOPNOTSUPP; } @@ -3160,6 +3161,7 @@ replay: link_net = get_net_ns_by_id(dest_net, id); if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); err = -EINVAL; goto out; } -- cgit v1.2.1 From 6da5b0f027a825df2aebc1927a27bda185dc03d4 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Wed, 7 Nov 2018 15:36:04 +0000 Subject: net: ensure unbound datagram socket to be chosen when not in a VRF Ensure an unbound datagram skt is chosen when not in a VRF. The check for a device match in compute_score() for UDP must be performed when there is no device match. For this, a failure is returned when there is no device match. This ensures that bound sockets are never selected, even if there is no unbound socket. Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These packets are currently blocked, as flowi6_oif was set to that of the master vrf device, and the ipi6_ifindex is that of the slave device. Allow these packets to be sent by checking the device with ipi6_ifindex has the same L3 scope as that of the bound device of the skt, which is the master vrf device. Note that this check always succeeds if the skt is unbound. Even though the right datagram skt is now selected by compute_score(), a different skt is being returned that is bound to the wrong vrf. The difference between these and stream sockets is the handling of the skt option for SO_REUSEPORT. While the handling when adding a skt for reuse correctly checks that the bound device of the skt is a match, the skts in the hashslot are already incorrect. So for the same hash, a skt for the wrong vrf may be selected for the required port. The root cause is that the skt is immediately placed into a slot when it is created, but when the skt is then bound using SO_BINDTODEVICE, it remains in the same slot. The solution is to move the skt to the correct slot by forcing a rehash. Signed-off-by: Mike Manning Reviewed-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 080a880a1761..7b304e454a38 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); -- cgit v1.2.1 From 9b319148cb34ecccacff09eca87765c87d5e19ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 7 Nov 2018 18:07:03 +0100 Subject: net/vlan: include the shift in skb_vlan_tag_get_prio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 676f3ad629f9..56d1e9b73142 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -952,8 +952,7 @@ proto_again: if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); - key_vlan->vlan_priority = - (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; -- cgit v1.2.1 From 50254256f382c56bde87d970f3d0d02fdb76ec70 Mon Sep 17 00:00:00 2001 From: David Barmann Date: Thu, 8 Nov 2018 08:13:35 -0600 Subject: sock: Reset dst when changing sk_mark via setsockopt When setting the SO_MARK socket option, if the mark changes, the dst needs to be reset so that a new route lookup is performed. This fixes the case where an application wants to change routing by setting a new sk_mark. If this is done after some packets have already been sent, the dst is cached and the new mark has no effect. Signed-off-by: David Barmann Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 7b304e454a38..6d7e189e3cd9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -952,10 +952,12 @@ set_rcvbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; - else + } else if (val != sk->sk_mark) { sk->sk_mark = val; + sk_dst_reset(sk); + } break; case SO_RXQ_OVFL: -- cgit v1.2.1 From e7946760de5852f32c4e52ce47f37e85346981b9 Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Thu, 8 Nov 2018 22:27:54 +0200 Subject: net: core: dev_addr_lists: add auxiliary func to handle reference address updates To avoid updating the whole table, and to only remove or add a new address, an auxiliary function exists, named __hw_addr_sync_dev(). It allows the end driver to do nothing when nothing has changed and to add/remove an address only when a concrete address is first added or last removed. But it doesn't cover cases where an address of the real device or a vlan is reused by other vlans or vlan/macvlan devices. For handling events where an address is reused/unreused, the patch adds a new auxiliary routine - __hw_addr_ref_sync_dev(). It allows doing nothing when nothing has changed and doing updates only for an address being added/reused/deleted/unreused. Thus, clone address changes for vlans can be mirrored in the table. The function is mutually exclusive with __hw_addr_sync_dev().
It's responsibility of the end driver to identify address vlan device, if it needs so. Signed-off-by: Ivan Khoronzhuk Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index d884d8f5f0e5..81a8cd4ea3bd 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -277,6 +277,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, } EXPORT_SYMBOL(__hw_addr_sync_dev); +/** + * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking + * into account references + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address or reference on it should be added + * @unsync: function to call if address or some reference on it should removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address or references on it + * add/remove notifications. The unsync function may be NULL in which case + * the addresses or references on it requiring removal will simply be + * removed without any notification to the device. That is responsibility of + * the driver to identify and distribute address or references on it between + * internal address tables. + **/ +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + int err, ref_cnt; + + /* first go through and flush out any unsynced/stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address is not used */ + if ((ha->sync_cnt << 1) <= ha->refcount) + continue; + + /* if fails defer unsyncing address */ + ref_cnt = ha->refcount - ha->sync_cnt; + if (unsync && unsync(dev, ha->addr, ref_cnt)) + continue; + + ha->refcount = (ref_cnt << 1) + 1; + ha->sync_cnt = ref_cnt; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync updated/new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address added or reused */ + if ((ha->sync_cnt << 1) >= ha->refcount) + continue; + + ref_cnt = ha->refcount - ha->sync_cnt; + err = sync(dev, ha->addr, ref_cnt); + if (err) + return err; + + ha->refcount = ref_cnt << 1; + ha->sync_cnt = ref_cnt; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_ref_sync_dev); + +/** + * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on + * it from device + * @list: address list to remove synchronized addresses (references on it) from + * @dev: device to sync + * @unsync: function to call if address and references on it should be removed + * + * Remove all addresses that were added to the device by + * __hw_addr_ref_sync_dev(). This function is intended to be called from the + * ndo_stop or ndo_open functions on devices that require explicit address (or + * references on it) add/remove notifications. If the unsync function pointer + * is NULL then this function can be used to just reset the sync_cnt for the + * addresses in the list. 
+ **/ +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) + continue; + + ha->refcount -= ha->sync_cnt - 1; + ha->sync_cnt = 0; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); + /** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from -- cgit v1.2.1 From 49f8e8329c3c05e78a75112ce006c41d24eaad0d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 8 Nov 2018 14:05:42 -0800 Subject: net: move __skb_checksum_complete*() to skbuff.c __skb_checksum_complete_head() and __skb_checksum_complete() are both declared in skbuff.h, they fit better in skbuff.c than datagram.c. Cc: Stefano Brivio Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/datagram.c | 43 ------------------------------------------- net/core/skbuff.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 43 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 57f3a6fcfc1e..07983b90d2bd 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -728,49 +728,6 @@ fault: return -EFAULT; } -__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) -{ - __sum16 sum; - - sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - if (!skb_shared(skb)) - skb->csum_valid = !sum; - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete_head); - -__sum16 __skb_checksum_complete(struct sk_buff *skb) -{ - __wsum csum; - __sum16 sum; - - csum = skb_checksum(skb, 0, skb->len, 0); - - /* skb->csum holds pseudo checksum */ - sum = csum_fold(csum_add(skb->csum, csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - - if (!skb_shared(skb)) { - /* Save full packet checksum */ - skb->csum = csum; - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - skb->csum_valid = !sum; - } - - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete); - /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. 
* @skb: skbuff diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b4ee5c8b928f..5cb4b3440153 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2645,6 +2645,49 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + if (!skb_shared(skb)) + skb->csum_valid = !sum; + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + /* skb->csum holds pseudo checksum */ + sum = csum_fold(csum_add(skb->csum, csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } + + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) { net_warn_ratelimited( -- cgit v1.2.1 From b1817524c028a5a5284f21358185c74790001e0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 9 Nov 2018 00:18:02 +0100 Subject: net/core: use __vlan_hwaccel helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes assumptions about VLAN_TAG_PRESENT bit. Signed-off-by: Michał Mirosław Signed-off-by: David S. 
Miller --- net/core/dev.c | 8 +++++--- net/core/skbuff.c | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0ffcbdd55fa9..bf7e0a471186 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4889,7 +4889,7 @@ skip_classify: * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; @@ -5386,7 +5386,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5652,7 +5654,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; skb->encapsulation = 0; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5cb4b3440153..396fcb3baad0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5166,7 +5166,7 @@ int skb_vlan_pop(struct sk_buff *skb) int err; if (likely(skb_vlan_tag_present(skb))) { - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; -- cgit v1.2.1 From a5a3a828cd00788a78da686c57c6d1f66191d8af Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 7 Nov 2018 16:12:01 -0800 Subject: bpf: add perf event notificaton support for sock_ops This patch allows eBPF programs that use sock_ops to send perf based event notifications using bpf_perf_event_output(). Our main use case for this is the following: We would like to monitor some subset of TCP sockets in user-space, (the monitoring application would define 4-tuples it wants to monitor) using TCP_INFO stats to analyze reported problems. The idea is to use those stats to see where the bottlenecks are likely to be ("is it application-limited?" or "is there evidence of BufferBloat in the path?" etc). Today we can do this by periodically polling for tcp_info, but this could be made more efficient if the kernel would asynchronously notify the application via tcp_info when some "interesting" thresholds (e.g., "RTT variance > X", or "total_retrans > Y" etc) are reached. And to make this effective, it is better if we could apply the threshold check *before* constructing the tcp_info netlink notification, so that we don't waste resources constructing notifications that will be discarded by the filter. This work solves the problem by adding perf event based notification support for sock_ops. The eBPF program can thus be designed to apply any desired filters to the bpf_sock_ops and trigger a perf event notification based on the evaluation from the filter. The user space component can use these perf event notifications to either read any state managed by the eBPF program, or issue a TCP_INFO netlink call if desired. 
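As a rough illustration (not part of this patch -- the map layout, threshold, and callback choice below are assumptions), a sock_ops program using this support could look like:

/* Minimal sketch: emit a perf event when a connection crosses a
 * retransmit threshold. Assumes the loader created the "events"
 * BPF_MAP_TYPE_PERF_EVENT_ARRAY map; built against the selftests'
 * bpf_helpers.h of this era.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") events = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(__u32),
	.max_entries = 64, /* >= number of possible CPUs */
};

struct tcp_notify {
	__u32 total_retrans;
	__u32 srtt_us;
};

SEC("sockops")
int tcp_monitor(struct bpf_sock_ops *skops)
{
	struct tcp_notify msg;

	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* ask for a callback on every retransmitted skb */
		bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_RETRANS_CB_FLAG);
		break;
	case BPF_SOCK_OPS_RETRANS_CB:
		/* the filter: stay silent below the threshold */
		if (skops->total_retrans < 16)
			break;
		msg.total_retrans = skops->total_retrans;
		msg.srtt_us = skops->srtt_us;
		/* the helper wired up by this patch (gpl_only) */
		bpf_perf_event_output(skops, &events, BPF_F_CURRENT_CPU,
				      &msg, sizeof(msg));
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";

User space would then read the notifications from the perf ring buffer and, if desired, follow up with a TCP_INFO query on the flagged connection.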
Signed-off-by: Sowmini Varadhan Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- net/core/filter.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e521c5ebc7d1..ba97a6bee6f9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3908,6 +3908,26 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +static const struct bpf_func_proto bpf_sockopt_event_output_proto = { + .func = bpf_sockopt_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5240,6 +5260,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_sockopt_event_output_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.1 From c8123ead13a5c92dc5fd15c0fdfe88eef41e6ac1 Mon Sep 17 00:00:00 2001 From: Nitin Hande Date: Sun, 28 Oct 2018 21:02:45 -0700 Subject: bpf: Extend the sk_lookup() helper to XDP hookpoint. This patch proposes to extend the sk_lookup() BPF API to the XDP hookpoint. The sk_lookup() helper supports a lookup on an incoming packet to find the corresponding socket that will receive this packet. Current support for this BPF API is at the tc hookpoint; this patch extends the API to the XDP hookpoint. An XDP program can map the incoming packet to the 5-tuple parameter and invoke the API to find the corresponding socket structure.
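For illustration only (not from this patch -- header parsing is simplified and assumes no IP options; drop-on-miss is just an example policy), an XDP program using the extended helper might look like:

/* Sketch: drop IPv4 TCP packets that match no local socket. */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"

SEC("xdp")
int xdp_sk_filter(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;
	struct iphdr *iph = data + sizeof(*eth);
	struct tcphdr *th = data + sizeof(*eth) + sizeof(*iph);
	struct bpf_sock_tuple tup = {};
	struct bpf_sock *sk;

	/* one bounds check covering all three headers */
	if ((void *)(th + 1) > data_end)
		return XDP_PASS;
	if (eth->h_proto != bpf_htons(ETH_P_IP) ||
	    iph->protocol != IPPROTO_TCP)
		return XDP_PASS;

	tup.ipv4.saddr = iph->saddr;
	tup.ipv4.daddr = iph->daddr;
	tup.ipv4.sport = th->source;
	tup.ipv4.dport = th->dest;

	/* netns_id 0 selects the netns of the receiving device */
	sk = bpf_sk_lookup_tcp(ctx, &tup, sizeof(tup.ipv4), 0, 0);
	if (!sk)
		return XDP_DROP;
	bpf_sk_release(sk); /* acquired references must be released */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";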
Signed-off-by: Nitin Hande Signed-off-by: Daniel Borkmann --- net/core/filter.c | 105 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 86 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index ba97a6bee6f9..53d50fb75ea1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4845,38 +4845,32 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - struct sk_buff *skb, u8 family, u8 proto) + int dif, int sdif, u8 family, u8 proto) { bool refcounted = false; struct sock *sk = NULL; - int dif = 0; - - if (skb->dev) - dif = skb->dev->ifindex; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; - int sdif = inet_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, skb); + dif, sdif, &udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; u16 hnum = ntohs(tuple->ipv6.dport); - int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, hnum, dif, sdif, &refcounted); @@ -4885,7 +4879,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, src6, tuple->ipv6.sport, dst6, hnum, dif, sdif, - &udp_table, skb); + &udp_table, NULL); #endif } @@ -4902,31 +4896,33 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, * callers to satisfy BPF_CALL declarations. */ static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { - struct net *caller_net; struct sock *sk = NULL; u8 family = AF_UNSPEC; struct net *net; + int sdif; family = len == sizeof(tuple->ipv4) ? 
AF_INET : AF_INET6; if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) goto out; - if (skb->dev) - caller_net = dev_net(skb->dev); + if (family == AF_INET) + sdif = inet_sdif(skb); else - caller_net = sock_net(skb->sk); + sdif = inet6_sdif(skb); + if (netns_id) { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } else { net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } if (sk) @@ -4935,6 +4931,25 @@ out: return (unsigned long) sk; } +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, + proto, netns_id, flags); +} + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { @@ -4984,6 +4999,50 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_SOCKET, }; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5234,6 +5293,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } -- cgit v1.2.1 From 76c6d988aeb3c15d57ea0c245a3b5f27802c1fbe Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 12 Nov 2018 18:27:16 +0800 Subject: sctp: add sock_reuseport for the sock in __sctp_hash_endpoint This is a part of sk_reuseport support for sctp. 
It defines a helper sctp_bind_addrs_check() to check if the bind_addrs in two socks are matched. It will add sock_reuseport if they are completely matched, return an error if they are partly matched, and allocate a new sock_reuseport if none of the socks match at all. It will not take effect until sk_reuseport support is added in sctp_get_port_local() in the next patch. v1->v2: - use the 'laddr->valid && laddr2->valid' check instead in sctp_bind_addrs_check(), as Marcelo pointed out. Acked-by: Neil Horman Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/core/sock_reuseport.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index ba5cba56f574..d8fe3e549373 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } +EXPORT_SYMBOL(reuseport_add_sock); void reuseport_detach_sock(struct sock *sk) { -- cgit v1.2.1 From 7fe50ac83f4319c18ed7c634d85cad16bd0bf509 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 12 Nov 2018 14:47:18 -0800 Subject: net: dump more useful information in netdev_rx_csum_fault() Currently netdev_rx_csum_fault() only shows a device name; we need more information about the skb for debugging csum failures. Sample output: ens3: hw csum failure dev features: 0x0000000000014b89 skb len=84 data_len=0 pkt_type=0 gso_size=0 gso_type=0 nr_frags=0 ip_summed=0 csum=0 csum_complete_sw=0 csum_valid=0 csum_level=0 Note, I use pr_err() just to be consistent with the existing one. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- net/core/dev.c | 11 +++++++++-- net/core/skbuff.c | 4 ++-- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 07983b90d2bd..4bf62b1afa3b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -767,7 +767,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(NULL); + netdev_rx_csum_fault(NULL, skb); } return 0; fault: diff --git a/net/core/dev.c b/net/core/dev.c index bf7e0a471186..5927f6a7c301 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3091,10 +3091,17 @@ EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ?
dev->name : ""); + if (dev) + pr_err("dev features: %pNF\n", &dev->features); + pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n", + skb->len, skb->data_len, skb->pkt_type, + skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, + skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level); dump_stack(); } } @@ -5781,7 +5788,7 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 396fcb3baad0..fcb1155a00ec 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2653,7 +2653,7 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) skb->csum_valid = !sum; @@ -2673,7 +2673,7 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) { -- cgit v1.2.1 From 982c17b9e3c27389a8d214333c686dab0e95cf63 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 13 Nov 2018 09:16:52 +0800 Subject: net: remove BUG_ON from __pskb_pull_tail If list is a NULL pointer, the subsequent access of list will trigger a panic, which has the same effect as the BUG_ON. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fcb1155a00ec..f95ab41c9fb9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1925,8 +1925,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) struct sk_buff *insp = NULL; do { - BUG_ON(!list); - if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; -- cgit v1.2.1 From cac6cc2f5ac710334ae0f6bba5630d791c253574 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 9 Nov 2018 10:54:00 -0800 Subject: bpf: Fix IPv6 dport byte order in bpf_sk_lookup_udp Lookup functions in sk_lookup have different expectations about the byte order of provided arguments. Specifically __inet_lookup, __udp4_lib_lookup and __udp6_lib_lookup expect dport to be in network byte order and do ntohs(dport) internally. At the same time __inet6_lookup expects dport to be in host byte order and correspondingly names the argument hnum. sk_lookup works correctly with __inet_lookup, __udp4_lib_lookup and __inet6_lookup with regard to dport. But in the __udp6_lib_lookup case it uses host instead of the expected network byte order. This makes the result returned by bpf_sk_lookup_udp for IPv6 incorrect. The patch fixes the byte order of dport passed to __udp6_lib_lookup. Originally sk_lookup properly handled UDPv6, but not TCPv6. 5ef0ae84f02a fixed TCPv6 but broke UDPv6.
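The distinction, as a worked example (illustrative values, not from the patch), for port 8080:

/* tuple->ipv6.dport arrives in network byte order */
__be16 dport = tuple->ipv6.dport; /* bytes 0x1f 0x90; reads as 0x901f on x86 */
__u16 hnum = ntohs(dport);        /* 0x1f90 == 8080, host byte order */

/* __udp6_lib_lookup() does ntohs() on its dport argument internally, so
 * it must be given network byte order; __inet6_lookup() expects host
 * byte order (hence the name hnum). Passing hnum to __udp6_lib_lookup()
 * converts twice and looks up the wrong port. */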
Fixes: 5ef0ae84f02a ("bpf: Fix IPv6 dport byte-order in bpf_sk_lookup") Signed-off-by: Andrey Ignatov Acked-by: Joe Stringer Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 53d50fb75ea1..f4ae933edf61 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4867,17 +4867,16 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - u16 hnum = ntohs(tuple->ipv6.dport); if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, tuple->ipv6.dport, dif, sdif, &udp_table, NULL); #endif -- cgit v1.2.1 From 6c49e65e0d462963b4fac97ebd87014342167027 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 9 Nov 2018 10:54:01 -0800 Subject: bpf: Support socket lookup in CGROUP_SOCK_ADDR progs Make bpf_sk_lookup_tcp, bpf_sk_lookup_udp and bpf_sk_release helpers available in programs of type BPF_PROG_TYPE_CGROUP_SOCK_ADDR. Such programs operate on sockets and have access to socket and struct sockaddr passed by user to system calls such as sys_bind, sys_connect, sys_sendmsg. It's useful to be able to lookup other sockets from these programs. E.g. sys_connect may lookup IP:port endpoint and if there is a server socket bound to that endpoint ("server" can be defined by saddr & sport being zero), redirect client connection to it by rewriting IP:port in sockaddr passed to sys_connect. 
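A sketch of the sys_connect use case described above (hypothetical program -- the proxy address, port, and policy are invented for illustration):

/* cgroup/connect4: divert TCP connects to a local proxy at
 * 127.0.0.1:4040, but only if something is actually listening there.
 */
#include <linux/bpf.h>
#include <linux/in.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"

SEC("cgroup/connect4")
int connect4_divert(struct bpf_sock_addr *ctx)
{
	struct bpf_sock_tuple tup = {};
	struct bpf_sock *sk;

	if (ctx->protocol != IPPROTO_TCP)
		return 1; /* permit the connect unchanged */

	/* saddr/sport stay zero: a "server" per the description above */
	tup.ipv4.daddr = bpf_htonl(0x7f000001); /* 127.0.0.1 */
	tup.ipv4.dport = bpf_htons(4040);

	sk = bpf_sk_lookup_tcp(ctx, &tup, sizeof(tup.ipv4), 0, 0);
	if (!sk)
		return 1; /* no proxy listening: leave the address alone */
	bpf_sk_release(sk);

	/* rewrite the destination seen by sys_connect */
	ctx->user_ip4 = bpf_htonl(0x7f000001);
	ctx->user_port = bpf_htons(4040);
	return 1;
}

char _license[] SEC("license") = "GPL";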
Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f4ae933edf61..f6ca38a7d433 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5042,6 +5042,43 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .func = bpf_sock_addr_sk_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .func = bpf_sock_addr_sk_lookup_udp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5148,6 +5185,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sock_addr_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sock_addr_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } -- cgit v1.2.1 From 9c2122559709fe35f493634bc61065b8c0ecb2ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Sat, 10 Nov 2018 19:58:35 +0100 Subject: net/bpf: split VLAN_PRESENT bit handling from VLAN_TCI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław Signed-off-by: David S. 
Miller --- net/core/filter.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e521c5ebc7d1..c151b906df53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -296,22 +296,21 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_VLAN_TAG: - case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); - if (skb_field == SKF_AD_VLAN_TAG) { - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, - ~VLAN_TAG_PRESENT); - } else { - /* dst_reg >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); - /* dst_reg &= 1 */ +#ifdef VLAN_TAG_PRESENT + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, ~VLAN_TAG_PRESENT); +#endif + break; + case SKF_AD_VLAN_TAG_PRESENT: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); - } break; } @@ -6140,19 +6139,22 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, vlan_present): - case offsetof(struct __sk_buff, vlan_tci): - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + break; + case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); - if (si->off == offsetof(struct __sk_buff, vlan_tci)) { - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, - ~VLAN_TAG_PRESENT); - } else { - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); - } +#ifdef VLAN_TAG_PRESENT + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, ~VLAN_TAG_PRESENT); +#endif break; case offsetof(struct __sk_buff, cb[0]) ... -- cgit v1.2.1 From 0c4b2d370514cb4f3454dd3b18f031d2651fab73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Sat, 10 Nov 2018 19:58:36 +0100 Subject: net: remove VLAN_TAG_PRESENT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace VLAN_TAG_PRESENT with single bit flag and free up VLAN.CFI overload. Now VLAN.CFI is visible in networking stack and can be passed around intact. Signed-off-by: Michał Mirosław Signed-off-by: David S. 
Miller --- net/core/filter.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index c151b906df53..10acbc00ff6c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -301,9 +301,6 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); -#ifdef VLAN_TAG_PRESENT - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, ~VLAN_TAG_PRESENT); -#endif break; case SKF_AD_VLAN_TAG_PRESENT: *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); @@ -6152,9 +6149,6 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); -#ifdef VLAN_TAG_PRESENT - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, ~VLAN_TAG_PRESENT); -#endif break; case offsetof(struct __sk_buff, cb[0]) ... -- cgit v1.2.1 From 7f600f14dfac4ba4aee6283a415cdad2925d7791 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 12 Nov 2018 18:05:24 -0800 Subject: net: remove unused skb_send_sock() Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/skbuff.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f95ab41c9fb9..a1be7f19d998 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2364,19 +2364,6 @@ error: } EXPORT_SYMBOL_GPL(skb_send_sock_locked); -/* Send skb data on a socket. */ -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) -{ - int ret = 0; - - lock_sock(sk); - ret = skb_send_sock_locked(sk, skb, offset, len); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL_GPL(skb_send_sock); - /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer -- cgit v1.2.1 From 6f9a50691055618b1042ead4d84f80755d1b9315 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 19 Nov 2018 16:11:07 +0000 Subject: net: skb_scrub_packet(): Scrub offload_fwd_mark When a packet is trapped and the corresponding SKB marked as already-forwarded, it retains this marking even after it is forwarded across veth links into another bridge. There, since it ingresses the bridge over veth, which doesn't have offload_fwd_mark, it triggers a warning in nbp_switchdev_frame_mark(). Then nbp_switchdev_allowed_egress() decides not to allow egress from this bridge through another veth, because the SKB is already marked, and the mark (of 0) of course matches. Thus the packet is incorrectly blocked. Solve by resetting offload_fwd_mark() in skb_scrub_packet(). That function is called from tunnels and also from veth, and thus catches the cases where traffic is forwarded between bridges and transformed in a way that invalidates the marking. Signed-off-by: Petr Machata Suggested-by: Ido Schimmel Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a1be7f19d998..9a8a72cefe9b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4882,6 +4882,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) nf_reset(skb); nf_reset_trace(skb); +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; + skb->offload_mr_fwd_mark = 0; +#endif + if (!xnet) return; -- cgit v1.2.1 From f11216b24219ab26d8d159fbfa12dff886b16e32 Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Thu, 22 Nov 2018 14:39:16 -0500 Subject: bpf: add skb->tstamp r/w access from tc clsact and cg skb progs This could be used to rate limit egress traffic in concert with a qdisc which supports Earliest Departure Time, such as FQ. Write access from cg skb progs only with CAP_SYS_ADMIN, since the value will be used by downstream qdiscs. It might make sense to relax this. Changes v1 -> v2: - allow access from cg skb, write only with CAP_SYS_ADMIN Signed-off-by: Vlad Dumitrescu Acked-by: Eric Dumazet Acked-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f6ca38a7d433..65dc13aeca7c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5573,6 +5573,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(struct bpf_flow_keys *)) return false; break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) + return false; + break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -5600,6 +5604,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5638,6 +5643,10 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; default: return false; } @@ -5665,6 +5674,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5874,6 +5884,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): break; default: return false; @@ -6093,6 +6104,7 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -6179,6 +6191,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, 
tstamp): return false; } @@ -6488,6 +6501,22 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); } return insn - insn_buf; -- cgit v1.2.1 From 42519ede4fde2a50919215933e0f02c8342aba0f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Nov 2018 11:39:28 -0800 Subject: net-gro: use ffs() to speedup napi_gro_flush() We very often have few flows/chains to look at, and we might increase GRO_HASH_BUCKETS to 32 or 64 in the future. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f2bfd2eda7b2..d83582623cd7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5364,11 +5364,13 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - u32 i; + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - if (test_bit(i, &napi->gro_bitmask)) - __napi_gro_flush_chain(napi, i, flush_old); + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); -- cgit v1.2.1 From 4bffc669d6248d655aeb985a0e51bfaaf21c8b40 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 25 Nov 2018 08:26:23 -0800 Subject: net: remove unsafe skb_insert() I do not see how one can effectively use skb_insert() without holding some kind of lock. Otherwise other cpus could have changed the list right before we have a chance of acquiring list->lock. Only existing user is in drivers/infiniband/hw/nes/nes_mgt.c and this one probably meant to use __skb_insert() since it appears nesqp->pau_list is protected by nesqp->pau_lock. This looks like nesqp->pau_lock could be removed, since nesqp->pau_list.lock could be used instead. Signed-off-by: Eric Dumazet Cc: Faisal Latif Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-rdma Signed-off-by: David S. Miller --- net/core/skbuff.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9a8a72cefe9b..02cd7ae3d0fb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2990,28 +2990,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head } EXPORT_SYMBOL(skb_append); -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet before a given packet in a list. The list locks are - * taken and this function is atomic with respect to other list locked - * calls. - * - * A buffer cannot be placed on two lists at the same time. 
- */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_insert(newsk, old->prev, old, list); - spin_unlock_irqrestore(&list->lock, flags); -} -EXPORT_SYMBOL(skb_insert); - static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) -- cgit v1.2.1 From d8f3e978bd30014081c599456285ca1d58c2aae1 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 26 Nov 2018 13:42:41 -0800 Subject: bpf: Avoid unnecessary instruction in convert_bpf_ld_abs() 'offset' is constant and if it is zero, no need to subtract it from BPF_REG_TMP. Signed-off-by: David S. Miller Signed-off-by: Daniel Borkmann --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index aa274679965d..f50ea971f7a9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -463,7 +463,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); - *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { -- cgit v1.2.1 From a428afe82f98d2ffb31c981671630df1fa25906f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sat, 24 Nov 2018 04:34:20 +0200 Subject: net: bridge: add support for user-controlled bool options We have been adding many new bridge options, a big number of which are boolean but still take up netlink attribute ids and waste space in the skb. Recently we discussed learning from link-local packets[1] and decided yet another new boolean option will be needed, thus introducing this API to save some bridge nl space. The API supports changing the value of multiple boolean options at once via the br_boolopt_multi struct which has an optmask (which options to set, bit per opt) and optval (options' new values). Future boolean options will only be added to the br_boolopt_id enum and then will have to be handled in br_boolopt_toggle/get. The API will automatically add the ability to change and export them via netlink, sysfs can use the single boolopt function versions to do the same. The behaviour with failing/succeeding is the same as with normal netlink option changing. If an option requires mapping to internal kernel flag or needs special configuration to be enabled then it should be handled in br_boolopt_toggle. It should also be able to retrieve an option's current state via br_boolopt_get. v2: WARN_ON() on unsupported option as that shouldn't be possible and also will help catch people who add new options without handling them for both set and get. Pass down extack so if an option desires it could set it on error and be more user-friendly. [1] https://www.spinics.net/lists/netdev/msg532698.html Signed-off-by: Nikolay Aleksandrov Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 86f2d9cbdae3..a498bb41c9aa 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include #include -#define RTNL_MAX_TYPE 49 +#define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { -- cgit v1.2.1 From 74be39ebba36e98a973ddb914fb41dc9e5129e36 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:02 +0100 Subject: netns: remove net arg from rtnl_net_fill() This argument is not used anymore. Fixes: cab3c8ec8d57 ("netns: always provide the id to rtnl_net_fill()") Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/net_namespace.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index fefe72774aeb..52b9620e3457 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -739,7 +739,7 @@ static int rtnl_net_get_size(void) } static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, int nsid) + int cmd, int nsid) { struct nlmsghdr *nlh; struct rtgenmsg *rth; @@ -801,7 +801,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, id = peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, net, id); + RTM_NEWNSID, id); if (err < 0) goto err_out; @@ -816,7 +816,6 @@ out: } struct rtnl_net_dump_cb { - struct net *net; struct sk_buff *skb; struct netlink_callback *cb; int idx; @@ -833,7 +832,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, id); + RTM_NEWNSID, id); if (ret < 0) return ret; @@ -846,7 +845,6 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { - .net = net, .skb = skb, .cb = cb, .idx = 0, @@ -876,7 +874,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); + err = rtnl_net_fill(msg, 0, 0, 0, cmd, id); if (err < 0) goto err_out; -- cgit v1.2.1 From a0732ad14d40ee7562c8c6e04b01af6134c82831 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:03 +0100 Subject: netns: introduce 'struct net_fill_args' This is a preparatory work. To avoid having to much arguments for the function rtnl_net_fill(), a new structure is defined. Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/net_namespace.c | 48 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 52b9620e3457..f8a5966b086c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -738,20 +738,28 @@ static int rtnl_net_get_size(void) ; } -static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, int nsid) +struct net_fill_args { + u32 portid; + u32 seq; + int flags; + int cmd; + int nsid; +}; + +static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), + args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nla_put_s32(skb, NETNSA_NSID, nsid)) + if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -767,10 +775,15 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + struct net_fill_args fillargs = { + .portid = NETLINK_CB(skb).portid, + .seq = nlh->nlmsg_seq, + .cmd = RTM_NEWNSID, + }; struct nlattr *nla; struct sk_buff *msg; struct net *peer; - int err, id; + int err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); @@ -799,9 +812,8 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - id = peernet2id(net, peer); - err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, id); + fillargs.nsid = peernet2id(net, peer); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -817,7 +829,7 @@ out: struct rtnl_net_dump_cb { struct sk_buff *skb; - struct netlink_callback *cb; + struct net_fill_args fillargs; int idx; int s_idx; }; @@ -830,9 +842,8 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) if (net_cb->idx < net_cb->s_idx) goto cont; - ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, - net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, id); + net_cb->fillargs.nsid = id; + ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; @@ -846,7 +857,12 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { .skb = skb, - .cb = cb, + .fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .cmd = RTM_NEWNSID, + }, .idx = 0, .s_idx = cb->args[0], }; @@ -867,6 +883,10 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) static void rtnl_net_notifyid(struct net *net, int cmd, int id) { + struct net_fill_args fillargs = { + .cmd = cmd, + .nsid = id, + }; struct sk_buff *msg; int err = -ENOMEM; @@ -874,7 +894,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, id); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; -- cgit v1.2.1 From cff478b9d9ccaee0de0e02700c63addf007b5d3c Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:04 +0100 Subject: netns: add support of NETNSA_TARGET_NSID Like it was done for link and address, add the ability to perform get/dump in another netns by specifying a 
target nsid attribute. Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/net_namespace.c | 86 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f8a5966b086c..885c54197e31 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -669,6 +669,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, + [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -780,9 +781,10 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, .seq = nlh->nlmsg_seq, .cmd = RTM_NEWNSID, }; + struct net *peer, *target = net; + bool put_target = false; struct nlattr *nla; struct sk_buff *msg; - struct net *peer; int err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, @@ -806,13 +808,27 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } + if (tb[NETNSA_TARGET_NSID]) { + int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); + + target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); + if (IS_ERR(target)) { + NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); + NL_SET_ERR_MSG(extack, + "Target netns reference is invalid"); + err = PTR_ERR(target); + goto out; + } + put_target = true; + } + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } - fillargs.nsid = peernet2id(net, peer); + fillargs.nsid = peernet2id(target, peer); err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -823,15 +839,19 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err_out: nlmsg_free(msg); out: + if (put_target) + put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { + struct net *tgt_net; struct sk_buff *skb; struct net_fill_args fillargs; int idx; int s_idx; + bool put_tgt_net; }; static int rtnl_net_dumpid_one(int id, void *peer, void *data) @@ -852,10 +872,50 @@ cont: return 0; } +static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, + struct rtnl_net_dump_cb *net_cb, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NETNSA_MAX + 1]; + int err, i; + + err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, + rtnl_net_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + if (i == NETNSA_TARGET_NSID) { + struct net *net; + + net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); + if (IS_ERR(net)) { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Invalid target network namespace id"); + return PTR_ERR(net); + } + net_cb->tgt_net = net; + net_cb->put_tgt_net = true; + } else { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { + .tgt_net = sock_net(skb->sk), .skb = skb, .fillargs = { .portid = NETLINK_CB(cb->skb).portid, @@ -866,19 +926,23 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) .idx = 0, .s_idx = cb->args[0], }; + int err = 0; - if 
(cb->strict_check && - nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { - NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); - return -EINVAL; + if (cb->strict_check) { + err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); + if (err < 0) + goto end; } - spin_lock_bh(&net->nsid_lock); - idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_bh(&net->nsid_lock); + spin_lock_bh(&net_cb.tgt_net->nsid_lock); + idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); cb->args[0] = net_cb.idx; - return skb->len; +end: + if (net_cb.put_tgt_net) + put_net(net_cb.tgt_net); + return err < 0 ? err : skb->len; } static void rtnl_net_notifyid(struct net *net, int cmd, int id) -- cgit v1.2.1 From 3a4f68bf660414801781fd06506a9c75c2d936e5 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:05 +0100 Subject: netns: enable to specify a nsid for a get request Combined with NETNSA_TARGET_NSID, it makes it possible to "translate" a nsid from one netns to the corresponding nsid in another netns. This is useful when using NETLINK_F_LISTEN_ALL_NSID because it helps the user interpret a nsid received from another netns. Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/net_namespace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 885c54197e31..dd25fb22ad45 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -797,6 +797,11 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; + } else if (tb[NETNSA_NSID]) { + peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + if (!peer) + peer = ERR_PTR(-ENOENT); + nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; } -- cgit v1.2.1 From 288f06a001eb6265122c620295b68a0dd53d1482 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Nov 2018 15:42:06 +0100 Subject: netns: enable to dump full nsid translation table Like the previous patch, the goal is to make it easier to convert nsids from one netns to another. A new attribute (NETNSA_CURRENT_NSID) is added to the kernel answer when NETNSA_TARGET_NSID is provided, thus the user can easily convert nsids. Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S.
Miller --- net/core/net_namespace.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index dd25fb22ad45..05b23b285058 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -736,6 +736,7 @@ static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } @@ -745,6 +746,8 @@ struct net_fill_args { int flags; int cmd; int nsid; + bool add_ref; + int ref_nsid; }; static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) @@ -763,6 +766,10 @@ static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) goto nla_put_failure; + if (args->add_ref && + nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -782,7 +789,6 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, .cmd = RTM_NEWNSID, }; struct net *peer, *target = net; - bool put_target = false; struct nlattr *nla; struct sk_buff *msg; int err; @@ -824,7 +830,8 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err = PTR_ERR(target); goto out; } - put_target = true; + fillargs.add_ref = true; + fillargs.ref_nsid = peernet2id(net, peer); } msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); @@ -844,7 +851,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err_out: nlmsg_free(msg); out: - if (put_target) + if (fillargs.add_ref) put_net(target); put_net(peer); return err; @@ -852,11 +859,11 @@ out: struct rtnl_net_dump_cb { struct net *tgt_net; + struct net *ref_net; struct sk_buff *skb; struct net_fill_args fillargs; int idx; int s_idx; - bool put_tgt_net; }; static int rtnl_net_dumpid_one(int id, void *peer, void *data) @@ -868,6 +875,8 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) goto cont; net_cb->fillargs.nsid = id; + if (net_cb->fillargs.add_ref) + net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; @@ -904,8 +913,9 @@ static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, "Invalid target network namespace id"); return PTR_ERR(net); } + net_cb->fillargs.add_ref = true; + net_cb->ref_net = net_cb->tgt_net; net_cb->tgt_net = net; - net_cb->put_tgt_net = true; } else { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, @@ -940,12 +950,22 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) } spin_lock_bh(&net_cb.tgt_net->nsid_lock); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net) && + !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + err = -EAGAIN; + goto end; + } idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net)) + spin_unlock_bh(&net_cb.ref_net->nsid_lock); spin_unlock_bh(&net_cb.tgt_net->nsid_lock); cb->args[0] = net_cb.idx; end: - if (net_cb.put_tgt_net) + if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); return err < 0 ? 
err : skb->len; } -- cgit v1.2.1 From 7246d8ed4dcce23f7509949a77be15fa9f0e3d28 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 26 Nov 2018 14:16:17 -0800 Subject: bpf: helper to pop data from messages This adds a BPF SK_MSG program helper so that we can pop data from a msg. We use this to pop metadata from a previous push data call. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/filter.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f50ea971f7a9..bd0df75dc7b6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2425,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { .arg4_type = ARG_ANYTHING, }; +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + int prev; + + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last >= msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. + * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the right, freeing the next element in ring to place B, leaving + * A untouched except to reduce length.
+ */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (pop < sge->length - a) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + sge->length = a; + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else if (pop >= sge->length - a) { + sge->length = a; + pop -= (sge->length - a); + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. + */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + sk_msg_iter_var_next(i); + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -5098,6 +5266,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_msg_push_data || + func == bpf_msg_pop_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5394,6 +5563,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.1 From 1464193107da8041e05341388964733bbba3be27 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 26 Nov 2018 09:31:26 -0800 Subject: net: explain __skb_checksum_complete() with comments Cc: Herbert Xu Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/core/dev.c | 1 + net/core/skbuff.c | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f69b2fcdee40..abe50c424b29 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5791,6 +5791,7 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 02cd7ae3d0fb..3c814565ed7c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2635,6 +2635,7 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) __sum16 sum; sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) @@ -2646,6 +2647,15 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) } EXPORT_SYMBOL(__skb_checksum_complete_head); +/* This function assumes skb->csum already holds pseudo header's checksum, + * which has been changed from the hardware checksum, for example, by + * __skb_checksum_validate_complete(). And, the original skb->csum must + * have been validated unsuccessfully for CHECKSUM_COMPLETE case. + * + * It returns non-zero if the recomputed checksum is still invalid, otherwise + * zero. The new checksum is stored back into skb->csum unless the skb is + * shared. + */ __sum16 __skb_checksum_complete(struct sk_buff *skb) { __wsum csum; @@ -2653,8 +2663,14 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) csum = skb_checksum(skb, 0, skb->len, 0); - /* skb->csum holds pseudo checksum */ sum = csum_fold(csum_add(skb->csum, csum)); + /* This check is inverted, because we already knew the hardware + * checksum is invalid before calling this function. So, if the + * re-computed checksum is valid instead, then we have a mismatch + * between the original skb->csum and skb_checksum(). This means either + * the original hardware checksum is incorrect or we screw up skb->csum + * when moving skb->data around. + */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) -- cgit v1.2.1 From b0e3f1bdf9e7140fd1151af575f468b5827a61e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Genevi=C3=A8ve=20Bastien?= Date: Tue, 27 Nov 2018 12:52:39 -0500 Subject: net: Add trace events for all receive exit points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trace events are already present for the receive entry points, to indicate how the reception entered the stack. This patch adds the corresponding exit trace events that will bound the reception such that all events occurring between the entry and the exit can be considered as part of the reception context. This greatly helps for dependency and root cause analyses. Without this, it is not possible with tracepoint instrumentation to determine whether a sched_wakeup event following a netif_receive_skb event is the result of the packet reception or a simple coincidence after further processing by the thread. It is possible using other mechanisms like kretprobes, but considering the "entry" points are already present, it would be good to add the matching exit events. 
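As an illustration of how the pair can be consumed (a minimal sketch, not part of this patch; the probe names and the register_trace_netif_receive_skb_entry()/_exit() hookup are assumed), an analysis module could bound each reception context with per-CPU timestamps:

	/* Sketch: pair the entry/exit tracepoints to delimit one reception. */
	static DEFINE_PER_CPU(u64, rx_entry_ts);

	/* attached via register_trace_netif_receive_skb_entry() */
	static void probe_rx_entry(void *data, struct sk_buff *skb)
	{
		*this_cpu_ptr(&rx_entry_ts) = ktime_get_ns();	/* reception begins */
	}

	/* attached via register_trace_netif_receive_skb_exit() */
	static void probe_rx_exit(void *data, int ret)
	{
		u64 span = ktime_get_ns() - *this_cpu_ptr(&rx_entry_ts);

		/* any event recorded on this CPU within 'span', e.g. a
		 * sched_wakeup, can be attributed to this reception
		 */
	}
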
In addition to linking packets with wakeups, the entry/exit event pair can also be used to perform network stack latency analyses. Signed-off-by: Geneviève Bastien CC: Mathieu Desnoyers CC: Steven Rostedt CC: Ingo Molnar CC: David S. Miller Reviewed-by: Steven Rostedt (VMware) (tracing side) Signed-off-by: David S. Miller --- net/core/dev.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index abe50c424b29..04a6b7100aac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4527,9 +4527,14 @@ static int netif_rx_internal(struct sk_buff *skb) int netif_rx(struct sk_buff *skb) { + int ret; + trace_netif_rx_entry(skb); - return netif_rx_internal(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_rx); @@ -4544,6 +4549,7 @@ int netif_rx_ni(struct sk_buff *skb) if (local_softirq_pending()) do_softirq(); preempt_enable(); + trace_netif_rx_ni_exit(err); return err; } @@ -5229,9 +5235,14 @@ static void netif_receive_skb_list_internal(struct list_head *head) */ int netif_receive_skb(struct sk_buff *skb) { + int ret; + trace_netif_receive_skb_entry(skb); - return netif_receive_skb_internal(skb); + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_receive_skb); @@ -5251,9 +5262,12 @@ void netif_receive_skb_list(struct list_head *head) if (list_empty(head)) return; - list_for_each_entry(skb, head, list) - trace_netif_receive_skb_list_entry(skb); + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + } netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); @@ -5645,12 +5659,17 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + gro_result_t ret; + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); - return napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + trace_napi_gro_receive_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_receive); @@ -5768,6 +5787,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) gro_result_t napi_gro_frags(struct napi_struct *napi) { + gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); if (!skb) @@ -5775,7 +5795,10 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_frags); -- cgit v1.2.1 From 420d031822737ef570df6834c0eae26f33988f20 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Nov 2018 22:32:30 -0800 Subject: rtnetlink: remove a level of indentation in rtnl_newlink() rtnl_newlink() used to create VLAs based on link kind. Since commit ccf8dbcd062a ("rtnetlink: Remove VLA usage") a statically sized array is created on the stack, so there is no more use for a separate code block that used to be the VLA's live range. While at it, christmas-tree the variables. Note that there is a goto-based retry, so to be on the safe side the variables can no longer be initialized in place.
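For reference, the reordered declarations at the top of the function now read (abbreviated from the hunk below):

	struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
	unsigned char name_assign_type = NET_NAME_USER;
	struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
	const struct rtnl_link_ops *m_ops = NULL;
	struct nlattr **slave_data;
	struct nlattr **data;
	int err;

data and slave_data are among the variables touched by the replay loop, so they are now assigned NULL inside the function body rather than initialized at declaration.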
It doesn't seem to matter, logically, but why make the code harder to read.. Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 313 +++++++++++++++++++++++++-------------------------- 1 file changed, 154 insertions(+), 159 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a498bb41c9aa..ee2caa2fd940 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2974,17 +2974,22 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; + unsigned char name_assign_type = NET_NAME_USER; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + const struct rtnl_link_ops *m_ops = NULL; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct net_device *master_dev = NULL; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - const struct rtnl_link_ops *m_ops = NULL; + struct nlattr *tb[IFLA_MAX + 1]; + struct net *dest_net, *link_net; + struct nlattr **slave_data; + char kind[MODULE_NAME_LEN]; struct net_device *dev; - struct net_device *master_dev = NULL; struct ifinfomsg *ifm; - char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - struct nlattr *linkinfo[IFLA_INFO_MAX+1]; - unsigned char name_assign_type = NET_NAME_USER; + struct nlattr **data; int err; #ifdef CONFIG_MODULES @@ -3040,195 +3045,185 @@ replay: ops = NULL; } - if (1) { - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; - struct nlattr **data = NULL; - struct nlattr **slave_data = NULL; - struct net *dest_net, *link_net = NULL; - - if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + data = NULL; + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, extack); - if (err < 0) - return err; - data = attr; - } - if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; - } + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data, extack); + if (err < 0) + return err; } + } - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; + slave_data = NULL; + if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested(slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - extack); - if (err < 0) - return err; - slave_data = slave_attr; - } + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + slave_data = slave_attr; } + } - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + if (dev) { + int status = 0; - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + 
if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; + } - err = m_ops->slave_changelink(master_dev, dev, - tb, slave_data, - extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; - return do_setlink(skb, dev, ifm, extack, tb, ifname, - status); + err = m_ops->slave_changelink(master_dev, dev, tb, + slave_data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, + return do_setlink(skb, dev, ifm, extack, tb, ifname, status); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); - return -ENODEV; - } + return -ENODEV; + } - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; - if (!ops) { + if (!ops) { #ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - NL_SET_ERR_MSG(extack, "Unknown device type"); - return -EOPNOTSUPP; + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; } +#endif + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } - if (!ops->setup) - return -EOPNOTSUPP; - - if (!ifname[0]) { - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - name_assign_type = NET_NAME_ENUM; - } + if (!ops->setup) + return -EOPNOTSUPP; - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); + if (!ifname[0]) { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb, extack); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; goto out; } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } - dev->ifindex = ifm->ifi_index; + dev = rtnl_create_link(link_net ? 
: dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } - if (ops->newlink) { - err = ops->newlink(link_net ? : net, dev, tb, data, - extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(dev); - goto out; - } - } else { - err = register_netdevice(dev); - if (err < 0) { + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) { + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure after registration + * so that device could be finally freed in rtnl_unlock. + */ + if (err < 0) { + /* If device is not registered at all, free it now */ + if (dev->reg_state == NETREG_UNINITIALIZED) free_netdev(dev); - goto out; - } + goto out; + } + } else { + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; } - err = rtnl_configure_link(dev, ifm); + } + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), - extack); - if (err) - goto out_unregister; - } + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } out: - if (link_net) - put_net(link_net); - put_net(dest_net); - return err; + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; out_unregister: - if (ops->newlink) { - LIST_HEAD(list_kill); + if (ops->newlink) { + LIST_HEAD(list_kill); - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); } + goto out; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.1 From a293974590cfdc2d59c559a54d62a5ecb648104b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Nov 2018 22:32:31 -0800 Subject: rtnetlink: avoid frame size warning in rtnl_newlink() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standard kernel compilation produces the following warning: net/core/rtnetlink.c: In function ‘rtnl_newlink’: net/core/rtnetlink.c:3232:1: warning: the frame size of 1288 bytes is larger than 1024 bytes [-Wframe-larger-than=] } ^ This should not really be an issue, as rtnl_newlink() stack is generally quite shallow. Fix the warning by allocating attributes with kmalloc() in a wrapper and passing it down to rtnl_newlink(), avoiding complexities on error paths. Alternatively we could kmalloc() some structure within rtnl_newlink(), slave attributes look like a good candidate. In practice it adds to already rather high complexity and length of the function. Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ee2caa2fd940..98876cd1e36c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2971,14 +2971,13 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attr, struct netlink_ext_ack *extack) { struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; unsigned char name_assign_type = NET_NAME_USER; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; const struct rtnl_link_ops *m_ops = NULL; - struct nlattr *attr[RTNL_MAX_TYPE + 1]; struct net_device *master_dev = NULL; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; @@ -3226,6 +3225,21 @@ out_unregister: goto out; } +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr **attr; + int ret; + + attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = __rtnl_newlink(skb, nlh, attr, extack); + kfree(attr); + return ret; +} + static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { -- cgit v1.2.1 From e3da08d057002f9d0831949d51666c3e15dc6b29 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Sun, 2 Dec 2018 20:18:19 -0500 Subject: bpf: allow BPF read access to qdisc pkt_len The pkt_len field in qdisc_skb_cb stores the skb length as it will appear on the wire after segmentation. For byte accounting, this value is more accurate than skb->len. It is computed on entry to the TC layer, so only valid there. Allow read access to this field from BPF tc classifier and action programs. The implementation is analogous to tc_classid, aside from restricting to read access. To distinguish it from skb->len and to be self-describing, export it as wire_len.
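For example, a tc classifier can now account for bytes as they will appear on the wire; an illustrative sketch (not from this patch; the SEC() annotation and the accounting itself follow assumed libbpf conventions):

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>

	SEC("classifier")
	int acct_wire_bytes(struct __sk_buff *skb)
	{
		/* wire_len mirrors qdisc_skb_cb->pkt_len: the post-segmentation
		 * on-wire length, more accurate than skb->len for accounting
		 */
		__u64 bytes = skb->wire_len;

		/* ... add 'bytes' to a per-CPU counter map here ... */
		return TC_ACT_OK;
	}
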
Changes v1->v2 - Rename pkt_len to wire_len Signed-off-by: Petar Penkov Signed-off-by: Vlad Dumitrescu Signed-off-by: Willem de Bruijn Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- net/core/filter.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index bd0df75dc7b6..3d54af4c363d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5773,6 +5773,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5797,6 +5798,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): @@ -5843,6 +5845,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6273,6 +6276,7 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6360,6 +6364,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6685,6 +6690,17 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sk_buff, tstamp, 8, target_size)); + break; + + case offsetof(struct __sk_buff, wire_len): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, wire_len); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, pkt_len); + *target_size = 4; + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); } return insn - insn_buf; -- cgit v1.2.1 From 846e980a87fc30075517d6d979548294d5461bdb Mon Sep 17 00:00:00 2001 From: Shalom Toledo Date: Mon, 3 Dec 2018 07:58:59 +0000 Subject: devlink: Add 'fw_load_policy' generic parameter Many drivers load the device's firmware image during the initialization flow either from the flash or from the disk. Currently this option is not controlled by the user and the driver decides from where to load the firmware image. 'fw_load_policy' gives the ability to control this option which allows the user to choose between different loading policies supported by the driver. This parameter can be useful while testing and/or debugging the device. For example, testing a firmware bug fix. Signed-off-by: Shalom Toledo Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
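On the driver side, opting in amounts to registering the generic parameter during probe; a rough sketch (illustrative only; the cmode choice and the validate callback are per-driver assumptions):

	static const struct devlink_param my_params[] = {
		DEVLINK_PARAM_GENERIC(FW_LOAD_POLICY,
				      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
				      NULL, NULL, my_fw_load_policy_validate),
	};

	err = devlink_params_register(devlink, my_params, ARRAY_SIZE(my_params));
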
Miller --- net/core/devlink.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a4b29a13d31..abb0da9d7b4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2692,6 +2692,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, + .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, + .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.1 From b5947e5d1e710c35ea281247bd27e6975250285c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 30 Nov 2018 15:32:39 -0500 Subject: udp: msg_zerocopy Extend zerocopy to udp sockets. Allow setting sockopt SO_ZEROCOPY and interpret flag MSG_ZEROCOPY. This patch was previously part of the zerocopy RFC patchsets. Zerocopy is not effective at small MTU. With segmentation offload building larger datagrams, the benefit of page flipping outweighs the cost of generating a completion notification. tools/testing/selftests/net/msg_zerocopy.sh after applying follow-on test patch and making skb_orphan_frags_rx same as skb_orphan_frags: ipv4 udp -t 1 tx=191312 (11938 MB) txc=0 zc=n rx=191312 (11938 MB) ipv4 udp -z -t 1 tx=304507 (19002 MB) txc=304507 zc=y rx=304507 (19002 MB) ok ipv6 udp -t 1 tx=174485 (10888 MB) txc=0 zc=n rx=174485 (10888 MB) ipv6 udp -z -t 1 tx=294801 (18396 MB) txc=294801 zc=y rx=294801 (18396 MB) ok Changes v1 -> v2 - Fixup reverse christmas tree violation v2 -> v3 - Split refcount avoidance optimization into separate patch - Fix refcount leak on error in fragmented case (thanks to Paolo Abeni for pointing this one out!) - Fix refcount inc on zero - Test sock_flag SOCK_ZEROCOPY directly in __ip_append_data. This is needed since commit 5cf4a8532c99 ("tcp: really ignore MSG_ZEROCOPY if no SO_ZEROCOPY") did the same for tcp. Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S.
Miller --- net/core/skbuff.c | 6 ++++++ net/core/sock.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3c814565ed7c..1350901c5cb8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1105,6 +1105,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) diff --git a/net/core/sock.c b/net/core/sock.c index 6d7e189e3cd9..f5bb89785e47 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1018,7 +1018,10 @@ set_rcvbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!((sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; -- cgit v1.2.1 From 52900d22288e7d45846037e1db277c665bbc40db Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 30 Nov 2018 15:32:40 -0500 Subject: udp: elide zerocopy operation in hot path With MSG_ZEROCOPY, each skb holds a reference to a struct ubuf_info. Release of its last reference triggers a completion notification. The TCP stack in tcp_sendmsg_locked holds an extra ref independent of the skbs, because it can build, send and free skbs within its loop, possibly reaching refcount zero and freeing the ubuf_info too soon. The UDP stack currently also takes this extra ref, but does not need it as all skbs are sent after return from __ip(6)_append_data. Avoid the extra refcount_inc and refcount_dec_and_test, and generally the sock_zerocopy_put in the common path, by passing the initial reference to the first skb. This approach is taken instead of initializing the refcount to 0, as that would generate error "refcount_t: increment on 0" on the next skb_zcopy_set. Changes v3 -> v4 - Move skb_zcopy_set below the only kfree_skb that might cause a premature uarg destroy before skb_zerocopy_put_abort - Move the entire skb_shinfo assignment block, to keep that cacheline access in one place Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1350901c5cb8..c78ce114537e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg) } EXPORT_SYMBOL_GPL(sock_zerocopy_put); -void sock_zerocopy_put_abort(struct ubuf_info *uarg) +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { struct sock *sk = skb_from_uarg(uarg)->sk; @@ -1097,7 +1097,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - sock_zerocopy_put(uarg); + if (have_uref) + sock_zerocopy_put(uarg); } } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); @@ -1137,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1157,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } - skb_zcopy_set(nskb, skb_uarg(orig)); + skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } -- cgit v1.2.1 From 875e8939953483d856de226b72d14c6a000f9457 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Dec 2018 08:15:10 +0000 Subject: skbuff: Rename 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark' Commit abf4bb6b63d0 ("skbuff: Add the offload_mr_fwd_mark field") added the 'offload_mr_fwd_mark' field to indicate that a packet has already undergone L3 multicast routing by a capable device. The field is used to prevent the kernel from forwarding a packet through a netdev through which the device has already forwarded the packet. Currently, no unicast packet is routed by both the device and the kernel, but this is about to change in subsequent patches and we need to be able to mark such packets, so that they will not be forwarded twice. Instead of adding yet another field to 'struct sk_buff', we can just rename 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark', as a packet either has a multicast or a unicast destination IP. While at it, add a comment about both 'offload_fwd_mark' and 'offload_l3_fwd_mark'. Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c78ce114537e..40552547c69a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4885,7 +4885,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) #ifdef CONFIG_NET_SWITCHDEV skb->offload_fwd_mark = 0; - skb->offload_mr_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; #endif if (!xnet) -- cgit v1.2.1 From a74f0fa082b76c6a76cba5672f36218518bfdc09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Dec 2018 07:58:17 -0800 Subject: tcp: reduce POLLOUT events caused by TCP_NOTSENT_LOWAT The TCP_NOTSENT_LOWAT socket option or sysctl was added in linux-3.12 as a step to enable bigger tcp sndbuf limits. It works reasonably well, but the following happens: once the limit is reached, the TCP stack generates an [E]POLLOUT event for every incoming ACK packet. This causes a high number of context switches.
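For context, the limit in question is the one an application sets with the socket option, or globally via the sysctl used in the test below; an illustrative userspace fragment:

	int lowat = 2 * 1024 * 1024;	/* allow up to 2 MB of unsent data */

	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
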
This patch implements the strategy David Miller added in sock_def_write_space() : - If TCP socket has a notsent_lowat constraint of X bytes, allow sendmsg() to fill up to X bytes, but send [E]POLLOUT only if number of notsent bytes is below X/2 This considerably reduces TCP_NOTSENT_LOWAT overhead, while allowing to keep the pipe full. Tested: 100 ms RTT netem testbed between A and B, 100 concurrent TCP_STREAM A:/# cat /proc/sys/net/ipv4/tcp_wmem 4096 262144 64000000 A:/# super_netperf 100 -H B -l 1000 -- -K bbr & A:/# grep TCP /proc/net/sockstat TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 1364904 # This is about 54 MB of memory per flow :/ A:/# vmstat 5 5 procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 0 0 0 256220672 13532 694976 0 0 10 0 28 14 0 1 99 0 0 2 0 0 256320016 13532 698480 0 0 512 0 715901 5927 0 10 90 0 0 0 0 0 256197232 13532 700992 0 0 735 13 771161 5849 0 11 89 0 0 1 0 0 256233824 13532 703320 0 0 512 23 719650 6635 0 11 89 0 0 2 0 0 256226880 13532 705780 0 0 642 4 775650 6009 0 12 88 0 0 A:/# echo 2097152 >/proc/sys/net/ipv4/tcp_notsent_lowat A:/# grep TCP /proc/net/sockstat TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 86411 # 3.5 MB per flow A:/# vmstat 5 5 # check that context switches have not inflated too much. procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 2 0 0 260386512 13592 662148 0 0 10 0 17 14 0 1 99 0 0 0 0 0 260519680 13592 604184 0 0 512 13 726843 12424 0 10 90 0 0 1 1 0 260435424 13592 598360 0 0 512 25 764645 12925 0 10 90 0 0 1 0 0 260855392 13592 578380 0 0 512 7 722943 13624 0 11 88 0 0 1 0 0 260445008 13592 601176 0 0 614 34 772288 14317 0 10 90 0 0 Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/core/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/stream.c b/net/core/stream.c index 7d329fb1f553..e94bb02a5629 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_is_writeable(sk) && sock) { + if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); -- cgit v1.2.1 From 7a35a50df5a34e6eaee1b1f4f913b84879068fee Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 5 Dec 2018 20:02:29 -0800 Subject: neighbor: Add extack messages for add and delete commands Add extack messages for failures in neigh_add and neigh_delete. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 55 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 41954e42a2de..6d479b5562be 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1137,8 +1137,9 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. 
*/ -int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags, u32 nlmsg_pid) +static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, + u8 new, u32 flags, u32 nlmsg_pid, + struct netlink_ext_ack *extack) { u8 old; int err; @@ -1155,8 +1156,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - if (neigh->dead) + if (neigh->dead) { + NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); goto out; + } neigh_update_ext_learned(neigh, flags, ¬ify); @@ -1193,8 +1196,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, use it, otherwise discard the request. */ err = -EINVAL; - if (!(old & NUD_VALID)) + if (!(old & NUD_VALID)) { + NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; + } lladdr = neigh->ha; } @@ -1307,6 +1312,12 @@ out: return err; } + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags, u32 nlmsg_pid) +{ + return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); +} EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is @@ -1678,8 +1689,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); - if (dst_attr == NULL) + if (!dst_attr) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1694,8 +1707,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < (int)tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); @@ -1711,10 +1726,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh, tbl); @@ -1744,8 +1758,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; err = -EINVAL; - if (tb[NDA_DST] == NULL) + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1755,16 +1771,21 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { + NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; + } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } + dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? 
nla_data(tb[NDA_LLADDR]) : NULL; @@ -1780,8 +1801,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (dev == NULL) + if (!dev) { + NL_SET_ERR_MSG(extack, "Device not specified"); goto out; + } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { @@ -1817,8 +1840,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_event_send(neigh, NULL); err = 0; } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); neigh_release(neigh); out: -- cgit v1.2.1 From 00f54e68924eaf075f3f24be18557899d347bc4a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:36 +0000 Subject: net: core: dev: Add extack argument to dev_open() In order to pass extack together with NETDEV_PRE_UP notifications, it's necessary to route the extack to __dev_open() from diverse (possibly indirect) callers. One prominent API through which the notification is invoked is dev_open(). Therefore extend dev_open() with an extra extack argument and update all users. Most of the calls end up just encoding NULL, but bond and team drivers have the extack readily available. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++-- net/core/netpoll.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 04a6b7100aac..b801c1aafd70 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1406,7 +1406,8 @@ static int __dev_open(struct net_device *dev) /** * dev_open - prepare an interface for use. - * @dev: device to open + * @dev: device to open + * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally @@ -1416,7 +1417,7 @@ static int __dev_open(struct net_device *dev) * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ -int dev_open(struct net_device *dev) +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2b9fdbc43205..36a2b63ffd6d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -663,7 +663,7 @@ int netpoll_setup(struct netpoll *np) np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - err = dev_open(ndev); + err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); -- cgit v1.2.1 From 567c5e13be5cc74d24f5eb54cf353c2e2277189b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:42 +0000 Subject: net: core: dev: Add extack argument to dev_change_flags() In order to pass extack together with NETDEV_PRE_UP notifications, it's necessary to route the extack to __dev_open() from diverse (possibly indirect) callers. One prominent API through which the notification is invoked is dev_change_flags(). Therefore extend dev_change_flags() with an extra extack argument and update all users. Most of the calls end up just encoding NULL, but several sites (VLAN, ipvlan, VRF, rtnetlink) do have extack available. Since the function declaration line is changed anyway, name the other function arguments to placate checkpatch. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S.
Miller --- net/core/dev.c | 4 +++- net/core/dev_ioctl.c | 2 +- net/core/net-sysfs.c | 2 +- net/core/rtnetlink.c | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b801c1aafd70..8bba6f98b545 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7595,11 +7595,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, * dev_change_flags - change device settings * @dev: device * @flags: device state flags + * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ -int dev_change_flags(struct net_device *dev, unsigned int flags) +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 90e8aa36881e..da273ec3cc57 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -234,7 +234,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bd67c4d0fcfd..ff9fd2bb4ce4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -337,7 +337,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(dev, (unsigned int)new_flags); + return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 98876cd1e36c..4c9e4e187600 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2489,7 +2489,8 @@ static int do_setlink(const struct sk_buff *skb, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } -- cgit v1.2.1 From 6d0403216d030e5623de3911168fceeaac2e14d6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:43 +0000 Subject: net: core: dev: Add extack argument to __dev_change_flags() In order to pass extack together with NETDEV_PRE_UP notifications, it's necessary to route the extack to __dev_open() from diverse (possibly indirect) callers. The last missing API is __dev_change_flags(). Therefore extend __dev_change_flags() with an extra extack argument and update the two existing users. Since the function declaration line is changed anyway, name the struct net_device argument to placate checkpatch. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S.
Miller --- net/core/dev.c | 5 +++-- net/core/rtnetlink.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 8bba6f98b545..b37e320def13 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7498,7 +7498,8 @@ unsigned int dev_get_flags(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_flags); -int __dev_change_flags(struct net_device *dev, unsigned int flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; @@ -7606,7 +7607,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; - ret = __dev_change_flags(dev, flags); + ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4c9e4e187600..91a0f7477f8e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2871,7 +2871,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { - err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + NULL); if (err < 0) return err; } -- cgit v1.2.1 From 263726053400b9c6671df8e87d3db9728199da13 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:45 +0000 Subject: net: core: dev: Add call_netdevice_notifiers_extack() In order to propagate extack through NETDEV_PRE_UP, add a new function call_netdevice_notifiers_extack() that primes the extack field of the notifier info. Convert call_netdevice_notifiers() to a simple wrapper around the new function that passes NULL for extack. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/dev.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b37e320def13..4b033af8e6cd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -162,6 +162,9 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); static struct napi_struct *napi_by_id(unsigned int napi_id); /* @@ -1734,6 +1737,18 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} + /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1745,11 +1760,7 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info = { - .dev = dev, - }; - - return call_netdevice_notifiers_info(val, &info); + return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); -- cgit v1.2.1 From 40c900aa1ff580afe941ff77f327f004546f0ce7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 6 Dec 2018 17:05:47 +0000 Subject: net: core: dev: Attach extack to NETDEV_PRE_UP Drivers may need to validate configuration of a device that's about to be upped. Should the validation fail, there's currently no way to communicate details of the failure to the user, beyond an error number. To mend that, change __dev_open() to take an extack argument and pass it from __dev_change_flags() and dev_open(), where it was propagated in the previous patches. Change __dev_open() to call call_netdevice_notifiers_extack() so that the passed-in extack is attached to the NETDEV_PRE_UP notifier. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4b033af8e6cd..068b60db35ae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1364,7 +1364,7 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -static int __dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; @@ -1380,7 +1380,7 @@ static int __dev_open(struct net_device *dev) */ netpoll_poll_disable(dev); - ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; @@ -1427,7 +1427,7 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (dev->flags & IFF_UP) return 0; - ret = __dev_open(dev); + ret = __dev_open(dev, extack); if (ret < 0) return ret; @@ -7547,7 +7547,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, if (old_flags & IFF_UP) __dev_close(dev); else - ret = __dev_open(dev); + ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { -- cgit v1.2.1 From 58956317c8de52009d1a38a721474c24aef74fe7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 7 Dec 2018 12:24:57 -0800 Subject: neighbor: Improve garbage collection The existing garbage collection algorithm has a number of problems: 1. The gc algorithm will not evict PERMANENT entries as those entries are managed by userspace, yet the existing algorithm walks the entire hash table which means it always considers PERMANENT entries when looking for entries to evict. In some use cases (e.g., EVPN) there can be tens of thousands of PERMANENT entries leading to wasted CPU cycles when gc kicks in. As an example, with 32k permanent entries, neigh_alloc has been observed taking more than 4 msec per invocation. 2. Currently, when the number of neighbor entries hits gc_thresh2 and the last flush for the table was more than 5 seconds ago, gc kicks in and walks the entire hash table, evicting *all* entries not in PERMANENT or REACHABLE state and not marked as externally learned. There is no discriminator on when the neigh entry was created or if it just moved from REACHABLE to another NUD_VALID state (e.g., NUD_STALE). It is possible for entries to be created or for established neighbor entries to be moved to STALE (e.g., an external node sends an ARP request) right before the 5 second window lapses: -----|---------x|----------|----- t-5 t t+5 If that happens those entries are evicted during gc causing unnecessary thrashing on neighbor entries and userspace caches trying to track them. Further, this contradicts the description of gc_thresh2 which says "Entries older than 5 seconds will be cleared". One workaround is to make gc_thresh2 == gc_thresh3 but that negates the whole point of having separate thresholds. 3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries when gc_thresh2 is exceeded is overkill and contributes to thrashing, especially during startup. This patch addresses these problems as follows: 1. Use of a separate list_head to track entries that can be garbage collected along with a separate counter. PERMANENT entries are not added to this list. The gc_thresh parameters are only compared to the new counter, not the total entries in the table.
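(For reference, the header-side additions implied by this change — include/net/neighbour.h is outside this net/core-only diff — look roughly like the sketch below; field names match the net/core code that follows, but the surrounding layout is abridged:

struct neighbour {
	...
	struct list_head	gc_list;	/* linkage on tbl->gc_list; empty when exempt from gc */
};

struct neigh_table {
	...
	atomic_t		gc_entries;	/* entries subject to gc, checked against gc_thresh* */
	struct list_head	gc_list;	/* eviction candidates; oldest entries at the front */
};
)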
The forced_gc function is updated to only walk this new gc_list looking for entries to evict. 2. Entries are added to the list head at the tail and removed from the front. 3. Entries are only evicted if they were last updated more than 5 seconds ago, adhering to the original intent of gc_thresh2. 4. Forced gc is stopped once the number of gc_entries drops below gc_thresh2. 5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped when allocating a new neighbor for a PERMANENT entry. By extension this means there are no explicit limits on the number of PERMANENT entries that can be created, but this is no different than FIB entries or FDB entries. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 119 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 85 insertions(+), 34 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6d479b5562be..c3b58712e98b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -118,6 +118,34 @@ unsigned long neigh_rand_reach_time(unsigned long base) } EXPORT_SYMBOL(neigh_rand_reach_time); +static void neigh_mark_dead(struct neighbour *n) +{ + n->dead = 1; + if (!list_empty(&n->gc_list)) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } +} + +static void neigh_change_state(struct neighbour *n, u8 new) +{ + bool on_gc_list = !list_empty(&n->gc_list); + bool new_is_perm = new & NUD_PERMANENT; + + n->nud_state = new; + + /* remove from the gc list if new state is permanent; + * add to the gc list if new state is not permanent + */ + if (new_is_perm && on_gc_list) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } else if (!new_is_perm && !on_gc_list) { + /* add entries to the tail; cleaning removes from the front */ + list_add_tail(&n->gc_list, &n->tbl->gc_list); + atomic_inc(&n->tbl->gc_entries); + } +} static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, struct neighbour __rcu **np, struct neigh_table *tbl) @@ -132,7 +160,7 @@ static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); - n->dead = 1; + neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); @@ -166,32 +194,31 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl) { + int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + unsigned long tref = jiffies - 5 * HZ; + u8 flags = NTF_EXT_LEARNED; + struct neighbour *n, *tmp; + u8 state = NUD_PERMANENT; int shrunk = 0; - int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - /* Neighbour record may be discarded if: - * - nobody refers to it. 
- * - it is not permanent - */ - if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, - tbl)) { - shrunk = 1; - continue; - } - np = &n->next; + list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { + if (refcount_read(&n->refcnt) == 1) { + bool remove = false; + + write_lock(&n->lock); + if (!(n->nud_state & state) && !(n->flags & flags) && + time_after(tref, n->updated)) + remove = true; + write_unlock(&n->lock); + + if (remove && neigh_remove_one(n, tbl)) + shrunk++; + if (shrunk >= max_clean) + break; } } @@ -260,8 +287,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); - n->dead = 1; - + neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. We must destroy neighbour entry, @@ -321,13 +347,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, + struct net_device *dev, + bool permanent) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; - entries = atomic_inc_return(&tbl->entries) - 1; + if (permanent) + goto do_alloc; + + entries = atomic_inc_return(&tbl->gc_entries) - 1; if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -340,6 +371,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device } } +do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; @@ -358,11 +390,19 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; + + if (!permanent) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + else + INIT_LIST_HEAD(&n->gc_list); + + atomic_inc(&tbl->entries); out: return n; out_entries: - atomic_dec(&tbl->entries); + if (!permanent) + atomic_dec(&tbl->gc_entries); goto out; } @@ -505,13 +545,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, bool want_ref) +static struct neighbour *___neigh_create(struct neigh_table *tbl, + const void *pkey, + struct net_device *dev, + bool permanent, bool want_ref) { + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, permanent); u32 hash_val; unsigned int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { @@ -591,6 +633,12 @@ out_neigh_release: neigh_release(n); goto out; } + +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) +{ + return ___neigh_create(tbl, pkey, dev, false, want_ref); +} EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) @@ -854,7 +902,7 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; - n->dead = 1; + neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; @@ -1167,7 +1215,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); - neigh->nud_state = new; + 
neigh_change_state(neigh, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1246,7 +1294,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - neigh->nud_state = new; + neigh_change_state(neigh, new); notify = 1; } @@ -1582,6 +1630,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); + INIT_LIST_HEAD(&tbl->gc_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1813,7 +1862,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - neigh = __neigh_lookup_errno(tbl, dst, dev); + neigh = ___neigh_create(tbl, dst, dev, + ndm->ndm_state & NUD_PERMANENT, + true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; } @@ -2654,7 +2705,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); - n->dead = 1; + neigh_mark_dead(n); } else np = &n->next; write_unlock(&n->lock); -- cgit v1.2.1 From 0fbe82e628c817e292ff588cd5847fc935e025f2 Mon Sep 17 00:00:00 2001 From: yupeng Date: Wed, 5 Dec 2018 18:56:28 -0800 Subject: net: call sk_dst_reset when set SO_DONTROUTE After SO_DONTROUTE is set to 1, the IP layer should not route packets if the dest IP address is not in link scope. But if the socket has cached the dst_entry, such packets would be routed until the sk_dst_cache expires. So we should clean the sk_dst_cache when a user sets the SO_DONTROUTE option. Below are server/client python scripts which could reproduce this issue: server side code: ========================================================================== import socket import struct import time s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.bind(('0.0.0.0', 9000)) s.listen(1) sock, addr = s.accept() sock.setsockopt(socket.SOL_SOCKET, socket.SO_DONTROUTE, struct.pack('i', 1)) while True: sock.send(b'foo') time.sleep(1) ========================================================================== client side code: ========================================================================== import socket import time s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect(('server_address', 9000)) while True: data = s.recv(1024) print(data) ========================================================================== Signed-off-by: yupeng Signed-off-by: David S. Miller --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index f5bb89785e47..f00902c532cc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -700,6 +700,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); -- cgit v1.2.1 From 8cc196d6ef86bbab01dbf16f6c596cf56aa0839e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 10 Dec 2018 13:54:07 -0800 Subject: neighbor: gc_list changes should be protected by table lock Adding and removing neighbor entries to / from the gc_list need to be done while holding the table lock; a couple of places were missed in the original patch. Move the list_add_tail in neigh_alloc to ___neigh_create where the lock is already obtained.
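(The rule being enforced here, as a simplified sketch rather than the literal patch: any gc_list manipulation must run with the table lock held:

	write_lock_bh(&tbl->lock);	/* tbl->gc_list and tbl->gc_entries are protected by tbl->lock */
	list_add_tail(&n->gc_list, &tbl->gc_list);
	atomic_inc(&tbl->gc_entries);
	write_unlock_bh(&tbl->lock);
)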
Since neighbor entries should rarely be moved to/from PERMANENT state, add lock/unlock around the gc_list changes in neigh_change_state rather than extending the lock hold around all neighbor updates. Fixes: 58956317c8de ("neighbor: Improve garbage collection") Reported-by: Andrei Vagin Reported-by: syzbot+6cc2fd1d3bdd2e007363@syzkaller.appspotmail.com Reported-by: syzbot+35e87b87c00f386b041f@syzkaller.appspotmail.com Reported-by: syzbot+b354d1fb59091ea73c37@syzkaller.appspotmail.com Reported-by: syzbot+3ddead5619658537909b@syzkaller.appspotmail.com Reported-by: syzbot+424d47d5c456ce8b2bbe@syzkaller.appspotmail.com Reported-by: syzbot+e4d42eb35f6a27b0a628@syzkaller.appspotmail.com Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index c3b58712e98b..03fdc5ae66b0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -138,11 +138,17 @@ static void neigh_change_state(struct neighbour *n, u8 new) * add to the gc list if new state is not permanent */ if (new_is_perm && on_gc_list) { + write_lock_bh(&n->tbl->lock); list_del_init(&n->gc_list); + write_unlock_bh(&n->tbl->lock); + atomic_dec(&n->tbl->gc_entries); } else if (!new_is_perm && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ + write_lock_bh(&n->tbl->lock); list_add_tail(&n->gc_list, &n->tbl->gc_list); + write_unlock_bh(&n->tbl->lock); + atomic_inc(&n->tbl->gc_entries); } } @@ -390,11 +396,7 @@ do_alloc: n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; - - if (!permanent) - list_add_tail(&n->gc_list, &n->tbl->gc_list); - else - INIT_LIST_HEAD(&n->gc_list); + INIT_LIST_HEAD(&n->gc_list); atomic_inc(&tbl->entries); out: @@ -616,6 +618,9 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, } n->dead = 0; + if (!permanent) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, -- cgit v1.2.1 From 2fd527b72bb6f95dfe8a1902e998cb76390c431e Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 12 Dec 2018 17:02:48 +0000 Subject: net: ndo_bridge_setlink: Add extack Drivers may not be able to implement a VLAN addition or reconfiguration. In those cases it's desirable to explain to the user that it was rejected (and why). To that end, add extack argument to ndo_bridge_setlink. Adapt all users to that change. Following patches will use the new argument in the bridge driver. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c9c0407a7ee0..3b6e551f9e69 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4332,7 +4332,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, + extack); if (err) goto out; @@ -4344,7 +4345,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, - flags); + flags, + extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; -- cgit v1.2.1 From 3a37a9636cf3a1af2621a33f7eef8a2a3da81030 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 13 Dec 2018 11:54:30 +0000 Subject: net: dev: Add extack argument to dev_set_mac_address() A follow-up patch will add a notifier type NETDEV_PRE_CHANGEADDR, which allows vetoing of MAC address changes. One prominent path to that notification is through dev_set_mac_address(). Therefore give this function an extack argument, so that it can be packed together with the notification. Thus a textual reason for rejection (or a warning) can be communicated back to the user. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- net/core/dev_ioctl.c | 2 +- net/core/rtnetlink.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 754284873355..7250a3a73fa4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7759,10 +7759,12 @@ EXPORT_SYMBOL(dev_set_group); * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address + * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index da273ec3cc57..31380fd5a4e2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -246,7 +246,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3b6e551f9e69..f8bdb8adab2c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2444,7 +2444,7 @@ static int do_setlink(const struct sk_buff *skb, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = dev_set_mac_address(dev, sa); + err = dev_set_mac_address(dev, sa, extack); kfree(sa); if (err) goto errout; -- cgit v1.2.1 From 1570415f0810fce085066fb39827397452c3965a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 13 Dec 2018 11:54:33 +0000 Subject: net: dev: Add NETDEV_PRE_CHANGEADDR The NETDEV_CHANGEADDR notification is emitted after a device address changes. 
Extending this message to allow vetoing is certainly possible, but several other notification types have instead adopted a simple two-stage approach: first a "pre" notification is sent to make sure all interested parties are OK with a change that's about to be done. Then the change is done, and afterwards a "post" notification is sent. This dual approach is easier to use: when the change is vetoed, nothing has changed yet, and it's therefore unnecessary to roll anything back. Therefore adopt it for NETDEV_CHANGEADDR as well. To that end, add NETDEV_PRE_CHANGEADDR and an info structure to go along with it. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7250a3a73fa4..01497b7d1bdf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1589,6 +1589,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) + N(PRE_CHANGEADDR) } #undef N return "UNKNOWN_NETDEV_EVENT"; -- cgit v1.2.1 From d59cdf9475ad84d1f57cab1d162cf289702cfb15 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 13 Dec 2018 11:54:35 +0000 Subject: net: dev: Issue NETDEV_PRE_CHANGEADDR When a device address is about to be changed, or an address added to the list of device HW addresses, it is necessary to ensure that all interested parties can support the address. Therefore, send the NETDEV_PRE_CHANGEADDR notification, and if anyone bails on it, do not change the address. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/dev.c | 24 ++++++++++++++++++++++++ net/core/dev_addr_lists.c | 3 +++ 2 files changed, 27 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 01497b7d1bdf..ed9aa4a91f1f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7756,6 +7756,27 @@ void dev_set_group(struct net_device *dev, int new_group) } EXPORT_SYMBOL(dev_set_group); +/** + * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. 
+ * @dev: device + * @addr: new address + * @extack: netlink extended ack + */ +int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_pre_changeaddr_info info = { + .info.dev = dev, + .info.extack = extack, + .dev_addr = addr, + }; + int rc; + + rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); + return notifier_to_errno(rc); +} +EXPORT_SYMBOL(dev_pre_changeaddr_notify); + /** * dev_set_mac_address - Change Media Access Control Address * @dev: device @@ -7776,6 +7797,9 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; + err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + if (err) + return err; err = ops->ndo_set_mac_address(dev, sa); if (err) return err; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 81a8cd4ea3bd..a6723b306717 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -498,6 +498,9 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, ASSERT_RTNL(); + err = dev_pre_changeaddr_notify(dev, addr, NULL); + if (err) + return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); -- cgit v1.2.1 From 9c29a2f55ec05cc8b525ee3b2d75d3cd37911123 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:21 -0700 Subject: neighbor: Fix locking order for gc_list changes Lock checker noted an inverted lock order between neigh_change_state (neighbor lock then table lock) and neigh_periodic_work (table lock and then neighbor lock) resulting in: [ 121.057652] ====================================================== [ 121.058740] WARNING: possible circular locking dependency detected [ 121.059861] 4.20.0-rc6+ #43 Not tainted [ 121.060546] ------------------------------------------------------ [ 121.061630] kworker/0:2/65 is trying to acquire lock: [ 121.062519] (____ptrval____) (&n->lock){++--}, at: neigh_periodic_work+0x237/0x324 [ 121.063894] [ 121.063894] but task is already holding lock: [ 121.064920] (____ptrval____) (&tbl->lock){+.-.}, at: neigh_periodic_work+0x194/0x324 [ 121.066274] [ 121.066274] which lock already depends on the new lock. [ 121.066274] [ 121.067693] [ 121.067693] the existing dependency chain (in reverse order) is: ... Fix by renaming neigh_change_state to neigh_update_gc_list, changing it to only manage whether an entry should be on the gc_list and taking locks in the same order as neigh_periodic_work. Invoke it at the end of neigh_update only if the old and new states differ in the PERMANENT flag. Fixes: 8cc196d6ef86 ("neighbor: gc_list changes should be protected by table lock") Signed-off-by: David Ahern Signed-off-by: David S.
Miller --- net/core/neighbour.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 03fdc5ae66b0..010784123bc1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -127,30 +127,30 @@ static void neigh_mark_dead(struct neighbour *n) } } -static void neigh_change_state(struct neighbour *n, u8 new) +static void neigh_update_gc_list(struct neighbour *n) { - bool on_gc_list = !list_empty(&n->gc_list); - bool new_is_perm = new & NUD_PERMANENT; + bool on_gc_list, new_is_perm; - n->nud_state = new; + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); /* remove from the gc list if new state is permanent; * add to the gc list if new state is not permanent */ + new_is_perm = n->nud_state & NUD_PERMANENT; + on_gc_list = !list_empty(&n->gc_list); + if (new_is_perm && on_gc_list) { - write_lock_bh(&n->tbl->lock); list_del_init(&n->gc_list); - write_unlock_bh(&n->tbl->lock); - atomic_dec(&n->tbl->gc_entries); } else if (!new_is_perm && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ - write_lock_bh(&n->tbl->lock); list_add_tail(&n->gc_list, &n->tbl->gc_list); - write_unlock_bh(&n->tbl->lock); - atomic_inc(&n->tbl->gc_entries); } + + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); } static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, @@ -1220,7 +1220,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); - neigh_change_state(neigh, new); + neigh->nud_state = new; err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1299,7 +1299,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - neigh_change_state(neigh, new); + neigh->nud_state = new; notify = 1; } @@ -1360,6 +1360,9 @@ out: neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); + if ((new ^ old) & NUD_PERMANENT) + neigh_update_gc_list(neigh); + if (notify) neigh_update_notify(neigh, nlmsg_pid); -- cgit v1.2.1 From 758a7f0b32ab890831d321145c3fe72bb85c0350 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:22 -0700 Subject: neighbor: Fix state check in neigh_forced_gc PERMANENT entries are not on the gc_list so the state check is now redundant. Also, the move to not purge entries until after 5 seconds should not apply to FAILED entries; those can be removed immediately to make way for newer ones. This restores the previous logic prior to the gc_list. Fixes: 58956317c8de ("neighbor: Improve garbage collection") Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/neighbour.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 010784123bc1..acaa1a64150d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -204,7 +204,6 @@ static int neigh_forced_gc(struct neigh_table *tbl) unsigned long tref = jiffies - 5 * HZ; u8 flags = NTF_EXT_LEARNED; struct neighbour *n, *tmp; - u8 state = NUD_PERMANENT; int shrunk = 0; @@ -216,8 +215,8 @@ static int neigh_forced_gc(struct neigh_table *tbl) bool remove = false; write_lock(&n->lock); - if (!(n->nud_state & state) && !(n->flags & flags) && - time_after(tref, n->updated)) + if ((n->nud_state == NUD_FAILED) || + (!(n->flags & flags) && time_after(tref, n->updated))) remove = true; write_unlock(&n->lock); -- cgit v1.2.1 From 7e6f182bec7debb420a2c12ae0ea1813645a7ac4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:23 -0700 Subject: neighbor: Remove state and flags arguments to neigh_del neigh_del now has only one caller, and the state and flags arguments are both 0. Remove them and simplify neigh_del. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index acaa1a64150d..bb6f9ca7a3ce 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -153,14 +153,13 @@ static void neigh_update_gc_list(struct neighbour *n) write_unlock_bh(&n->tbl->lock); } -static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, - struct neighbour __rcu **np, struct neigh_table *tbl) +static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, + struct neigh_table *tbl) { bool retval = false; write_lock(&n->lock); - if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) && - !(n->flags & flags)) { + if (refcount_read(&n->refcnt) == 1) { struct neighbour *neigh; neigh = rcu_dereference_protected(n->next, @@ -192,7 +191,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock)))) { if (n == ndel) - return neigh_del(n, 0, 0, np, tbl); + return neigh_del(n, np, tbl); np = &n->next; } return false; -- cgit v1.2.1 From 526f1b587cf826d78c3e522428ce6b24a8da0d65 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:24 -0700 Subject: neighbor: Move neigh_update_ext_learned to core file neigh_update_ext_learned has one caller in neighbour.c so does not need to be defined in the header. Move it, and in the process remove the initialization of ndm_flags and just set it based on the flags check. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bb6f9ca7a3ce..2401040f799b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -153,6 +153,24 @@ static void neigh_update_gc_list(struct neighbour *n) write_unlock_bh(&n->tbl->lock); } +static void neigh_update_ext_learned(struct neighbour *neigh, u32 flags, + int *notify) +{ + u8 ndm_flags; + + if (!(flags & NEIGH_UPDATE_F_ADMIN)) + return; + + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ?
NTF_EXT_LEARNED : 0; if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; *notify = 1; } } static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, -- cgit v1.2.1 From e997f8a20a57cae16ed0c7a2bff6d3ab75f58123 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 11 Dec 2018 18:57:25 -0700 Subject: neighbor: Remove externally learned entries from gc_list Externally learned entries are similar to PERMANENT entries in the sense they are managed by userspace and cannot be garbage collected. As such remove them from the gc_list, remove the flags check from neigh_forced_gc and skip threshold checks in neigh_alloc. As with PERMANENT entries, this allows an unlimited number of NTF_EXT_LEARNED entries. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2401040f799b..42b413774370 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -129,21 +129,22 @@ static void neigh_mark_dead(struct neighbour *n) static void neigh_update_gc_list(struct neighbour *n) { - bool on_gc_list, new_is_perm; + bool on_gc_list, exempt_from_gc; write_lock_bh(&n->tbl->lock); write_lock(&n->lock); - /* remove from the gc list if new state is permanent; - * add to the gc list if new state is not permanent + /* remove from the gc list if new state is permanent or if neighbor + * is externally learned; otherwise entry should be on the gc list */ - new_is_perm = n->nud_state & NUD_PERMANENT; + exempt_from_gc = n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_LEARNED; on_gc_list = !list_empty(&n->gc_list); - if (new_is_perm && on_gc_list) { + if (exempt_from_gc && on_gc_list) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); - } else if (!new_is_perm && !on_gc_list) { + } else if (!exempt_from_gc && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); @@ -153,13 +154,14 @@ static void neigh_update_gc_list(struct neighbour *n) write_unlock_bh(&n->tbl->lock); } -static void neigh_update_ext_learned(struct neighbour *neigh, u32 flags, +static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags, int *notify) { + bool rc = false; u8 ndm_flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) - return; + return rc; ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ?
NTF_EXT_LEARNED : 0; if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { @@ -167,8 +169,11 @@ static void neigh_update_ext_learned(struct neighbour *neigh, u32 flags, neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; + rc = true; *notify = 1; } + + return rc; } static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, @@ -219,7 +224,6 @@ static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; unsigned long tref = jiffies - 5 * HZ; - u8 flags = NTF_EXT_LEARNED; struct neighbour *n, *tmp; int shrunk = 0; @@ -233,7 +237,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || - (!(n->flags & flags) && time_after(tref, n->updated))) + time_after(tref, n->updated)) remove = true; write_unlock(&n->lock); @@ -371,13 +375,13 @@ EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, - bool permanent) + bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; - if (permanent) + if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; @@ -419,7 +423,7 @@ out: return n; out_entries: - if (!permanent) + if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); goto out; } @@ -566,9 +570,9 @@ EXPORT_SYMBOL(neigh_lookup_nodev); static struct neighbour *___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, - bool permanent, bool want_ref) + bool exempt_from_gc, bool want_ref) { - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, permanent); + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); u32 hash_val; unsigned int key_len = tbl->key_len; int error; @@ -634,7 +638,7 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, } n->dead = 0; - if (!permanent) + if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); if (want_ref) @@ -1210,6 +1214,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { + bool ext_learn_change = false; u8 old; int err; int notify = 0; @@ -1230,7 +1235,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, goto out; } - neigh_update_ext_learned(neigh, flags, &notify); + ext_learn_change = neigh_update_ext_learned(neigh, flags, &notify); if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1376,7 +1381,7 @@ out: neigh_update_is_router(neigh, flags, &notify); write_unlock_bh(&neigh->lock); - if ((new ^ old) & NUD_PERMANENT) + if (((new ^ old) & NUD_PERMANENT) || ext_learn_change) neigh_update_gc_list(neigh); if (notify) @@ -1881,14 +1886,16 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { + bool exempt_from_gc; + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } + exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || + ndm->ndm_flags & NTF_EXT_LEARNED; + neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; -- cgit v1.2.1 From aaa5d90b395a72faff797b00d815165ee0e664c0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 14 Dec 2018 11:51:58 +0100 Subject: net: use indirect call wrappers at GRO network layer This avoids indirect calls in the L3 GRO receive path, both for ipv4 and ipv6, if the latter is not
compiled as a module. Note that when IPv6 is compiled as builtin, it will be checked first, so we have a single additional compare for the more common path. v1 -> v2: - adapted to INDIRECT_CALL_ changes Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/dev.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ed9aa4a91f1f..1b5a4410be0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -5338,6 +5339,8 @@ static void flush_all_backlogs(void) put_online_cpus(); } +INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); static int napi_gro_complete(struct sk_buff *skb) { struct packet_offload *ptype; @@ -5357,7 +5360,9 @@ static int napi_gro_complete(struct sk_buff *skb) if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->callbacks.gro_complete(skb, 0); + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); break; } rcu_read_unlock(); @@ -5504,6 +5509,10 @@ static void gro_flush_oldest(struct list_head *head) napi_gro_complete(oldest); } +INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, + struct sk_buff *)); static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); @@ -5553,7 +5562,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->csum_valid = 0; } - pp = ptype->callbacks.gro_receive(gro_head, skb); + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + gro_head, skb); break; } rcu_read_unlock(); -- cgit v1.2.1 From df9b0e30d44c901ac27c0f38cd54511b3f130c6d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 15 Dec 2018 14:09:06 -0800 Subject: neighbor: Add protocol attribute Similar to routes and rules, add protocol attribute to neighbor entries for easier tracking of how each was created. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 42b413774370..fb4372cb1de1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1828,6 +1828,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; + u8 protocol = 0; int err; ASSERT_RTNL(); @@ -1867,6 +1868,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? 
nla_data(tb[NDA_LLADDR]) : NULL; + if (tb[NDA_PROTOCOL]) { + if (nla_len(tb[NDA_PROTOCOL]) != sizeof(u8)) { + NL_SET_ERR_MSG(extack, "Invalid protocol attribute"); + goto out; + } + protocol = nla_get_u8(tb[NDA_PROTOCOL]); + } + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -1874,6 +1883,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm->ndm_flags; + if (protocol) + pn->protocol = protocol; err = 0; } goto out; @@ -1924,6 +1935,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } else err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); + + if (protocol) + neigh->protocol = protocol; + neigh_release(neigh); out: @@ -2417,6 +2432,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; + if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -2448,6 +2466,9 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; + if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -3103,7 +3124,8 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) - + nla_total_size(4); /* NDA_PROBES */ + + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(1); /* NDA_PROTOCOL */ } static void __neigh_notify(struct neighbour *n, int type, int flags, -- cgit v1.2.1 From 5b2f94b27622d5b92d1cebf4bb5a627db4444607 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sat, 15 Dec 2018 22:35:08 -0800 Subject: net: rtnetlink: support for fdb get This patch adds support for fdb get similar to route get. arguments can be any of the following (similar to fdb add/del/dump): [bridge, mac, vlan] or [bridge_port, mac, vlan, flags=[NTF_MASTER]] or [dev, mac, [vni|vlan], flags=[NTF_SELF]] Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f8bdb8adab2c..baf2685b4da2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3460,6 +3460,18 @@ void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, new_nsid, new_ifindex); } +static const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, + [NDA_PROBES] = { .type = NLA_U32 }, + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_PORT] = { .type = NLA_U16 }, + [NDA_VNI] = { .type = NLA_U32 }, + [NDA_IFINDEX] = { .type = NLA_U32 }, + [NDA_MASTER] = { .type = NLA_U32 }, +}; + static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, @@ -4021,6 +4033,160 @@ out: return skb->len; } +static int valid_fdb_get_strict(const struct nlmsghdr *nlh, + struct nlattr **tb, u8 *ndm_flags, + int *br_idx, int *brport_idx, u8 **addr, + u16 *vid, struct netlink_ext_ack *extack) +{ + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_MASTER: + *br_idx = nla_get_u32(tb[i]); + break; + case NDA_LLADDR: + if (nla_len(tb[i]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); + return -EINVAL; + } + *addr = nla_data(tb[i]); + break; + case NDA_VLAN: + err = fdb_vid_parse(tb[i], vid, extack); + if (err) + return err; + break; + case NDA_VNI: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = NULL, *br_dev = NULL; + const struct net_device_ops *ops = NULL; + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct sk_buff *skb; + int brport_idx = 0; + u8 ndm_flags = 0; + int br_idx = 0; + u8 *addr = NULL; + u16 vid = 0; + int err; + + err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, + &brport_idx, &addr, &vid, extack); + if (err < 0) + return err; + + if (brport_idx) { + dev = __dev_get_by_index(net, brport_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (br_idx) { + if (dev) { + NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); + return -EINVAL; + } + + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Invalid master ifindex"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } + + if (dev) { + if (!ndm_flags || 
(ndm_flags & NTF_MASTER)) { + if (!(dev->priv_flags & IFF_BRIDGE_PORT)) { + NL_SET_ERR_MSG(extack, "Device is not a bridge port"); + return -EINVAL; + } + br_dev = netdev_master_upper_dev_get(dev); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Master of device not found"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } else { + if (!(ndm_flags & NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); + return -EINVAL; + } + ops = dev->netdev_ops; + } + } + + if (!br_dev && !dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -ENODEV; + } + + if (!ops || !ops->ndo_fdb_get) { + NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); + return -EOPNOTSUPP; + } + + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (br_dev) + dev = br_dev; + err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, extack); + if (err) + goto out; + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + kfree_skb(skb); + return err; +} + static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { @@ -5081,7 +5247,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0); + rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); -- cgit v1.2.1 From 3bdbd0228e7555ec745e08469b98e5a0966409d6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 16 Dec 2018 15:47:04 -0800 Subject: bpf: sockmap, metadata support for reporting size of msg This adds metadata to sk_msg_md for BPF programs to read the sk_msg size. When the SK_MSG program is running under an application that is using sendfile the data is not copied into sk_msg buffers by default. Rather the BPF program uses sk_msg_pull_data to read the bytes in. This avoids doing the costly memcopy instructions when they are not in fact needed. However, if we don't know the size of the sk_msg we have to guess if needed bytes are available by doing a pull request which may fail. By including the size of the sk_msg BPF programs can check the size before issuing sk_msg_pull_data requests. Additionally, the same applies for sendmsg calls when the application provides multiple iovs. Here the BPF program needs to pull in data to update data pointers but its not clear where the data ends without a size parameter. In many cases "guessing" is not easy to do and results in multiple calls to pull and without bounded loops everything gets fairly tricky. Clean this up by including a u32 size field. Note, all writes into sk_msg_md are rejected already from sk_msg_is_valid_access so nothing additional is needed there. 
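For illustration, an SK_MSG program can now bound its pull requests up front; a minimal sketch (the 16-byte header length is hypothetical, and the pull is done through the bpf_msg_pull_data() helper the message refers to):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_msg")
int msg_prog(struct sk_msg_md *msg)
{
	const __u32 hdr_len = 16;	/* hypothetical: bytes the parser wants */

	/* msg->size reports the total message length, so the program can
	 * check that the bytes exist before issuing a pull that may fail.
	 */
	if (msg->size < hdr_len)
		return SK_DROP;

	if (bpf_msg_pull_data(msg, 0, hdr_len, 0))
		return SK_PASS;	/* pull failed; leave the data untouched */

	/* msg->data .. msg->data_end now cover the first hdr_len bytes */
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";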
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f9348806e843..3a3b21726fb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7530,6 +7530,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct sk_msg_md, size): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_sg, size)); + break; } return insn - insn_buf; -- cgit v1.2.1 From df5042f4c5b9326c593bf2e31ed859ebc3b4130a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:16 +0100 Subject: sk_buff: add skb extension infrastructure This adds an optional extension infrastructure, with ipsec (xfrm) and bridge netfilter as first users. objdiff shows no changes if kernel is built without xfrm and br_netfilter support. The third (planned future) user is Multipath TCP which is still out-of-tree. MPTCP needs to map logical mptcp sequence numbers to the tcp sequence numbers used by individual subflows. This DSS mapping is read/written from tcp option space on receive and written to tcp option space on transmitted tcp packets that are part of an MPTCP connection. Extending skb_shared_info or adding a private data field to skb fclones doesn't work for incoming skb, so a different DSS propagation method would be required for the receive side. mptcp has the same requirements as secpath/bridge netfilter: 1. extension memory is released when the sk_buff is free'd. 2. data is shared after cloning an skb (clone inherits extension) 3. adding an extension to an skb will COW the extension buffer if needed. The "MPTCP upstreaming" effort adds SKB_EXT_MPTCP extension to store the mapping for tx and rx processing. Two new members are added to sk_buff: 1. 'active_extensions' byte (filling a hole), telling which extensions are available for this skb. This has two purposes. a) avoids the need to initialize the pointer. b) allows "deleting" an extension by clearing its bit value in ->active_extensions. While it would be possible to store the active_extensions byte in the extension struct instead of sk_buff, there is one problem with this: When an extension has to be disabled, we can always clear the bit in skb->active_extensions. But in case it would be stored in the extension buffer itself, we might have to COW it first, if we are dealing with a cloned skb. On kmalloc failure we would be unable to turn an extension off. 2. extension pointer, located at the end of the sk_buff. If the active_extensions byte is 0, the pointer is undefined, it is not initialized on skb allocation. This adds extra code to skb clone and free paths (to deal with refcount/free of extension area) but this replaces similar code that manages skb->nf_bridge and skb->sp structs in the followup patches of the series. It is possible to add support for extensions that are not preserved on clones/copies. To do this, it would be needed to define a bitmask of all extensions that need copy/cow semantics, and change __skb_ext_copy() to check ->active_extensions & SKB_EXT_PRESERVE_ON_CLONE, then just set ->active_extensions to 0 on the new clone. This isn't done here because all extensions that get added here need the copy/cow semantics. v2: Allocate entire extension space using kmem_cache.
Upside is that this allows better tracking of used memory, downside is that we will allocate more space than strictly needed in most cases (it's unlikely that all extensions are active/needed at the same time for the same skb). The allocated memory (except the small extension header) is not cleared, so no additional overhead aside from memory usage. Avoid the atomic_dec_and_test operation on skb_ext_put() by using a similar trick as kfree_skbmem() does with fclone_ref: If refcount is 1, there is no concurrent user and we can free right away. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/skbuff.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 40552547c69a..d2dfad33e686 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,9 @@ struct kmem_cache *skbuff_head_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; +#ifdef CONFIG_SKB_EXTENSIONS +static struct kmem_cache *skbuff_ext_cache __ro_after_init; +#endif int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -617,6 +620,7 @@ void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); #endif + skb_ext_put(skb); } /* Free everything but the sk_buff shell. */ @@ -796,6 +800,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); + __skb_ext_copy(new, old); #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif @@ -3902,6 +3907,40 @@ done: } EXPORT_SYMBOL_GPL(skb_gro_receive); +#ifdef CONFIG_SKB_EXTENSIONS +#define SKB_EXT_ALIGN_VALUE 8 +#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) + +static const u8 skb_ext_type_len[] = { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), +#endif +}; + +static __always_inline unsigned int skb_ext_total_length(void) +{ + return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif + 0; } + +static void skb_extensions_init(void) +{ + BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(skb_ext_total_length() > 255); + + skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", + SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} +#else +static void skb_extensions_init(void) {} +#endif + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", @@ -3916,6 +3955,7 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + skb_extensions_init(); } static int @@ -5554,3 +5594,118 @@ void skb_condense(struct sk_buff *skb) */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } + +#ifdef CONFIG_SKB_EXTENSIONS +static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) +{ + return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); +} + +static struct skb_ext *skb_ext_alloc(void) +{ + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + + if (new) { + memset(new->offset, 0, sizeof(new->offset)); + refcount_set(&new->refcnt, 1); + } + + return new; +} + +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) +{ + struct skb_ext *new; + + if (refcount_read(&old->refcnt) == 1) + return old; + + new =
kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); + refcount_set(&new->refcnt, 1); + + __skb_ext_put(old); + return new; +} + +/** + * skb_ext_add - allocate space for given extension, COW if needed + * @skb: buffer + * @id: extension to allocate space for + * + * Allocates enough space for the given extension. + * If the extension is already present, a pointer to that extension + * is returned. + * + * If the skb was cloned, COW applies and the returned memory can be + * modified without changing the extension space of clones buffers. + * + * Returns pointer to the extension or NULL on allocation failure. + */ +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *new, *old = NULL; + unsigned int newlen, newoff; + + if (skb->active_extensions) { + old = skb->extensions; + + new = skb_ext_maybe_cow(old); + if (!new) + return NULL; + + if (__skb_ext_exist(old, id)) { + if (old != new) + skb->extensions = new; + goto set_active; + } + + newoff = old->chunks; + } else { + newoff = SKB_EXT_CHUNKSIZEOF(*new); + + new = skb_ext_alloc(); + if (!new) + return NULL; + } + + newlen = newoff + skb_ext_type_len[id]; + new->chunks = newlen; + new->offset[id] = newoff; + skb->extensions = new; +set_active: + skb->active_extensions |= 1 << id; + return skb_ext_get_ptr(new, id); +} +EXPORT_SYMBOL(skb_ext_add); + +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *ext = skb->extensions; + + skb->active_extensions &= ~(1 << id); + if (skb->active_extensions == 0) { + skb->extensions = NULL; + __skb_ext_put(ext); + } +} +EXPORT_SYMBOL(__skb_ext_del); + +void __skb_ext_put(struct skb_ext *ext) +{ + /* If this is last clone, nothing can increment + * it after check passes. Avoids one atomic op. + */ + if (refcount_read(&ext->refcnt) == 1) + goto free_now; + + if (!refcount_dec_and_test(&ext->refcnt)) + return; +free_now: + kmem_cache_free(skbuff_ext_cache, ext); +} +EXPORT_SYMBOL(__skb_ext_put); +#endif /* CONFIG_SKB_EXTENSIONS */ -- cgit v1.2.1 From de8bda1d22d38b7d5cd08b33f86efd94d4c86630 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:17 +0100 Subject: net: convert bridge_nf to use skb extension infrastructure This converts the bridge netfilter (calling iptables hooks from bridge) facility to use the extension infrastructure. The bridge_nf specific hooks in skb clone and free paths are removed, they have been replaced by the skb_ext hooks that do the same as the bridge nf allocations hooks did. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d2dfad33e686..0c65723591d7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -616,9 +616,6 @@ void skb_release_head_state(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); #endif skb_ext_put(skb); } -- cgit v1.2.1 From 4165079ba328dd47262a2183049d3591f0a750b1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:27 +0100 Subject: net: switch secpath to use skb extension infrastructure Remove skb->sp and allocate secpath storage via extension infrastructure. This also reduces sk_buff by 8 bytes on x86_64. 
Total size of allyesconfig kernel is reduced slightly, as there is less inlined code (one conditional atomic op instead of two on skb_clone). No differences in throughput in following ipsec performance tests: - transport mode with aes on 10GB link - tunnel mode between two network namespaces with aes and null cipher Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/skbuff.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0c65723591d7..cb0bf4215745 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -609,7 +609,6 @@ fastpath: void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -798,9 +797,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); __skb_ext_copy(new, old); -#ifdef CONFIG_XFRM - new->sp = secpath_get(old->sp); -#endif __nf_copy(new, old, false); /* Note : this field could be in headers_start/headers_end section @@ -3912,6 +3908,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), #endif +#ifdef CONFIG_XFRM + [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -3919,6 +3918,9 @@ static __always_inline unsigned int skb_ext_total_length(void) return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif +#ifdef CONFIG_XFRM + skb_ext_type_len[SKB_EXT_SEC_PATH] + #endif 0; } @@ -5610,7 +5612,8 @@ static struct skb_ext *skb_ext_alloc(void) return new; } -static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, + unsigned int old_active) { struct skb_ext *new; @@ -5624,6 +5627,15 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); refcount_set(&new->refcnt, 1); +#ifdef CONFIG_XFRM + if (old_active & (1 << SKB_EXT_SEC_PATH)) { + struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->xvec[i]); + } +#endif __skb_ext_put(old); return new; } @@ -5650,7 +5662,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) if (skb->active_extensions) { old = skb->extensions; - new = skb_ext_maybe_cow(old); + new = skb_ext_maybe_cow(old, skb->active_extensions); if (!new) return NULL; @@ -5679,6 +5691,16 @@ set_active: } EXPORT_SYMBOL(skb_ext_add); +#ifdef CONFIG_XFRM +static void skb_ext_put_sp(struct sec_path *sp) +{ + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->xvec[i]); +} +#endif + void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *ext = skb->extensions; @@ -5687,6 +5709,14 @@ void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) if (skb->active_extensions == 0) { skb->extensions = NULL; __skb_ext_put(ext); +#ifdef CONFIG_XFRM + } else if (id == SKB_EXT_SEC_PATH && + refcount_read(&ext->refcnt) == 1) { + struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); + + skb_ext_put_sp(sp); + sp->len = 0; +#endif } } EXPORT_SYMBOL(__skb_ext_del); @@ -5702,6 +5732,11 @@ void __skb_ext_put(struct skb_ext *ext) if 
(!refcount_dec_and_test(&ext->refcnt)) return; free_now: +#ifdef CONFIG_XFRM + if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) + skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); +#endif + kmem_cache_free(skbuff_ext_cache, ext); } EXPORT_SYMBOL(__skb_ext_put); -- cgit v1.2.1 From 82cbb5c631a07b3aa6df6eab644d55da9de5a645 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 19 Dec 2018 12:51:38 -0800 Subject: neighbour: register rtnl doit handler this patch registers neigh doit handler. The doit handler returns a neigh entry given dst and dev. This is similar to route and fdb doit (get) handlers. Also moves nda_policy declaration from rtnetlink.c to neighbour.c Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++++--- net/core/rtnetlink.c | 12 --- 2 files changed, 193 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index fb4372cb1de1..43687c9abe1d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1751,6 +1751,18 @@ static struct neigh_table *neigh_find_table(int family) return tbl; } +const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, + [NDA_PROBES] = { .type = NLA_U32 }, + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_PORT] = { .type = NLA_U16 }, + [NDA_VNI] = { .type = NLA_U32 }, + [NDA_IFINDEX] = { .type = NLA_U32 }, + [NDA_MASTER] = { .type = NLA_U32 }, +}; + static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2711,6 +2723,186 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int neigh_valid_get_req(const struct nlmsghdr *nlh, + struct neigh_table **tbl, + void **dst, int *dev_idx, u8 *ndm_flags, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *dev_idx = ndm->ndm_ifindex; + *tbl = neigh_find_table(ndm->ndm_family); + if (*tbl == NULL) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + return -EAFNOSUPPORT; + } + + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_DST: + if (nla_len(tb[i]) != (int)(*tbl)->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + return -EINVAL; + } + *dst = nla_data(tb[i]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); + return -EINVAL; + } + } + + return 0; +} + +static inline size_t neigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* 
NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int neigh_get_reply(struct net *net, struct neighbour *neigh, + u32 pid, u32 seq) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static inline size_t pneigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN); /* NDA_DST */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, + u32 pid, u32 seq, struct neigh_table *tbl) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct net_device *dev = NULL; + struct neigh_table *tbl = NULL; + struct neighbour *neigh; + void *dst = NULL; + u8 ndm_flags = 0; + int dev_idx = 0; + int err; + + err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, + extack); + if (err < 0) + return err; + + if (dev_idx) { + dev = __dev_get_by_index(net, dev_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (!dst) { + NL_SET_ERR_MSG(extack, "Network address not specified"); + return -EINVAL; + } + + if (ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + pn = pneigh_lookup(tbl, net, dst, dev, 0); + if (!pn) { + NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); + return -ENOENT; + } + return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, tbl); + } + + if (!dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -EINVAL; + } + + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + return -ENOENT; + } + + err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq); + + neigh_release(neigh); + + return err; +} + void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; @@ -3118,16 +3310,6 @@ static const struct seq_operations neigh_stat_seq_ops = { }; #endif /* CONFIG_PROC_FS */ -static inline size_t neigh_nlmsg_size(void) -{ - return NLMSG_ALIGN(sizeof(struct ndmsg)) - + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ - + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(struct nda_cacheinfo)) - + nla_total_size(4) /* NDA_PROBES */ - + nla_total_size(1); /* NDA_PROTOCOL */ -} - static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { @@ -3511,7 +3693,7 @@ static int __init neigh_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0); diff --git a/net/core/rtnetlink.c 
b/net/core/rtnetlink.c index baf2685b4da2..48f61885fd6f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3460,18 +3460,6 @@ void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, new_nsid, new_ifindex); } -static const struct nla_policy nda_policy[NDA_MAX+1] = { - [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, - [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, - [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, - [NDA_PROBES] = { .type = NLA_U32 }, - [NDA_VLAN] = { .type = NLA_U16 }, - [NDA_PORT] = { .type = NLA_U16 }, - [NDA_VNI] = { .type = NLA_U32 }, - [NDA_IFINDEX] = { .type = NLA_U32 }, - [NDA_MASTER] = { .type = NLA_U32 }, -}; - static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, -- cgit v1.2.1 From 754d5da63145852736f34cfc762164f5d8d6537b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Dec 2018 15:53:22 -0800 Subject: neighbor: Initialize protocol when new pneigh_entry are created pneigh_lookup uses kmalloc rather than kzalloc when new entries are allocated, so the newly added protocol field needs to be initialized explicitly. Fixes: df9b0e30d44c ("neighbor: Add protocol attribute") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 43687c9abe1d..d9fa101b0e41 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -725,6 +725,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (!n) goto out; + n->protocol = 0; write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; -- cgit v1.2.1 From a9cd3439e3c6d777a05c63b4d06c3500d1d4074e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Dec 2018 20:02:36 -0800 Subject: neighbor: Use nda_policy for validating attributes in adds and dump requests Add NDA_PROTOCOL to nda_policy and use the policy for attribute parsing and validation for adding neighbors and in dump requests. Remove the now duplicate checks on nla_len. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d9fa101b0e41..8baa9ab01db6 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1762,6 +1762,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_VNI] = { .type = NLA_U32 }, [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, + [NDA_PROTOCOL] = { .type = NLA_U8 }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1845,7 +1846,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, int err; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; @@ -1881,13 +1882,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ?
nla_data(tb[NDA_LLADDR]) : NULL; - if (tb[NDA_PROTOCOL]) { - if (nla_len(tb[NDA_PROTOCOL]) != sizeof(u8)) { - NL_SET_ERR_MSG(extack, "Invalid protocol attribute"); - goto out; - } + if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); - } if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -2639,10 +2635,10 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, } err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - NULL, extack); + nda_policy, extack); } else { err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - NULL, extack); + nda_policy, extack); } if (err < 0) return err; @@ -2654,17 +2650,9 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: - if (nla_len(tb[i]) != sizeof(u32)) { - NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request"); - return -EINVAL; - } filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: - if (nla_len(tb[i]) != sizeof(u32)) { - NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request"); - return -EINVAL; - } filter->master_idx = nla_get_u32(tb[i]); break; default: -- cgit v1.2.1 From bc1b4f013b5029b4c8b63fb9ba8d084119486d7b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:30 -0800 Subject: bpf: sk_msg, improve offset chk in _is_valid_access The check for max offset in sk_msg_is_valid_access uses sizeof() which is incorrect because it would allow accessing possibly past the end of the struct in the padded case. Further, it doesn't preclude accessing any padding that may be added in the middle of a struct. All told this makes it fragile to rely on. To fix this explicitly check offsets with fields using the bpf_ctx_range() and bpf_ctx_range_till() macros. For reference the current structure layout looks as follows (reported by pahole) struct sk_msg_md { union { void * data; /* 8 */ }; /* 0 8 */ union { void * data_end; /* 8 */ }; /* 8 8 */ __u32 family; /* 16 4 */ __u32 remote_ip4; /* 20 4 */ __u32 local_ip4; /* 24 4 */ __u32 remote_ip6[4]; /* 28 16 */ __u32 local_ip6[4]; /* 44 16 */ __u32 remote_port; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ __u32 local_port; /* 64 4 */ __u32 size; /* 68 4 */ /* size: 72, cachelines: 2, members: 10 */ /* last cacheline: 8 bytes */ }; So there should be no padding at the moment but fixing this now prevents future errors. 
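For reference, the two range macros used by the fix expand to GCC case ranges covering every byte of the named field(s). A simplified sketch of their definitions (adapted from include/linux/filter.h):

#define bpf_ctx_range(TYPE, MEMBER)					\
	offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2)			\
	offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1

A case label such as bpf_ctx_range(struct sk_msg_md, family) therefore matches only offsets inside the family field, so any padding bytes fall through to the default case and are rejected.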
Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/filter.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 3a3b21726fb5..6bd9f08f6162 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6313,6 +6313,9 @@ static bool sk_msg_is_valid_access(int off, int size, if (type == BPF_WRITE) return false; + if (off % size != 0) + return false; + switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; @@ -6324,16 +6327,20 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; - default: + case bpf_ctx_range(struct sk_msg_md, family): + case bpf_ctx_range(struct sk_msg_md, remote_ip4): + case bpf_ctx_range(struct sk_msg_md, local_ip4): + case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct sk_msg_md, remote_port): + case bpf_ctx_range(struct sk_msg_md, local_port): + case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; - } - - if (off < 0 || off >= sizeof(struct sk_msg_md)) - return false; - if (off % size != 0) + break; + default: return false; - + } return true; } -- cgit v1.2.1 From 7a69c0f250568e6ab72f401b2c69aa0e666c94f2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:31 -0800 Subject: bpf: skmsg, replace comments with BUILD bug Enforce comment on structure layout dependency with a BUILD_BUG_ON to ensure the condition is maintained. Suggested-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 6bd9f08f6162..447dd1bad31f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7425,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, int off; #endif + /* convert ctx uses the fact sg element is first in struct */ + BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); + switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), -- cgit v1.2.1 From 51199405f967207de372d9b60989eb87d7ae8809 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:32 -0800 Subject: bpf: skb_verdict, support SK_PASS on RX BPF path Add SK_PASS verdict support to SK_SKB_VERDICT programs. Now that support for redirects exists we can implement SK_PASS as a redirect to the same socket. This simplifies the BPF programs and avoids an extra map lookup on RX path for simple visibility cases. Further, reduces user (BPF programmer in this context) confusion when their program drops skb due to lack of support. 
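A minimal sketch of the pattern this enables (hypothetical program name, assuming the usual BPF helper headers): a visibility-only verdict program can now simply return SK_PASS instead of maintaining a sockmap entry and redirecting to itself.

SEC("sk_skb/stream_verdict")
int prog_skb_verdict(struct __sk_buff *skb)
{
	/* Observe the skb (e.g. count it) and let it through. */
	return SK_PASS;
}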
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 56a99d0c9aa0..8a91a460de8f 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -669,6 +669,22 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, bool ingress; switch (verdict) { + case __SK_PASS: + sk_other = psock->sk; + if (sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + goto out_free; + } + if (atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf) { + struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); + + tcp->bpf.flags |= BPF_F_INGRESS; + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + break; + } + goto out_free; case __SK_REDIRECT: sk_other = tcp_skb_bpf_redirect_fetch(skb); if (unlikely(!sk_other)) -- cgit v1.2.1 From 552de91068828daef50a227a665068cf8dde835e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:33 -0800 Subject: bpf: sk_msg, fix socket data_ready events When an skb verdict program is in use and either another BPF program redirects to that socket or the new SK_PASS support is used, the data_ready callback does not wake up the application. Instead, because the stream parser/verdict is using the sk data_ready callback, we wake up the stream parser/verdict block. Fix this by adding a helper that checks whether the stream parser block is enabled on the sk and, if so, calls the saved pointer, which is the upper layer's wake-up function. This fixes application stalls observed when an application is waiting for data in a blocking read(). Fixes: d829e9c4112b ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8a91a460de8f..3df7627db4bb 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -403,7 +403,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) msg->skb = skb; sk_psock_queue_msg(psock, msg); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); return copied; } @@ -751,7 +751,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) } /* Called with socket lock held. */ -static void sk_psock_data_ready(struct sock *sk) +static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; @@ -799,7 +799,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) return; parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; } -- cgit v1.2.1 From a136678c0bdbb650daff5df5eec1dab960e074a7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:34 -0800 Subject: bpf: sk_msg, zap ingress queue on psock down In addition to releasing any cork'ed data on a psock when the psock is removed, we should also release any skb's in the ingress work queue. Otherwise the skb's eventually get free'd, but late in the tear-down process, so we see the WARNING due to non-zero sk_forward_alloc. void sk_stream_kill_queues(struct sock *sk) { ... WARN_ON(sk->sk_forward_alloc); ...
} Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3df7627db4bb..86c9726fced8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -572,6 +572,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { rcu_assign_sk_user_data(sk, NULL); sk_psock_cork_free(psock); + sk_psock_zap_ingress(psock); sk_psock_restore_proto(sk, psock); write_lock_bh(&sk->sk_callback_lock); -- cgit v1.2.1 From 463561e6b9facf93ef90b65d2c75a80c7262d778 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 20 Dec 2018 16:50:50 +0000 Subject: neighbour: remove stray semicolon Currently the stray semicolon means that the final term in the addition is being missed. Fix this by removing it. Cleans up clang warning: net/core/neighbour.c:2821:9: warning: expression result unused [-Wunused-value] Fixes: 82cbb5c631a0 ("neighbour: register rtnl doit handler") Signed-off-by: Colin Ian King Acked-By: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index fa384f775f1a..763a7b08df67 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2811,7 +2811,7 @@ errout: static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) - + nla_total_size(MAX_ADDR_LEN); /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(1); /* NDA_PROTOCOL */ } -- cgit v1.2.1 From e94e50bd88f7ed2f2d40c32c06efd61c36c33ec8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Dec 2018 19:03:13 +0100 Subject: net: fix possible use-after-free in skb_ext_add() On cow we can free the old extension: we must avoid dereferencing such an extension after skb_ext_maybe_cow(). Since 'new' contents are always equal to 'old' after the copy, we can fix the above by accessing the relevant data using 'new'. Fixes: df5042f4c5b9 ("sk_buff: add skb extension infrastructure") Signed-off-by: Paolo Abeni Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cb0bf4215745..e1d88762f659 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5666,13 +5666,13 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) if (!new) return NULL; - if (__skb_ext_exist(old, id)) { + if (__skb_ext_exist(new, id)) { if (old != new) skb->extensions = new; goto set_active; } - newoff = old->chunks; + newoff = new->chunks; } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); -- cgit v1.2.1 From 682ec859518d73435cc924d816da2953343241c1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Dec 2018 19:03:15 +0100 Subject: net: minor cleanup in skb_ext_add() When the extension to be added is already present, the only skb field we may need to update is 'extensions': we can reorder the code and avoid a branch. v1 -> v2: - be sure to flag the newly added extension as active Signed-off-by: Paolo Abeni Acked-by: Florian Westphal Signed-off-by: David S.
Miller --- net/core/skbuff.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e1d88762f659..37317ffec146 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5666,11 +5666,8 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) if (!new) return NULL; - if (__skb_ext_exist(new, id)) { - if (old != new) - skb->extensions = new; + if (__skb_ext_exist(new, id)) goto set_active; - } newoff = new->chunks; } else { @@ -5684,8 +5681,8 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) newlen = newoff + skb_ext_type_len[id]; new->chunks = newlen; new->offset[id] = newoff; - skb->extensions = new; set_active: + skb->extensions = new; skb->active_extensions |= 1 << id; return skb_ext_get_ptr(new, id); } -- cgit v1.2.1 From 50d5258634aee2e62832aa086d2fb0de00e72b91 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 21 Dec 2018 14:49:01 -0600 Subject: net: core: Fix Spectre v1 vulnerability flen is indirectly controlled by user-space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. This issue was detected with the help of Smatch: net/core/filter.c:1101 bpf_check_classic() warn: potential spectre issue 'filter' [w] Fix this by sanitizing flen before using it to index filter at line 1101: switch (filter[flen - 1].code) { and through pc at line 1040: const struct sock_filter *ftest = &filter[pc]; Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 8d2c629501e2..0c74c2f9776a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,6 +73,7 @@ #include #include #include +#include <linux/nospec.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -1038,6 +1039,7 @@ static int bpf_check_classic(const struct sock_filter *filter, bool anc_found; int pc; + flen = array_index_nospec(flen, BPF_MAXINSNS + 1); /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; -- cgit v1.2.1 From f2ab95814103314af3239d322e382c61c69a788d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 23 Dec 2018 16:01:35 -0800 Subject: net: Revert recent Spectre-v1 patches. This reverts: 50d5258634ae ("net: core: Fix Spectre v1 vulnerability") d686026b1e6e ("phonet: af_phonet: Fix Spectre v1 vulnerability") a95386f0390a ("nfc: af_nfc: Fix Spectre v1 vulnerability") a3ac5817ffe8 ("can: af_can: Fix Spectre v1 vulnerability") After some discussion with Alexei Starovoitov these all seem to be completely unnecessary. Signed-off-by: David S.
Miller --- net/core/filter.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 0c74c2f9776a..8d2c629501e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,7 +73,6 @@ #include #include #include -#include <linux/nospec.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -1039,7 +1038,6 @@ static int bpf_check_classic(const struct sock_filter *filter, bool anc_found; int pc; - flen = array_index_nospec(flen, BPF_MAXINSNS + 1); /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; -- cgit v1.2.1 From 0eb987c874dc93f9c9d85a6465dbde20fdd3884c Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Sun, 23 Dec 2018 19:42:38 -0600 Subject: net/net_namespace: Check the return value of register_pernet_subsys() In net_ns_init(), register_pernet_subsys() could fail while registering network namespace subsystems. The fix checks the return value and calls panic() on failure. Signed-off-by: Aditya Pakki Reviewed-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/net_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index fefe72774aeb..af8849a7a9c3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -917,7 +917,8 @@ static int __init net_ns_init(void) init_net_initialized = true; up_write(&pernet_ops_rwsem); - register_pernet_subsys(&net_ns_ops); + if (register_pernet_subsys(&net_ns_ops)) + panic("Could not register network namespace subsystems"); rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, RTNL_FLAG_DOIT_UNLOCKED); -- cgit v1.2.1
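The check-and-panic pattern above generalizes to any boot-critical pernet registration. A hedged sketch with purely illustrative names (the example_* identifiers are not from the patch):

#include <net/net_namespace.h>

static int __net_init example_net_init(struct net *net)
{
	return 0;	/* per-namespace state would be set up here */
}

static void __net_exit example_net_exit(struct net *net)
{
	/* per-namespace state would be torn down here */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

static int __init example_init(void)
{
	/* Core code cannot run without its pernet state, so treat
	 * registration failure as fatal, as net_ns_init() now does.
	 */
	if (register_pernet_subsys(&example_net_ops))
		panic("Could not register example pernet subsystem");
	return 0;
}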