diff options
Diffstat (limited to 'net/ipv4')
57 files changed, 1074 insertions, 580 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index aff93e7cdb31..52bdb881a506 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -58,11 +59,6 @@ * Some other random speedups. * Cyrus Durgin : Cleaned up file for kmod hacks. * Andi Kleen : Fix inet_stream_connect TCP race. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "IPv4: " fmt diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 850a6f13a082..05eb42f347e8 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* linux/net/ipv4/arp.c * * Copyright (C) 1994 by Florian La Roche @@ -7,11 +8,6 @@ * high-level addresses) into a low-level hardware address (like an Ethernet * address). * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Fixes: * Alan Cox : Removed the Ethernet assumptions in * Florian's code diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 300921417f89..7bd29e694603 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET implementation * * Authors: * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/types.h> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 701c5d113a34..914ccc7f192a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 IP device support routines. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Derived from the IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -194,7 +190,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); @@ -300,8 +297,8 @@ static void in_dev_rcu_put(struct rcu_head *head) static void inetdev_destroy(struct in_device *in_dev) { - struct in_ifaddr *ifa; struct net_device *dev; + struct in_ifaddr *ifa; ASSERT_RTNL(); @@ -311,7 +308,7 @@ static void inetdev_destroy(struct in_device *in_dev) ip_mc_destroy_dev(in_dev); - while ((ifa = in_dev->ifa_list) != NULL) { + while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } @@ -327,30 +324,35 @@ static void inetdev_destroy(struct in_device *in_dev) int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { + const struct in_ifaddr *ifa; + rcu_read_lock(); - for_primary_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } - } endfor_ifa(in_dev); + } rcu_read_unlock(); return 0; } -static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, - int destroy, struct nlmsghdr *nlh, u32 portid) +static void __inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, + int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; - struct in_ifaddr *ifa, *ifa1 = *ifap; - struct in_ifaddr *last_prim = in_dev->ifa_list; + struct in_ifaddr *ifa, *ifa1; + struct in_ifaddr *last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); + ifa1 = rtnl_dereference(*ifap); + last_prim = rtnl_dereference(in_dev->ifa_list); if (in_dev->dead) goto no_promotions; @@ -359,9 +361,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { - struct in_ifaddr **ifap1 = &ifa1->ifa_next; + struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; - while ((ifa = *ifap1) != NULL) { + while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = ifa; @@ -394,7 +396,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ - for (ifa = promote; ifa; ifa = ifa->ifa_next) { + for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); @@ -420,19 +422,24 @@ no_promotions: blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { - struct in_ifaddr *next_sec = promote->ifa_next; + struct in_ifaddr *next_sec; + next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { - prev_prom->ifa_next = promote->ifa_next; - promote->ifa_next = last_prim->ifa_next; - last_prim->ifa_next = promote; + struct in_ifaddr *last_sec; + + last_sec = rtnl_dereference(last_prim->ifa_next); + rcu_assign_pointer(prev_prom->ifa_next, next_sec); + rcu_assign_pointer(promote->ifa_next, last_sec); + rcu_assign_pointer(last_prim->ifa_next, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); - for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { + for (ifa = next_sec; ifa; + ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; @@ -444,7 +451,8 @@ no_promotions: inet_free_ifa(ifa1); } -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); @@ -457,9 +465,10 @@ static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { + struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, **ifap, **last_primary; struct in_validator_info ivi; + struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); @@ -472,8 +481,10 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; - for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; - ifap = &ifa1->ifa_next) { + ifap = &in_dev->ifa_list; + ifa1 = rtnl_dereference(*ifap); + + while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; @@ -489,6 +500,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } ifa->ifa_flags |= IFA_F_SECONDARY; } + + ifap = &ifa1->ifa_next; + ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validtors to weigh @@ -514,8 +528,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifap = last_primary; } - ifa->ifa_next = *ifap; - *ifap = ifa; + rcu_assign_pointer(ifa->ifa_next, *ifap); + rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); @@ -580,12 +594,14 @@ EXPORT_SYMBOL(inetdev_by_index); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { + struct in_ifaddr *ifa; + ASSERT_RTNL(); - for_primary_ifa(in_dev) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; - } endfor_ifa(in_dev); + } return NULL; } @@ -613,10 +629,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; - struct in_ifaddr *ifa, **ifap; + struct in_ifaddr *ifa; + int err = -EINVAL; ASSERT_RTNL(); @@ -633,7 +651,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) @@ -721,15 +739,19 @@ static void check_lifetime(struct work_struct *work) if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { - struct in_ifaddr **ifap; + struct in_ifaddr __rcu **ifap; + struct in_ifaddr *tmp; - for (ifap = &ifa->ifa_dev->ifa_list; - *ifap != NULL; ifap = &(*ifap)->ifa_next) { - if (*ifap == ifa) { + ifap = &ifa->ifa_dev->ifa_list; + tmp = rtnl_dereference(*ifap); + while (tmp) { + if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } + ifap = &tmp->ifa_next; + tmp = rtnl_dereference(*ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && @@ -873,13 +895,12 @@ errout: static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, **ifap; + struct in_ifaddr *ifa1; if (!ifa->ifa_local) return NULL; - for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; - ifap = &ifa1->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) @@ -974,8 +995,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; + struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; - struct in_ifaddr **ifap = NULL; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; @@ -1046,7 +1067,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. */ - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == @@ -1059,7 +1082,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; @@ -1208,7 +1232,7 @@ out: static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl(dev); - struct in_ifaddr *ifa; + const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; @@ -1218,7 +1242,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int s if (!in_dev) goto out; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (!buf) { done += size; continue; @@ -1246,17 +1270,22 @@ out: static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { - for_primary_ifa(in_dev) { + const struct in_ifaddr *ifa; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; - } endfor_ifa(in_dev); + } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { + const struct in_ifaddr *ifa; __be32 addr = 0; struct in_device *in_dev; struct net *net = dev_net(dev); @@ -1267,7 +1296,9 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (!in_dev) goto no_in_dev; - for_primary_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; if (ifa->ifa_scope > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { @@ -1276,7 +1307,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) } if (!addr) addr = ifa->ifa_local; - } endfor_ifa(in_dev); + } if (addr) goto out_unlock; @@ -1321,10 +1352,11 @@ EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { - int same = 0; + const struct in_ifaddr *ifa; __be32 addr = 0; + int same = 0; - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (!addr && (local == ifa->ifa_local || !local) && ifa->ifa_scope <= scope) { @@ -1350,7 +1382,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, same = 0; } } - } endfor_ifa(in_dev); + } return same ? addr : 0; } @@ -1424,7 +1456,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) struct in_ifaddr *ifa; int named = 0; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); @@ -1454,10 +1486,9 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { - struct in_ifaddr *ifa; + const struct in_ifaddr *ifa; - for (ifa = in_dev->ifa_list; ifa; - ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, @@ -1727,15 +1758,17 @@ static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, int ip_idx = 0; int err; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) { - if (ip_idx < s_ip_idx) + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (ip_idx < s_ip_idx) { + ip_idx++; continue; - + } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + ip_idx++; } err = 0; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 8edcfa66d1e5..2e5e377f50a1 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * IPV4 GSO/GRO offload support * Linux INET implementation @@ -5,10 +6,6 @@ * Copyright (C) 2016 secunet Security Networks AG * Author: Steffen Klassert <steffen.klassert@secunet.com> * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * * ESP GRO support */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 76055c66326a..108191667531 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -6,11 +7,6 @@ * IPv4 Forwarding Information Base: FIB frontend. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> @@ -43,6 +39,7 @@ #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> @@ -234,7 +231,9 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { - if (!dev || dev == res.fi->fib_dev) + struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); + + if (!dev || dev == nhc->nhc_dev) ret = res.type; } } @@ -321,19 +320,19 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) #ifdef CONFIG_IP_ROUTE_MULTIPATH int ret; - for (ret = 0; ret < fi->fib_nhs; ret++) { - struct fib_nh *nh = &fi->fib_nh[ret]; + for (ret = 0; ret < fib_info_num_path(fi); ret++) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); - if (nh->fib_nh_dev == dev) { + if (nhc->nhc_dev == dev) { dev_match = true; break; - } else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) { + } else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) { dev_match = true; break; } } #else - if (fi->fib_nh[0].fib_nh_dev == dev) + if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif @@ -540,14 +539,22 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { - struct in_ifaddr *ifa; - struct in_device *in_dev = __in_dev_get_rtnl(dev); + const struct in_ifaddr *ifa; + struct in_device *in_dev; + + in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return -ENODEV; + *colon = ':'; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + + rcu_read_lock(); + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; + } + rcu_read_unlock(); + if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; @@ -664,6 +671,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, + [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, @@ -801,6 +809,18 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, if (err < 0) goto errout; break; + case RTA_NH_ID: + cfg->fc_nh_id = nla_get_u32(attr); + break; + } + } + + if (cfg->fc_nh_id) { + if (cfg->fc_oif || cfg->fc_gw_family || + cfg->fc_encap || cfg->fc_mp) { + NL_SET_ERR_MSG(extack, + "Nexthop specification and nexthop id are mutually exclusive"); + return -EINVAL; } } @@ -827,6 +847,12 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + err = -EINVAL; + goto errout; + } + tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); @@ -1177,8 +1203,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) * * Scan address list to be sure that addresses are really gone. */ - - for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + rcu_read_lock(); + in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; @@ -1246,6 +1272,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) } } } + rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) @@ -1415,6 +1442,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); + struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { @@ -1429,9 +1457,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo switch (event) { case NETDEV_UP: - for_ifa(in_dev) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); - } endfor_ifa(in_dev); + } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 7945f0534db7..a68b5e21ec51 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/list.h> #include <net/ip_fib.h> +#include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index cfec3af54c8d..b43a7ba5c6a4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -8,11 +9,6 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Thomas Graf <tgraf@suug.ch> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Fixes: * Rani Assaf : local_rule cannot be deleted * Marc Boucher : routing by fwmark @@ -31,6 +27,7 @@ #include <net/route.h> #include <net/tcp.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/fib_rules.h> struct fib4_rule { @@ -145,8 +142,11 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg struct fib_result *result = (struct fib_result *) arg->result; struct net_device *dev = NULL; - if (result->fi) - dev = result->fi->fib_dev; + if (result->fi) { + struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0); + + dev = nhc->nhc_dev; + } /* do not accept result if the route does * not meet the required prefix length diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 78648072783e..2db089e10ba0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -6,11 +7,6 @@ * IPv4 Forwarding Information Base: semantics. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/uaccess.h> @@ -42,6 +38,7 @@ #include <net/sock.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> +#include <net/nexthop.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> @@ -60,18 +57,21 @@ static unsigned int fib_info_cnt; #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; +/* for_nexthops and change_nexthops only used when nexthop object + * is not set in a fib_info. The logic within can reference fib_nh. + */ #ifdef CONFIG_IP_ROUTE_MULTIPATH #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ for (nhsel = 0, nh = (fi)->fib_nh; \ - nhsel < (fi)->fib_nhs; \ + nhsel < fib_info_num_path((fi)); \ nh++, nhsel++) #define change_nexthops(fi) { \ int nhsel; struct fib_nh *nexthop_nh; \ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ - nhsel < (fi)->fib_nhs; \ + nhsel < fib_info_num_path((fi)); \ nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -232,9 +232,13 @@ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); - change_nexthops(fi) { - fib_nh_release(fi->fib_net, nexthop_nh); - } endfor_nexthops(fi); + if (fi->nh) { + nexthop_put(fi->nh); + } else { + change_nexthops(fi) { + fib_nh_release(fi->fib_net, nexthop_nh); + } endfor_nexthops(fi); + } ip_fib_metrics_put(fi->fib_metrics); @@ -260,22 +264,34 @@ void fib_release_info(struct fib_info *fi) hlist_del(&fi->fib_hash); if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); - change_nexthops(fi) { - if (!nexthop_nh->fib_nh_dev) - continue; - hlist_del(&nexthop_nh->nh_hash); - } endfor_nexthops(fi) + if (fi->nh) { + list_del(&fi->nh_list); + } else { + change_nexthops(fi) { + if (!nexthop_nh->fib_nh_dev) + continue; + hlist_del(&nexthop_nh->nh_hash); + } endfor_nexthops(fi) + } fi->fib_dead = 1; fib_info_put(fi); } spin_unlock_bh(&fib_info_lock); } -static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) { - const struct fib_nh *onh = ofi->fib_nh; + const struct fib_nh *onh; + + if (fi->nh || ofi->nh) + return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1; + + if (ofi->fib_nhs == 0) + return 0; for_nexthops(fi) { + onh = fib_info_nh(ofi, nhsel); + if (nh->fib_nh_oif != onh->fib_nh_oif || nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || @@ -296,8 +312,6 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) if (nh->fib_nh_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) return -1; - - onh++; } endfor_nexthops(fi); return 0; } @@ -311,22 +325,78 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val) (val >> (DEVINDEX_HASHBITS * 2))) & mask; } -static inline unsigned int fib_info_hashfn(const struct fib_info *fi) +static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, + u32 prefsrc, u32 priority) { - unsigned int mask = (fib_info_hash_size - 1); - unsigned int val = fi->fib_nhs; + unsigned int val = init_val; - val ^= (fi->fib_protocol << 8) | fi->fib_scope; - val ^= (__force u32)fi->fib_prefsrc; - val ^= fi->fib_priority; - for_nexthops(fi) { - val ^= fib_devindex_hashfn(nh->fib_nh_oif); - } endfor_nexthops(fi) + val ^= (protocol << 8) | scope; + val ^= prefsrc; + val ^= priority; + + return val; +} + +static unsigned int fib_info_hashfn_result(unsigned int val) +{ + unsigned int mask = (fib_info_hash_size - 1); return (val ^ (val >> 7) ^ (val >> 12)) & mask; } -static struct fib_info *fib_find_info(const struct fib_info *nfi) +static inline unsigned int fib_info_hashfn(struct fib_info *fi) +{ + unsigned int val; + + val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, + fi->fib_scope, (__force u32)fi->fib_prefsrc, + fi->fib_priority); + + if (fi->nh) { + val ^= fib_devindex_hashfn(fi->nh->id); + } else { + for_nexthops(fi) { + val ^= fib_devindex_hashfn(nh->fib_nh_oif); + } endfor_nexthops(fi) + } + + return fib_info_hashfn_result(val); +} + +/* no metrics, only nexthop id */ +static struct fib_info *fib_find_info_nh(struct net *net, + const struct fib_config *cfg) +{ + struct hlist_head *head; + struct fib_info *fi; + unsigned int hash; + + hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id), + cfg->fc_protocol, cfg->fc_scope, + (__force u32)cfg->fc_prefsrc, + cfg->fc_priority); + hash = fib_info_hashfn_result(hash); + head = &fib_info_hash[hash]; + + hlist_for_each_entry(fi, head, fib_hash) { + if (!net_eq(fi->fib_net, net)) + continue; + if (!fi->nh || fi->nh->id != cfg->fc_nh_id) + continue; + if (cfg->fc_protocol == fi->fib_protocol && + cfg->fc_scope == fi->fib_scope && + cfg->fc_prefsrc == fi->fib_prefsrc && + cfg->fc_priority == fi->fib_priority && + cfg->fc_type == fi->fib_type && + cfg->fc_table == fi->fib_tb_id && + !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK)) + return fi; + } + + return NULL; +} + +static struct fib_info *fib_find_info(struct fib_info *nfi) { struct hlist_head *head; struct fib_info *fi; @@ -348,7 +418,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && - (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + nh_comp(fi, nfi) == 0) return fi; } @@ -390,34 +460,40 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) + nla_total_size(4) /* RTA_PRIORITY */ + nla_total_size(4) /* RTA_PREFSRC */ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ + unsigned int nhs = fib_info_num_path(fi); /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); - if (fi->fib_nhs) { + if (fi->nh) + payload += nla_total_size(4); /* RTA_NH_ID */ + + if (nhs) { size_t nh_encapsize = 0; - /* Also handles the special case fib_nhs == 1 */ + /* Also handles the special case nhs == 1 */ /* each nexthop is packed in an attribute */ size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); + unsigned int i; /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); /* grab encap info */ - for_nexthops(fi) { - if (nh->fib_nh_lws) { + for (i = 0; i < fib_info_num_path(fi); i++) { + struct fib_nh_common *nhc = fib_info_nhc(fi, i); + + if (nhc->nhc_lwtstate) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( - nh->fib_nh_lws); + nhc->nhc_lwtstate); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } - } endfor_nexthops(fi); + } /* all nexthops are packed in a nested attribute */ - payload += nla_total_size((fi->fib_nhs * nhsize) + - nh_encapsize); + payload += nla_total_size((nhs * nhsize) + nh_encapsize); } @@ -578,12 +654,14 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, return nhs; } +/* only called when fib_nh is integrated into fib_info */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct net *net = fi->fib_net; struct fib_config fib_cfg; + struct fib_nh *nh; int ret; change_nexthops(fi) { @@ -646,24 +724,25 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, } endfor_nexthops(fi); ret = -EINVAL; - if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) { + nh = fib_info_nh(fi, 0); + if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) { NL_SET_ERR_MSG(extack, "Nexthop device index does not match RTA_OIF"); goto errout; } if (cfg->fc_gw_family) { - if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family || + if (cfg->fc_gw_family != nh->fib_nh_gw_family || (cfg->fc_gw_family == AF_INET && - fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) || + nh->fib_nh_gw4 != cfg->fc_gw4) || (cfg->fc_gw_family == AF_INET6 && - ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) { + ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) { NL_SET_ERR_MSG(extack, "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); goto errout; } } #ifdef CONFIG_IP_ROUTE_CLASSID - if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { + if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) { NL_SET_ERR_MSG(extack, "Nexthop class id does not match RTA_FLOW"); goto errout; @@ -674,12 +753,13 @@ errout: return ret; } +/* only called when fib_nh is integrated into fib_info */ static void fib_rebalance(struct fib_info *fi) { int total; int w; - if (fi->fib_nhs < 2) + if (fib_info_num_path(fi) < 2) return; total = 0; @@ -760,28 +840,36 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; + if (cfg->fc_nh_id) { + if (fi->nh && cfg->fc_nh_id == fi->nh->id) + return 0; + return 1; + } + if (cfg->fc_oif || cfg->fc_gw_family) { + struct fib_nh *nh = fib_info_nh(fi, 0); + if (cfg->fc_encap) { if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, - fi->fib_nh, cfg, extack)) + nh, cfg, extack)) return 1; } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && - cfg->fc_flow != fi->fib_nh->nh_tclassid) + cfg->fc_flow != nh->nh_tclassid) return 1; #endif - if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) || + if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) || (cfg->fc_gw_family && - cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family)) + cfg->fc_gw_family != nh->fib_nh_gw_family)) return 1; if (cfg->fc_gw_family == AF_INET && - cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4) + cfg->fc_gw4 != nh->fib_nh_gw4) return 1; if (cfg->fc_gw_family == AF_INET6 && - ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6)) + ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6)) return 1; return 0; @@ -968,7 +1056,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, { struct net_device *dev; struct fib_result res; - int err; + int err = 0; if (nh->fib_nh_flags & RTNH_F_ONLINK) { unsigned int addr_type; @@ -1189,9 +1277,15 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, fib_info_hash_free(old_laddrhash, bytes); } -__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh, - unsigned char scope) +__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, + unsigned char scope) { + struct fib_nh *nh; + + if (nhc->nhc_family != AF_INET) + return inet_select_addr(nhc->nhc_dev, 0, scope); + + nh = container_of(nhc, struct fib_nh, nh_common); nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); @@ -1201,16 +1295,19 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh, __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) { struct fib_nh_common *nhc = res->nhc; - struct fib_nh *nh; if (res->fi->fib_prefsrc) return res->fi->fib_prefsrc; - nh = container_of(nhc, struct fib_nh, nh_common); - if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) - return nh->nh_saddr; + if (nhc->nhc_family == AF_INET) { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) + return nh->nh_saddr; + } - return fib_info_update_nh_saddr(net, nh, res->fi->fib_scope); + return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope); } static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) @@ -1242,6 +1339,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, { int err; struct fib_info *fi = NULL; + struct nexthop *nh = NULL; struct fib_info *ofi; int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; @@ -1261,6 +1359,23 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } + if (cfg->fc_nh_id) { + if (!cfg->fc_mx) { + fi = fib_find_info_nh(net, cfg); + if (fi) { + fi->fib_treeref++; + return fi; + } + } + + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto err_inval; + } + nhs = 0; + } + #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); @@ -1296,7 +1411,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, extack); - if (unlikely(IS_ERR(fi->fib_metrics))) { + if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); return ERR_PTR(err); @@ -1313,14 +1428,25 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; - change_nexthops(fi) { - nexthop_nh->nh_parent = fi; - } endfor_nexthops(fi) + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = 0; + fi->nh = nh; + } + } else { + change_nexthops(fi) { + nexthop_nh->nh_parent = fi; + } endfor_nexthops(fi) - if (cfg->fc_mp) - err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); - else - err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); + if (cfg->fc_mp) + err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, + extack); + else + err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); + } if (err != 0) goto failure; @@ -1351,7 +1477,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } - if (cfg->fc_scope == RT_SCOPE_HOST) { + if (fi->nh) { + err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack); + if (err) + goto failure; + } else if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ @@ -1366,7 +1496,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } nh->fib_nh_scope = RT_SCOPE_NOWHERE; - nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif); + nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif); err = -ENODEV; if (!nh->fib_nh_dev) goto failure; @@ -1391,13 +1521,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } - change_nexthops(fi) { - fib_info_update_nh_saddr(net, nexthop_nh, fi->fib_scope); - if (nexthop_nh->fib_nh_gw_family == AF_INET6) - fi->fib_nh_is_v6 = true; - } endfor_nexthops(fi) + if (!fi->nh) { + change_nexthops(fi) { + fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common, + fi->fib_scope); + if (nexthop_nh->fib_nh_gw_family == AF_INET6) + fi->fib_nh_is_v6 = true; + } endfor_nexthops(fi) - fib_rebalance(fi); + fib_rebalance(fi); + } link_it: ofi = fib_find_info(fi); @@ -1419,16 +1552,20 @@ link_it: head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)]; hlist_add_head(&fi->fib_lhash, head); } - change_nexthops(fi) { - struct hlist_head *head; - unsigned int hash; + if (fi->nh) { + list_add(&fi->nh_list, &nh->fi_list); + } else { + change_nexthops(fi) { + struct hlist_head *head; + unsigned int hash; - if (!nexthop_nh->fib_nh_dev) - continue; - hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); - head = &fib_info_devhash[hash]; - hlist_add_head(&nexthop_nh->nh_hash, head); - } endfor_nexthops(fi) + if (!nexthop_nh->fib_nh_dev) + continue; + hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_add_head(&nexthop_nh->nh_hash, head); + } endfor_nexthops(fi) + } spin_unlock_bh(&fib_info_lock); return fi; @@ -1555,6 +1692,12 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) if (!mp) goto nla_put_failure; + if (unlikely(fi->nh)) { + if (nexthop_mpath_fill_node(skb, fi->nh) < 0) + goto nla_put_failure; + goto mp_end; + } + for_nexthops(fi) { if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0) goto nla_put_failure; @@ -1565,6 +1708,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif } endfor_nexthops(fi); +mp_end: nla_nest_end(skb, mp); return 0; @@ -1583,6 +1727,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { + unsigned int nhs = fib_info_num_path(fi); struct nlmsghdr *nlh; struct rtmsg *rtm; @@ -1618,18 +1763,31 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_prefsrc && nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; - if (fi->fib_nhs == 1) { - struct fib_nh *nh = &fi->fib_nh[0]; + + if (fi->nh) { + if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id)) + goto nla_put_failure; + if (nexthop_is_blackhole(fi->nh)) + rtm->rtm_type = RTN_BLACKHOLE; + } + + if (nhs == 1) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); unsigned char flags = 0; - if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0) + if (fib_nexthop_info(skb, nhc, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid && - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; + if (nhc->nhc_family == AF_INET) { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_tclassid && + nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) + goto nla_put_failure; + } #endif } else { if (fib_add_multipath(skb, fi) < 0) @@ -1757,6 +1915,8 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed + * + * only used when fib_nh is built into fib_info */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { @@ -1838,6 +1998,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; + struct fib_nh *nh; if (fa->fa_slen != slen) continue; @@ -1859,8 +2020,9 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - if (!next_fi->fib_nh[0].fib_nh_gw4 || - next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK) + + nh = fib_info_nh(next_fi, 0); + if (!nh->fib_nh_gw4 || nh->fib_nh_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); @@ -1902,6 +2064,8 @@ out: /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. + * + * only used when fib_nh is built into fib_info */ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { @@ -1996,6 +2160,11 @@ void fib_select_multipath(struct fib_result *res, int hash) struct net *net = fi->fib_net; bool first = false; + if (unlikely(res->fi->nh)) { + nexthop_path_fib_result(res, hash); + return; + } + change_nexthops(fi) { if (net->ipv4.sysctl_fib_multipath_use_neigh) { if (!fib_good_nh(nexthop_nh)) @@ -2024,7 +2193,7 @@ void fib_select_path(struct net *net, struct fib_result *res, goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1) { + if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b53ecef89d59..90f0fc8c87bd 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. * * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet * & Swedish University of Agricultural Sciences. @@ -18,28 +15,19 @@ * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. * http://www.csc.kth.se/~snilsson/software/dyntrie2/ * - * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * - * * Code from fib_hash has been reused which includes the following header: * - * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 FIB: lookup engine and maintenance routines. * - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Substantial contributions to this work comes from: * * David S. Miller, <davem@davemloft.net> @@ -350,12 +338,18 @@ static struct tnode *tnode_alloc(int bits) static inline void empty_child_inc(struct key_vector *n) { - ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children; + tn_info(n)->empty_children++; + + if (!tn_info(n)->empty_children) + tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { - tn_info(n)->empty_children-- ? : tn_info(n)->full_children--; + if (!tn_info(n)->empty_children) + tn_info(n)->full_children--; + + tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) @@ -1461,6 +1455,7 @@ found: fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { +out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif @@ -1469,7 +1464,13 @@ found: } if (fi->fib_flags & RTNH_F_DEAD) continue; - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + + if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) { + err = fib_props[RTN_BLACKHOLE].error; + goto out_reject; + } + + for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); if (nhc->nhc_flags & RTNH_F_DEAD) @@ -2717,14 +2718,18 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } -static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) +static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; - if (fi && fi->fib_nh->fib_nh_gw4) - flags |= RTF_GATEWAY; + if (fi) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + + if (nhc->nhc_gw.ipv4) + flags |= RTF_GATEWAY; + } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; @@ -2755,7 +2760,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { - const struct fib_info *fi = fa->fa_info; + struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); @@ -2768,26 +2773,31 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) seq_setwidth(seq, 127); - if (fi) + if (fi) { + struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + __be32 gw = 0; + + if (nhc->nhc_gw_family == AF_INET) + gw = nhc->nhc_gw.ipv4; + seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", - fi->fib_dev ? fi->fib_dev->name : "*", - prefix, - fi->fib_nh->fib_nh_gw4, flags, 0, 0, + nhc->nhc_dev ? nhc->nhc_dev->name : "*", + prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); - else + } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); - + } seq_pad(seq, '\n'); } diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7c4a41dc04bb..293acfb36376 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * GRE over IPv4 demultiplexer driver * * Authors: Dmitry Kozlov (xeb@mail.ru) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 6c63524f598a..4de7e962d3da 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * IPV4 GSO/GRO offload support * Linux INET implementation * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * GRE GSO support */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f3a5893b1e86..1510e951f451 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * NET3: Implementation of the ICMP protocol layer. * * Alan Cox, <alan@lxorguk.ukuu.org.uk> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Some of the function names and the icmp unreach table for this * module were derived from [icmp.c 1.0.11 06/02/93] by * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting. @@ -59,7 +55,6 @@ * * - Should use skb_pull() instead of all the manual checking. * This would also greatly simply some upper layer error handlers. --AK - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -206,7 +201,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; */ static struct sock *icmp_sk(struct net *net) { - return *this_cpu_ptr(net->ipv4.icmp_sk); + return this_cpu_read(*net->ipv4.icmp_sk); } /* Called with BH disabled */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index eb03153dfe12..9a206931a342 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: Internet Group Management Protocol [IGMP] * @@ -11,11 +12,6 @@ * Authors: * Alan Cox <alan@lxorguk.ukuu.org.uk> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Fixes: * * Alan Cox : Added lots of __inline__ to optimise @@ -336,14 +332,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev, const struct flowi4 *fl4) { struct in_device *in_dev = __in_dev_get_rcu(dev); + const struct in_ifaddr *ifa; if (!in_dev) return htonl(INADDR_ANY); - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (fl4->saddr == ifa->ifa_local) return fl4->saddr; - } endfor_ifa(in_dev); + } return htonl(INADDR_ANY); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a175e3e7ae97..fb0b4b0994ec 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -6,11 +7,6 @@ * Support for INET connection oriented protocols. * * Authors: See the TCP sources - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or(at your option) any later version. */ #include <linux/module.h> @@ -653,8 +649,7 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) EXPORT_SYMBOL(inet_rtx_syn_ack); /* return true if req was found in the ehash table */ -static bool reqsk_queue_unlink(struct request_sock_queue *queue, - struct request_sock *req) +static bool reqsk_queue_unlink(struct request_sock *req) { struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; bool found = false; @@ -673,7 +668,7 @@ static bool reqsk_queue_unlink(struct request_sock_queue *queue, void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { - if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) { + if (reqsk_queue_unlink(req)) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5731670c560b..bbb005eb5218 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * inet_diag.c Module for monitoring INET transport protocols sockets. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 35e9784fab4e..d666756be5f1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * inet fragments management * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Authors: Pavel Emelyanov <xemul@openvz.org> * Started as consolidation of ipv4/ip_fragment.c, * ipv6/reassembly. and ipv6 nf conntrack reassembly @@ -149,10 +145,9 @@ static void inet_frags_free_cb(void *ptr, void *arg) inet_frag_destroy(fq); } -static void fqdir_rwork_fn(struct work_struct *work) +static void fqdir_work_fn(struct work_struct *work) { - struct fqdir *fqdir = container_of(to_rcu_work(work), - struct fqdir, destroy_rwork); + struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); struct inet_frags *f = fqdir->f; rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); @@ -191,18 +186,8 @@ EXPORT_SYMBOL(fqdir_init); void fqdir_exit(struct fqdir *fqdir) { - fqdir->high_thresh = 0; /* prevent creation of new frags */ - - fqdir->dead = true; - - /* call_rcu is supposed to provide memory barrier semantics, - * separating the setting of fqdir->dead with the destruction - * work. This implicit barrier is paired with inet_frag_kill(). - */ - - INIT_RCU_WORK(&fqdir->destroy_rwork, fqdir_rwork_fn); - queue_rcu_work(system_wq, &fqdir->destroy_rwork); - + INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); + queue_work(system_wq, &fqdir->destroy_work); } EXPORT_SYMBOL(fqdir_exit); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 942265d65eb3..97824864e40d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -6,11 +7,6 @@ * Generic INET transport hashtables * * Authors: Lotsa people, from code originally in tcp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> @@ -320,7 +316,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); done: - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1ffaec056821..4385eb9e781f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -143,6 +143,10 @@ static void ip_expire(struct timer_list *t) net = qp->q.fqdir->net; rcu_read_lock(); + + if (qp->q.fqdir->dead) + goto out_rcu_unlock; + spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) @@ -676,6 +680,11 @@ static int __net_init ipv4_frags_init_net(struct net *net) return res; } +static void __net_exit ipv4_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv4.fqdir); +} + static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); @@ -683,8 +692,9 @@ static void __net_exit ipv4_frags_exit_net(struct net *net) } static struct pernet_operations ip4_frags_ops = { - .init = ipv4_frags_init_net, - .exit = ipv4_frags_exit_net, + .init = ipv4_frags_init_net, + .pre_exit = ipv4_frags_pre_exit_net, + .exit = ipv4_frags_exit_net, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4b0526441476..a53a543fe055 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: GRE over IP protocol decoder. * * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index ed97724c5e33..1e2392b7c64e 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -14,7 +15,6 @@ * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * - * * Fixes: * Alan Cox : Commented a couple of minor bits of surplus code * Alan Cox : Undefining IP_FORWARD doesn't include the code @@ -96,8 +96,6 @@ * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * - * - * * To Fix: * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit @@ -106,11 +104,6 @@ * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "IPv4: " fmt diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ceca5285d9b4..cdd6c3418b9e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -575,8 +575,7 @@ void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, { unsigned int first_len = skb_pagelen(skb); - iter->frag_list = skb_shinfo(skb)->frag_list; - iter->frag = iter->frag_list; + iter->frag = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); iter->offset = 0; @@ -845,7 +844,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(iter.frag_list); + kfree_skb_list(iter.frag); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; @@ -995,7 +994,7 @@ static int __ip_append_data(struct sock *sk, uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; - extra_uref = !skb; /* only extra ref if !MSG_MORE */ + extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; @@ -1633,7 +1632,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len) + unsigned int len, u64 transmit_time) { struct ip_options_data replyopts; struct ipcm_cookie ipc; @@ -1649,6 +1648,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, ipcm_init(&ipc); ipc.addr = daddr; + ipc.sockc.transmit_time = transmit_time; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index a5d8cad18ead..38c02bb62e2c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 30c1c264bdfc..9e3846388fb3 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 254a42e83ff9..cfb025606793 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -1,15 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: IP/IP protocol decoder modified to support * virtual tunnel interface * * Authors: * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012 - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ /* diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 9119d012ba46..2f4cdcc13d53 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * IP Payload Compression Protocol (IPComp) - RFC3173. * * Copyright (c) 2003 James Morris <jmorris@intercode.com.au> * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * * Todo: * - Tunable compression parameters. * - Compression stats. diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fe10b9a2efc8..43adfc1641ba 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: IP/IP protocol decoder. * @@ -16,12 +17,6 @@ * Carlos Picoto : GRE over IP support * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. * I do not want to merge them together. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ /* tunnel.c: an IP tunnel driver diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2c61e10a60e3..c07bc82cbbe9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * IP multicast routing support for mrouted 3.6/3.8 * * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk> * Linux Consultancy and Custom Driver Development * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Fixes: * Michael Chastain : Incorrect size of copying. * Alan Cox : Added the cache manager code @@ -23,7 +19,6 @@ * Carlos Picoto : PIMv1 Support * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header * Relax this requirement to work with older peers. - * */ #include <linux/uaccess.h> diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 87ca2c42359b..a4e07e5e9c11 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -17,7 +17,7 @@ target(struct sk_buff *skb, const struct xt_action_param *par) unsigned char *arpptr; int pln, hln; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return NF_DROP; arp = arp_hdr(skb); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index aaaf9a81fbc9..9f6751893660 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -32,7 +32,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo) if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { __u8 oldtos; - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return false; iph = ip_hdr(skb); oldtos = iph->tos; @@ -61,7 +61,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) tcph->cwr == einfo->proto.tcp.cwr)) return true; - if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) + if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) return false; tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb); diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 7875c98072eb..15f2b2604890 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -59,7 +59,7 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff, net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); return -1; } - /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy + /* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy * or pull everything in a linear buffer, so we can safely * use the skb pointers now */ *data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c index 657d2dcec3cc..717b726504fe 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -186,7 +186,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } - if (!skb_make_writable(skb, skb->len)) { + if (skb_ensure_writable(skb, skb->len)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); return NF_DROP; } diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 164714104965..40c93b3bd731 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -53,6 +53,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4); __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) { + const struct in_ifaddr *ifa; struct in_device *indev; __be32 laddr; @@ -61,10 +62,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) laddr = 0; indev = __in_dev_get_rcu(skb->dev); - for_primary_ifa(indev) { + + in_dev_for_each_ifa_rcu(ifa, indev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + laddr = ifa->ifa_local; break; - } endfor_ifa(indev); + } return laddr ? laddr : daddr; } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 7a5a3d08fec3..5fe5a3981d43 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -105,6 +105,8 @@ static struct nexthop *nexthop_alloc(void) nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); if (nh) { + INIT_LIST_HEAD(&nh->fi_list); + INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); } return nh; @@ -515,6 +517,162 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) } EXPORT_SYMBOL_GPL(nexthop_select_path); +int nexthop_for_each_fib6_nh(struct nexthop *nh, + int (*cb)(struct fib6_nh *nh, void *arg), + void *arg) +{ + struct nh_info *nhi; + int err; + + if (nh->is_group) { + struct nh_group *nhg; + int i; + + nhg = rcu_dereference_rtnl(nh->nh_grp); + for (i = 0; i < nhg->num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + nhi = rcu_dereference_rtnl(nhge->nh->nh_info); + err = cb(&nhi->fib6_nh, arg); + if (err) + return err; + } + } else { + nhi = rcu_dereference_rtnl(nh->nh_info); + err = cb(&nhi->fib6_nh, arg); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); + +static int check_src_addr(const struct in6_addr *saddr, + struct netlink_ext_ack *extack) +{ + if (!ipv6_addr_any(saddr)) { + NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); + return -EINVAL; + } + return 0; +} + +int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + /* fib6_src is unique to a fib6_info and limits the ability to cache + * routes in fib6_nh within a nexthop that is potentially shared + * across multiple fib entries. If the config wants to use source + * routing it can not use nexthop objects. mlxsw also does not allow + * fib6_src on routes. + */ + if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) + return -EINVAL; + + if (nh->is_group) { + struct nh_group *nhg; + + nhg = rtnl_dereference(nh->nh_grp); + if (nhg->has_v4) + goto no_v4_nh; + } else { + nhi = rtnl_dereference(nh->nh_info); + if (nhi->family == AF_INET) + goto no_v4_nh; + } + + return 0; +no_v4_nh: + NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(fib6_check_nexthop); + +/* if existing nexthop has ipv6 routes linked to it, need + * to verify this new spec works with ipv6 + */ +static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct fib6_info *f6i; + + if (list_empty(&old->f6i_list)) + return 0; + + list_for_each_entry(f6i, &old->f6i_list, nh_list) { + if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) + return -EINVAL; + } + + return fib6_check_nexthop(new, NULL, extack); +} + +static int nexthop_check_scope(struct nexthop *nh, u8 scope, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + nhi = rtnl_dereference(nh->nh_info); + if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have a gateway"); + return -EINVAL; + } + + if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); + return -EINVAL; + } + + return 0; +} + +/* Invoked by fib add code to verify nexthop by id is ok with + * config for prefix; parts of fib_check_nh not done when nexthop + * object is used. + */ +int fib_check_nexthop(struct nexthop *nh, u8 scope, + struct netlink_ext_ack *extack) +{ + int err = 0; + + if (nh->is_group) { + struct nh_group *nhg; + + if (scope == RT_SCOPE_HOST) { + NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); + err = -EINVAL; + goto out; + } + + nhg = rtnl_dereference(nh->nh_grp); + /* all nexthops in a group have the same scope */ + err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack); + } else { + err = nexthop_check_scope(nh, scope, extack); + } +out: + return err; +} + +static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct fib_info *fi; + + list_for_each_entry(fi, &old->fi_list, nh_list) { + int err; + + err = fib_check_nexthop(new, fi->fib_scope, extack); + if (err) + return err; + } + return 0; +} + static void nh_group_rebalance(struct nh_group *nhg) { int total = 0; @@ -607,9 +765,33 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) } } +/* not called for nexthop replace */ +static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) +{ + struct fib6_info *f6i, *tmp; + bool do_flush = false; + struct fib_info *fi; + + list_for_each_entry(fi, &nh->fi_list, nh_list) { + fi->fib_flags |= RTNH_F_DEAD; + do_flush = true; + } + if (do_flush) + fib_flush(net); + + /* ip6_del_rt removes the entry from this list hence the _safe */ + list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { + /* __ip6_del_rt does a release, so do a hold here */ + fib6_info_hold(f6i); + ipv6_stub->ip6_del_rt(net, f6i); + } +} + static void __remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { + __remove_nexthop_fib(net, nh); + if (nh->is_group) { remove_nexthop_group(nh, nlinfo); } else { @@ -638,10 +820,171 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, nexthop_put(nh); } +/* if any FIB entries reference this nexthop, any dst entries + * need to be regenerated + */ +static void nh_rt_cache_flush(struct net *net, struct nexthop *nh) +{ + struct fib6_info *f6i; + + if (!list_empty(&nh->fi_list)) + rt_cache_flush(net); + + list_for_each_entry(f6i, &nh->f6i_list, nh_list) + ipv6_stub->fib6_update_sernum(net, f6i); +} + +static int replace_nexthop_grp(struct net *net, struct nexthop *old, + struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct nh_group *oldg, *newg; + int i; + + if (!new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); + return -EINVAL; + } + + oldg = rtnl_dereference(old->nh_grp); + newg = rtnl_dereference(new->nh_grp); + + /* update parents - used by nexthop code for cleanup */ + for (i = 0; i < newg->num_nh; i++) + newg->nh_entries[i].nh_parent = old; + + rcu_assign_pointer(old->nh_grp, newg); + + for (i = 0; i < oldg->num_nh; i++) + oldg->nh_entries[i].nh_parent = new; + + rcu_assign_pointer(new->nh_grp, oldg); + + return 0; +} + +static int replace_nexthop_single(struct net *net, struct nexthop *old, + struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct nh_info *oldi, *newi; + + if (new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); + return -EINVAL; + } + + oldi = rtnl_dereference(old->nh_info); + newi = rtnl_dereference(new->nh_info); + + newi->nh_parent = old; + oldi->nh_parent = new; + + old->protocol = new->protocol; + old->nh_flags = new->nh_flags; + + rcu_assign_pointer(old->nh_info, newi); + rcu_assign_pointer(new->nh_info, oldi); + + return 0; +} + +static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, + struct nl_info *info) +{ + struct fib6_info *f6i; + + if (!list_empty(&nh->fi_list)) { + struct fib_info *fi; + + /* expectation is a few fib_info per nexthop and then + * a lot of routes per fib_info. So mark the fib_info + * and then walk the fib tables once + */ + list_for_each_entry(fi, &nh->fi_list, nh_list) + fi->nh_updated = true; + + fib_info_notify_update(net, info); + + list_for_each_entry(fi, &nh->fi_list, nh_list) + fi->nh_updated = false; + } + + list_for_each_entry(f6i, &nh->f6i_list, nh_list) + ipv6_stub->fib6_rt_update(net, f6i, info); +} + +/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries + * linked to this nexthop and for all groups that the nexthop + * is a member of + */ +static void nexthop_replace_notify(struct net *net, struct nexthop *nh, + struct nl_info *info) +{ + struct nh_grp_entry *nhge; + + __nexthop_replace_notify(net, nh, info); + + list_for_each_entry(nhge, &nh->grp_list, nh_list) + __nexthop_replace_notify(net, nhge->nh_parent, info); +} + static int replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { - return -EEXIST; + bool new_is_reject = false; + struct nh_grp_entry *nhge; + int err; + + /* check that existing FIB entries are ok with the + * new nexthop definition + */ + err = fib_check_nh_list(old, new, extack); + if (err) + return err; + + err = fib6_check_nh_list(old, new, extack); + if (err) + return err; + + if (!new->is_group) { + struct nh_info *nhi = rtnl_dereference(new->nh_info); + + new_is_reject = nhi->reject_nh; + } + + list_for_each_entry(nhge, &old->grp_list, nh_list) { + /* if new nexthop is a blackhole, any groups using this + * nexthop cannot have more than 1 path + */ + if (new_is_reject && + nexthop_num_path(nhge->nh_parent) > 1) { + NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); + return -EINVAL; + } + + err = fib_check_nh_list(nhge->nh_parent, new, extack); + if (err) + return err; + + err = fib6_check_nh_list(nhge->nh_parent, new, extack); + if (err) + return err; + } + + if (old->is_group) + err = replace_nexthop_grp(net, old, new, extack); + else + err = replace_nexthop_single(net, old, new, extack); + + if (!err) { + nh_rt_cache_flush(net, old); + + __remove_nexthop(net, new, NULL); + nexthop_put(new); + } + + return err; } /* called with rtnl_lock held */ @@ -653,6 +996,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, bool replace = !!(cfg->nlflags & NLM_F_REPLACE); bool create = !!(cfg->nlflags & NLM_F_CREATE); u32 new_id = new_nh->id; + int replace_notify = 0; int rc = -EEXIST; pp = &root->rb_node; @@ -672,8 +1016,10 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, pp = &next->rb_right; } else if (replace) { rc = replace_nexthop(net, nh, new_nh, extack); - if (!rc) + if (!rc) { new_nh = nh; /* send notification with old nh */ + replace_notify = 1; + } goto out; } else { /* id already exists and not a replace */ @@ -694,6 +1040,8 @@ out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); + if (replace_notify) + nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } return rc; @@ -815,7 +1163,8 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; - fib_info_update_nh_saddr(net, fib_nh, fib_nh->fib_nh_scope); + fib_info_update_nhc_saddr(net, &fib_nh->nh_common, + fib_nh->fib_nh_scope); } else { fib_nh_release(net, fib_nh); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 834be7daeb32..9d24ef5c5d8f 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -5,11 +6,6 @@ * * "Ping" sockets * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Based on ipv4/udp.c code. * * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), @@ -17,7 +13,6 @@ * * Pavel gave all rights to bugs to Vasiliy, * none of the bugs are Pavel's now. - * */ #include <linux/uaccess.h> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4746f963c439..cc90243ccf76 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -25,11 +26,6 @@ * split functions for more readibility. * Andi Kleen : Add support for /proc/net/netstat * Arnaldo C. Melo : Convert to seq_file - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/types.h> #include <net/net_namespace.h> @@ -291,6 +287,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), + SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 92d249e053be..9a8c0892622b 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -16,11 +17,6 @@ * Richard Colella : Hang on hash collision * Vince Laviano : Modified inet_del_protocol() to correctly * maintain copy bit. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/cache.h> #include <linux/module.h> diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0e482f07b37f..0b8e06ca75d6 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -30,11 +31,6 @@ * Alan Cox : Added IP_HDRINCL option. * Alan Cox : Skip broadcast check if BSDism set. * David S. Miller : New socket lookup architecture. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/types.h> diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 11ddc276776e..66cbe8a7a168 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -55,11 +56,6 @@ * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect * Ilia Sotnikov : Removed TOS from hash calculations - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "IPv4: " fmt @@ -99,6 +95,7 @@ #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/arp.h> #include <net/tcp.h> #include <net/icmp.h> @@ -1584,7 +1581,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID - { + if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); @@ -1933,6 +1930,23 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.basic.ip_proto = fl4->flowi4_proto; } break; + case 2: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + /* skb is currently provided only when forwarding */ + if (skb) { + struct flow_keys keys; + + skb_flow_dissect_flow_keys(skb, &keys, 0); + + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else { + /* Same as case 0 */ + hash_keys.addrs.v4addrs.src = fl4->saddr; + hash_keys.addrs.v4addrs.dst = fl4->daddr; + } + break; } mhash = flow_hash_from_keys(&hash_keys); @@ -1950,7 +1964,7 @@ static int ip_mkroute_input(struct sk_buff *skb, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) { + if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); @@ -1985,7 +1999,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 itag = 0; struct rtable *rth; struct flowi4 fl4; - bool do_cache; + bool do_cache = true; /* IP on this device is disabled. */ @@ -2062,6 +2076,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res->type == RTN_BROADCAST) { if (IN_DEV_BFORWARD(in_dev)) goto make_route; + /* not do cache if bc_forwarding is enabled */ + if (IPV4_DEVCONF_ALL(net, BC_FORWARDING)) + do_cache = false; goto brd_input; } @@ -2099,18 +2116,15 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: - do_cache = false; - if (res->fi) { - if (!itag) { - struct fib_nh_common *nhc = FIB_RES_NHC(*res); + do_cache &= res->fi && !itag; + if (do_cache) { + struct fib_nh_common *nhc = FIB_RES_NHC(*res); - rth = rcu_dereference(nhc->nhc_rth_input); - if (rt_cache_valid(rth)) { - skb_dst_set_noref(skb, &rth->dst); - err = 0; - goto out; - } - do_cache = true; + rth = rcu_dereference(nhc->nhc_rth_input); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + err = 0; + goto out; } } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 008545f63667..535b69326f66 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Syncookies implementation for the Linux kernel * * Copyright (C) 1997 Andi Kleen * Based on ideas by D.J.Bernstein and Eric Schenk. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/tcp.h> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 90f09e47198b..7d802acde040 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -39,6 +39,8 @@ static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; +static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; +static int tcp_min_snd_mss_max = 65535; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; @@ -601,6 +603,18 @@ static struct ctl_table ipv4_table[] = { .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, + { + .procname = "tcp_rx_skb_cache", + .data = &tcp_rx_skb_cache_key.key, + .mode = 0644, + .proc_handler = proc_do_static_key, + }, + { + .procname = "tcp_tx_skb_cache", + .data = &tcp_tx_skb_cache_key.key, + .mode = 0644, + .proc_handler = proc_do_static_key, + }, { } }; @@ -800,6 +814,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "tcp_min_snd_mss", + .data = &init_net.ipv4.sysctl_tcp_min_snd_mss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_min_snd_mss_min, + .extra2 = &tcp_min_snd_mss_max, + }, + { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, .maxlen = sizeof(int), @@ -1008,7 +1031,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, - .extra2 = &one, + .extra2 = &two, }, #endif { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27ce13ece510..efd7f2b1d1f0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -205,11 +206,6 @@ * Hirokazu Takahashi : Use copy_from_user() instead of * csum_and_copy_from_user() if possible. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or(at your option) any later version. - * * Description of States: * * TCP_SYN_SENT sent a connection request, waiting for ack @@ -321,6 +317,11 @@ struct tcp_splice_state { unsigned long tcp_memory_pressure __read_mostly; EXPORT_SYMBOL_GPL(tcp_memory_pressure); +DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); +EXPORT_SYMBOL(tcp_rx_skb_cache_key); + +DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); + void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; @@ -2740,6 +2741,21 @@ static int tcp_repair_options_est(struct sock *sk, return 0; } +DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); +EXPORT_SYMBOL(tcp_tx_delay_enabled); + +static void tcp_enable_tx_delay(void) +{ + if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { + static int __tcp_tx_delay_enabled = 0; + + if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { + static_branch_enable(&tcp_tx_delay_enabled); + pr_info("TCP_TX_DELAY enabled\n"); + } + } +} + /* * Socket option code for TCP. */ @@ -3091,6 +3107,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->recvmsg_inq = val; break; + case TCP_TX_DELAY: + if (val) + tcp_enable_tx_delay(); + tp->tcp_tx_delay = val; + break; default: err = -ENOPROTOOPT; break; @@ -3550,6 +3571,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_no_cookie; break; + case TCP_TX_DELAY: + val = tp->tcp_tx_delay; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; @@ -3883,6 +3908,7 @@ void __init tcp_init(void) unsigned long limit; unsigned int i; + BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 477cb4aa456c..79f705450c16 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* DataCenter TCP (DCTCP) congestion control. * * http://simula.stanford.edu/~alizade/Site/DCTCP.html @@ -33,11 +34,6 @@ * Daniel Borkmann <dborkman@redhat.com> * Florian Westphal <fw@strlen.de> * Glenn Judd <glenn.judd@morganstanley.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. */ #include <linux/module.h> diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 81148f7a2323..a3a386236d93 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * tcp_diag.c Module for monitoring TCP transport protocols sockets. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8e1580485c9e..46b67128e1ca 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -7,6 +7,7 @@ #include <linux/tcp.h> #include <linux/rcupdate.h> #include <linux/rculist.h> +#include <linux/siphash.h> #include <net/inetpeer.h> #include <net/tcp.h> @@ -37,14 +38,8 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = container_of(head, struct tcp_fastopen_context, rcu); - int i; - /* We own ctx, thus no need to hold the Fastopen-lock */ - for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++) { - if (ctx->tfm[i]) - crypto_free_cipher(ctx->tfm[i]); - } - kfree(ctx); + kzfree(ctx); } void tcp_fastopen_destroy_cipher(struct sock *sk) @@ -72,41 +67,6 @@ void tcp_fastopen_ctx_destroy(struct net *net) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); } -struct tcp_fastopen_context *tcp_fastopen_alloc_ctx(void *primary_key, - void *backup_key, - unsigned int len) -{ - struct tcp_fastopen_context *new_ctx; - void *key = primary_key; - int err, i; - - new_ctx = kmalloc(sizeof(*new_ctx), GFP_KERNEL); - if (!new_ctx) - return ERR_PTR(-ENOMEM); - for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++) - new_ctx->tfm[i] = NULL; - for (i = 0; i < (backup_key ? 2 : 1); i++) { - new_ctx->tfm[i] = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(new_ctx->tfm[i])) { - err = PTR_ERR(new_ctx->tfm[i]); - new_ctx->tfm[i] = NULL; - pr_err("TCP: TFO aes cipher alloc error: %d\n", err); - goto out; - } - err = crypto_cipher_setkey(new_ctx->tfm[i], key, len); - if (err) { - pr_err("TCP: TFO cipher key error: %d\n", err); - goto out; - } - memcpy(&new_ctx->key[i * TCP_FASTOPEN_KEY_LENGTH], key, len); - key = backup_key; - } - return new_ctx; -out: - tcp_fastopen_ctx_free(&new_ctx->rcu); - return ERR_PTR(err); -} - int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key, unsigned int len) @@ -115,11 +75,20 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, struct fastopen_queue *q; int err = 0; - ctx = tcp_fastopen_alloc_ctx(primary_key, backup_key, len); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; goto out; } + + memcpy(ctx->key[0], primary_key, len); + if (backup_key) { + memcpy(ctx->key[1], backup_key, len); + ctx->num = 2; + } else { + ctx->num = 1; + } + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; @@ -141,31 +110,30 @@ out: static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, struct sk_buff *syn, - struct crypto_cipher *tfm, + const u8 *key, struct tcp_fastopen_cookie *foc) { + BUILD_BUG_ON(TCP_FASTOPEN_KEY_LENGTH != sizeof(siphash_key_t)); + BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64)); + if (req->rsk_ops->family == AF_INET) { const struct iphdr *iph = ip_hdr(syn); - __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - crypto_cipher_encrypt_one(tfm, foc->val, (void *)path); + foc->val[0] = siphash(&iph->saddr, + sizeof(iph->saddr) + + sizeof(iph->daddr), + (const siphash_key_t *)key); foc->len = TCP_FASTOPEN_COOKIE_SIZE; return true; } - #if IS_ENABLED(CONFIG_IPV6) if (req->rsk_ops->family == AF_INET6) { const struct ipv6hdr *ip6h = ipv6_hdr(syn); - struct tcp_fastopen_cookie tmp; - struct in6_addr *buf; - int i; - - crypto_cipher_encrypt_one(tfm, tmp.val, - (void *)&ip6h->saddr); - buf = &tmp.addr; - for (i = 0; i < 4; i++) - buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - crypto_cipher_encrypt_one(tfm, foc->val, (void *)buf); + + foc->val[0] = siphash(&ip6h->saddr, + sizeof(ip6h->saddr) + + sizeof(ip6h->daddr), + (const siphash_key_t *)key); foc->len = TCP_FASTOPEN_COOKIE_SIZE; return true; } @@ -173,11 +141,8 @@ static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, return false; } -/* Generate the fastopen cookie by doing aes128 encryption on both - * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 - * addresses. For the longer IPv6 addresses use CBC-MAC. - * - * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. +/* Generate the fastopen cookie by applying SipHash to both the source and + * destination addresses. */ static void tcp_fastopen_cookie_gen(struct sock *sk, struct request_sock *req, @@ -189,7 +154,7 @@ static void tcp_fastopen_cookie_gen(struct sock *sk, rcu_read_lock(); ctx = tcp_fastopen_get_ctx(sk); if (ctx) - __tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[0], foc); + __tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[0], foc); rcu_read_unlock(); } @@ -253,7 +218,7 @@ static int tcp_fastopen_cookie_gen_check(struct sock *sk, if (!ctx) goto out; for (i = 0; i < tcp_fastopen_context_len(ctx); i++) { - __tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[i], foc); + __tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[i], foc); if (tcp_fastopen_cookie_match(foc, orig)) { ret = i + 1; goto out; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 08a477e74cf3..b71efeb0ae5b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -119,7 +119,7 @@ void clean_acked_data_enable(struct inet_connection_sock *icsk, void (*cad)(struct sock *sk, u32 ack_seq)) { icsk->icsk_clean_acked = cad; - static_branch_inc(&clean_acked_data_enabled.key); + static_branch_deferred_inc(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); @@ -1302,7 +1302,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, TCP_SKB_CB(skb)->seq += shifted; tcp_skb_pcount_add(prev, pcount); - BUG_ON(tcp_skb_pcount(skb) < pcount); + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, @@ -1368,6 +1368,21 @@ static int skb_can_shift(const struct sk_buff *skb) return !skb_headlen(skb) && skb_is_nonlinear(skb); } +int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, + int pcount, int shiftlen) +{ + /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) + * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need + * to make sure not storing more than 65535 * 8 bytes per skb, + * even if current MSS is bigger. + */ + if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) + return 0; + if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) + return 0; + return skb_shift(to, from, shiftlen); +} + /* Try collapsing SACK blocks spanning across multiple skbs to a single * skb. */ @@ -1473,7 +1488,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) goto fallback; - if (!skb_shift(prev, skb, len)) + if (!tcp_skb_shift(prev, skb, pcount, len)) goto fallback; if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; @@ -1491,11 +1506,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto out; len = skb->len; - if (skb_shift(prev, skb, len)) { - pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), + pcount = tcp_skb_pcount(skb); + if (tcp_skb_shift(prev, skb, pcount, len)) + tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, 0); - } out: return prev; @@ -2648,7 +2662,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); - if ((flag & FLAG_SND_UNA_ADVANCED) && + if ((flag & FLAG_SND_UNA_ADVANCED || tp->fastopen_rsk) && tcp_try_undo_loss(sk, false)) return; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index af81e4a6a8d8..d57641cb3477 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -7,18 +8,12 @@ * * IPv4 specific functions * - * * code split from: * linux/ipv4/tcp.c * linux/ipv4/tcp_input.c * linux/ipv4/tcp_output.c * * See tcp.c for author information - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ /* @@ -667,8 +662,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) int genhash; struct sock *sk1 = NULL; #endif - struct net *net; + u64 transmit_time = 0; struct sock *ctl_sk; + struct net *net; /* Never send a reset in response to a reset. */ if (th->rst) @@ -771,14 +767,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); - if (sk) + ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + transmit_time = tcp_transmit_time(sk); + } ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - &arg, arg.iov[0].iov_len); + &arg, arg.iov[0].iov_len, + transmit_time); ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); @@ -813,6 +812,7 @@ static void tcp_v4_send_ack(const struct sock *sk, struct net *net = sock_net(sk); struct ip_reply_arg arg; struct sock *ctl_sk; + u64 transmit_time; memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -863,14 +863,15 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); - if (sk) - ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - &arg, arg.iov[0].iov_len); + &arg, arg.iov[0].iov_len, + transmit_time); ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); @@ -2633,6 +2634,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; + net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c35731816e2..8bcaf2586b68 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; - + tcptw->tw_tx_delay = tp->tcp_tx_delay; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -283,6 +283,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); + tw->tw_txhash = sk->sk_txhash; tw->tw_ipv6only = sk->sk_ipv6only; } #endif diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 0fbf7d4df9da..e09147ac9a99 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * IPV4 GSO/GRO offload support * Linux INET implementation * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * TCPv4 GSO/GRO support */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f429e856e263..f016bb516dd6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); + tcp_add_tx_delay(skb, tp); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { @@ -1296,6 +1298,11 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (nsize < 0) nsize = 0; + if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); + return -ENOMEM; + } + if (skb_unclone(skb, gfp)) return -ENOMEM; @@ -1454,8 +1461,7 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - if (mss_now < 48) - mss_now = 48; + mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); return mss_now; } @@ -2234,6 +2240,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; + if (static_branch_unlikely(&tcp_tx_delay_enabled) && + tcp_sk(sk)->tcp_tx_delay) { + u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay; + + /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we + * approximate our needs assuming an ~100% skb->truesize overhead. + * USEC_PER_SEC is approximated by 2^20. + * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. + */ + extra_bytes >>= (20 - 1); + limit += extra_bytes; + } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, @@ -2747,7 +2765,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) if (next_skb_size <= skb_availroom(skb)) skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), next_skb_size); - else if (!skb_shift(skb, next_skb, next_skb_size)) + else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; } tcp_highest_sack_replace(sk, next_skb, skb); @@ -3212,6 +3230,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, int tcp_header_size; struct tcphdr *th; int mss; + u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { @@ -3243,13 +3262,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); + now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif { - skb->skb_mstamp_ns = tcp_clock_ns(); + skb->skb_mstamp_ns = now; if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3292,8 +3312,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif - /* Do not fool tcpdump (if any), clean our debris */ - skb->tstamp = 0; + skb->skb_mstamp_ns = now; + tcp_add_tx_delay(skb, tp); + return skb; } EXPORT_SYMBOL(tcp_make_synack); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5bad937ce779..c801cd37cc2a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -155,6 +155,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(net->ipv4.sysctl_tcp_base_mss, mss); mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len); + mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8fb250ed53d4..1b971bd95786 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -69,12 +70,6 @@ * a single port at the same time. * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support * James Chapman : Add L2TP encapsulation type. - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "UDP: " fmt @@ -130,17 +125,6 @@ EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) -/* IPCB reference means this can not be used from early demux */ -static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_udp_l3mdev_accept && - skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) - return true; -#endif - return false; -} - static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, @@ -369,7 +353,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, - int dif, int sdif, bool exact_dif) + int dif, int sdif) { int score; struct inet_sock *inet; @@ -425,7 +409,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, - int dif, int sdif, bool exact_dif, + int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { @@ -437,7 +421,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); + daddr, hnum, dif, sdif); if (score > badness) { if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, @@ -465,7 +449,6 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; - bool exact_dif = udp_lib_exact_dif_match(net, skb); hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -473,7 +456,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, - exact_dif, hslot2, skb); + hslot2, skb); if (!result) { hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; @@ -481,9 +464,9 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, - exact_dif, hslot2, skb); + hslot2, skb); } - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } @@ -503,7 +486,11 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { - return __udp4_lib_lookup_skb(skb, sport, dport, &udp_table); + const struct iphdr *iph = ip_hdr(skb); + + return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + inet_sdif(skb), &udp_table, NULL); } EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); @@ -538,8 +525,7 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport != rmt_port && inet->inet_dport) || (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || ipv6_only_sock(sk) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif)) + !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif)) return false; @@ -1779,6 +1765,10 @@ try_again: sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); + + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 5cbb9be05295..910555a4d9fe 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * udp_diag.c Module for monitoring UDP transport protocols sockets. * * Authors: Pavel Emelyanov, <xemul@parallels.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 065334b41d57..0112f64faf69 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * IPV4 GSO/GRO offload support * Linux INET implementation * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * UDPv4 GSO support */ @@ -212,7 +208,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, gso_skb->destructor = NULL; segs = skb_segment(gso_skb, features); - if (unlikely(IS_ERR_OR_NULL(segs))) { + if (IS_ERR_OR_NULL(segs)) { if (copy_dtor) gso_skb->destructor = sock_wfree; return segs; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 3c94b8f0ff27..5936d66d1ce2 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828). * @@ -5,10 +6,6 @@ * * Changes: * Fixes: - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "UDPLite: " fmt diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 9bb8905088c7..ecff3fce9807 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * xfrm4_output.c - Common IPsec encapsulation code for IPv4. * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/if_ether.h> diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index bcab48944c15..8a4285712808 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* xfrm4_protocol.c - Generic xfrm protocol multiplexer. * * Copyright (C) 2013 secunet Security Networks AG @@ -7,11 +8,6 @@ * * Based on: * net/ipv4/tunnel4.c - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/init.h> |

