summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorJames Morris <james.l.morris@oracle.com>2017-11-29 12:47:41 +1100
committerJames Morris <james.l.morris@oracle.com>2017-11-29 12:47:41 +1100
commitcf40a76e7d5874bb25f4404eecc58a2e033af885 (patch)
tree8fd81cbea03c87b3d41d7ae5b1d11eadd35d6ef5 /net/ipv4
parentab5348c9c23cd253f5902980d2d8fe067dc24c82 (diff)
parent4fbd8d194f06c8a3fd2af1ce560ddb31f7ec8323 (diff)
downloadtalos-op-linux-cf40a76e7d5874bb25f4404eecc58a2e033af885.tar.gz
talos-op-linux-cf40a76e7d5874bb25f4404eecc58a2e033af885.zip
Merge tag 'v4.15-rc1' into next-seccomp
Linux 4.15-rc1
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig8
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c35
-rw-r--r--net/ipv4/ah4.c2
-rw-r--r--net/ipv4/arp.c3
-rw-r--r--net/ipv4/cipso_ipv4.c36
-rw-r--r--net/ipv4/devinet.c51
-rw-r--r--net/ipv4/esp4.c86
-rw-r--r--net/ipv4/esp4_offload.c7
-rw-r--r--net/ipv4/fib_frontend.c57
-rw-r--r--net/ipv4/fib_lookup.h2
-rw-r--r--net/ipv4/fib_notifier.c102
-rw-r--r--net/ipv4/fib_rules.c44
-rw-r--r--net/ipv4/fib_semantics.c96
-rw-r--r--net/ipv4/fib_trie.c50
-rw-r--r--net/ipv4/fou.c1
-rw-r--r--net/ipv4/gre_offload.c16
-rw-r--r--net/ipv4/icmp.c24
-rw-r--r--net/ipv4/igmp.c42
-rw-r--r--net/ipv4/inet_connection_sock.c75
-rw-r--r--net/ipv4/inet_diag.c33
-rw-r--r--net/ipv4/inet_fragment.c10
-rw-r--r--net/ipv4/inet_hashtables.c32
-rw-r--r--net/ipv4/inet_timewait_sock.c10
-rw-r--r--net/ipv4/inetpeer.c434
-rw-r--r--net/ipv4/ip_forward.c1
-rw-r--r--net/ipv4/ip_fragment.c18
-rw-r--r--net/ipv4/ip_gre.c493
-rw-r--r--net/ipv4/ip_input.c25
-rw-r--r--net/ipv4/ip_options.c10
-rw-r--r--net/ipv4/ip_output.c90
-rw-r--r--net/ipv4/ip_sockglue.c22
-rw-r--r--net/ipv4/ip_tunnel.c18
-rw-r--r--net/ipv4/ip_vti.c50
-rw-r--r--net/ipv4/ipconfig.c1
-rw-r--r--net/ipv4/ipip.c66
-rw-r--r--net/ipv4/ipmr.c279
-rw-r--r--net/ipv4/netfilter/Makefile1
-rw-r--r--net/ipv4/netfilter/arp_tables.c35
-rw-r--r--net/ipv4/netfilter/ip_tables.c49
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c6
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c5
-rw-r--r--net/ipv4/netfilter/iptable_nat.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c42
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c43
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c2
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c57
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c20
-rw-r--r--net/ipv4/proc.c8
-rw-r--r--net/ipv4/raw.c18
-rw-r--r--net/ipv4/raw_diag.c4
-rw-r--r--net/ipv4/route.c100
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c524
-rw-r--r--net/ipv4/tcp.c454
-rw-r--r--net/ipv4/tcp_bic.c14
-rw-r--r--net/ipv4/tcp_cdg.c14
-rw-r--r--net/ipv4/tcp_cong.c97
-rw-r--r--net/ipv4/tcp_cubic.c13
-rw-r--r--net/ipv4/tcp_diag.c109
-rw-r--r--net/ipv4/tcp_fastopen.c163
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_htcp.c3
-rw-r--r--net/ipv4/tcp_illinois.c11
-rw-r--r--net/ipv4/tcp_input.c769
-rw-r--r--net/ipv4/tcp_ipv4.c184
-rw-r--r--net/ipv4/tcp_metrics.c23
-rw-r--r--net/ipv4/tcp_minisocks.c42
-rw-r--r--net/ipv4/tcp_nv.c60
-rw-r--r--net/ipv4/tcp_offload.c12
-rw-r--r--net/ipv4/tcp_output.c456
-rw-r--r--net/ipv4/tcp_probe.c5
-rw-r--r--net/ipv4/tcp_recovery.c105
-rw-r--r--net/ipv4/tcp_scalable.c16
-rw-r--r--net/ipv4/tcp_timer.c92
-rw-r--r--net/ipv4/tcp_ulp.c14
-rw-r--r--net/ipv4/tcp_vegas.c2
-rw-r--r--net/ipv4/tcp_vegas.h1
-rw-r--r--net/ipv4/tcp_veno.c11
-rw-r--r--net/ipv4/tcp_yeah.c11
-rw-r--r--net/ipv4/udp.c177
-rw-r--r--net/ipv4/udp_diag.c10
-rw-r--r--net/ipv4/udp_impl.h1
-rw-r--r--net/ipv4/udp_offload.c19
-rw-r--r--net/ipv4/udp_tunnel.c25
-rw-r--r--net/ipv4/xfrm4_input.c1
-rw-r--r--net/ipv4/xfrm4_policy.c26
-rw-r--r--net/ipv4/xfrm4_state.c1
93 files changed, 3493 insertions, 2732 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 91a2557942fa..f48fe6fc7e8c 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -70,11 +70,9 @@ config IP_MULTIPLE_TABLES
address into account. Furthermore, the TOS (Type-Of-Service) field
of the packet can be used for routing decisions as well.
- If you are interested in this, please see the preliminary
- documentation at <http://www.compendium.com.ar/policy-routing.txt>
- and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
- You will need supporting software from
- <ftp://ftp.tux.org/pub/net/ip-routing/>.
+ If you need more information, see the Linux Advanced
+ Routing and Traffic Control documentation at
+ <http://lartc.org/howto/lartc.rpdb.html>
If unsure, say N.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index afcb435adfbe..c6c8ad1d4b6d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux TCP/IP (INET) layer.
#
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 76c2077c3f5b..f00499a46927 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
unsigned char old_state;
- int err;
+ int err, tcp_fastopen;
lock_sock(sk);
@@ -217,11 +217,12 @@ int inet_listen(struct socket *sock, int backlog)
* because the socket was in TCP_LISTEN state previously but
* was shutdown() rather than close().
*/
- if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
- (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+ tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
+ if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
+ (tcp_fastopen & TFO_SERVER_ENABLE) &&
!inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
fastopen_queue_tune(sk, backlog);
- tcp_fastopen_init_key_once(true);
+ tcp_fastopen_init_key_once(sock_net(sk));
}
err = inet_csk_listen_start(sk, backlog);
@@ -826,6 +827,7 @@ int inet_shutdown(struct socket *sock, int how)
err = -ENOTCONN;
/* Hack to wake up other listeners, who can poll for
POLLHUP, even on eg. unconnected UDP sockets -- RR */
+ /* fall through */
default:
sk->sk_shutdown |= how;
if (sk->sk_prot->shutdown)
@@ -839,7 +841,7 @@ int inet_shutdown(struct socket *sock, int how)
case TCP_LISTEN:
if (!(how & RCV_SHUTDOWN))
break;
- /* Fall through */
+ /* fall through */
case TCP_SYN_SENT:
err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -944,6 +946,8 @@ const struct proto_ops inet_stream_ops = {
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
+ .sendmsg_locked = tcp_sendmsg_locked,
+ .sendpage_locked = tcp_sendpage_locked,
.peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
@@ -1602,6 +1606,9 @@ static const struct net_protocol igmp_protocol = {
};
#endif
+/* thinking of making this const? Don't.
+ * early_demux can change based on sysctl.
+ */
static struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
.early_demux_handler = tcp_v4_early_demux,
@@ -1612,6 +1619,9 @@ static struct net_protocol tcp_protocol = {
.icmp_strict_tag_validation = 1,
};
+/* thinking of making this const? Don't.
+ * early_demux can change based on sysctl.
+ */
static struct net_protocol udp_protocol = {
.early_demux = udp_v4_early_demux,
.early_demux_handler = udp_v4_early_demux,
@@ -1731,6 +1741,13 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
#endif
+ /* Some igmp sysctl, whose values are always used */
+ net->ipv4.sysctl_igmp_max_memberships = 20;
+ net->ipv4.sysctl_igmp_max_msf = 10;
+ /* IGMP reports for link-local multicast groups are enabled by default */
+ net->ipv4.sysctl_igmp_llm_reports = 1;
+ net->ipv4.sysctl_igmp_qrv = 2;
+
return 0;
}
@@ -1771,6 +1788,11 @@ static const struct net_offload ipip_offload = {
},
};
+static int __init ipip_offload_init(void)
+{
+ return inet_add_offload(&ipip_offload, IPPROTO_IPIP);
+}
+
static int __init ipv4_offload_init(void)
{
/*
@@ -1780,9 +1802,10 @@ static int __init ipv4_offload_init(void)
pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
if (tcpv4_offload_init() < 0)
pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+ if (ipip_offload_init() < 0)
+ pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);
dev_add_offload(&ip_packet_offload);
- inet_add_offload(&ipip_offload, IPPROTO_IPIP);
return 0;
}
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 37db44f60718..4dd95cdd8070 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -240,7 +240,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
if (err == -EINPROGRESS)
goto out;
- if (err == -EBUSY)
+ if (err == -ENOSPC)
err = NET_XMIT_DROP;
goto out_free;
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 8b52179ddc6e..a8d7c5a9fb05 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -171,7 +171,7 @@ struct neigh_table arp_tbl = {
[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
- [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
@@ -1180,6 +1180,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
case SIOCSARP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
+ /* fall through */
case SIOCGARP:
err = copy_from_user(&r, arg, sizeof(struct arpreq));
if (err)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index c4c6e1969ed0..82178cc69c96 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1523,9 +1523,17 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
int taglen;
for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
- if (optptr[0] == IPOPT_CIPSO)
+ switch (optptr[0]) {
+ case IPOPT_CIPSO:
return optptr;
- taglen = optptr[1];
+ case IPOPT_END:
+ return NULL;
+ case IPOPT_NOOP:
+ taglen = 1;
+ break;
+ default:
+ taglen = optptr[1];
+ }
optlen -= taglen;
optptr += taglen;
}
@@ -1943,7 +1951,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
buf = NULL;
req_inet = inet_rsk(req);
- opt = xchg(&req_inet->opt, opt);
+ opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt);
if (opt)
kfree_rcu(opt, rcu);
@@ -1965,11 +1973,13 @@ req_setattr_failure:
* values on failure.
*
*/
-static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
+static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr)
{
+ struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1);
int hdr_delta = 0;
- struct ip_options_rcu *opt = *opt_ptr;
+ if (!opt || opt->opt.cipso == 0)
+ return 0;
if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
u8 cipso_len;
u8 cipso_off;
@@ -2031,14 +2041,10 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
*/
void cipso_v4_sock_delattr(struct sock *sk)
{
- int hdr_delta;
- struct ip_options_rcu *opt;
struct inet_sock *sk_inet;
+ int hdr_delta;
sk_inet = inet_sk(sk);
- opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
- if (!opt || opt->opt.cipso == 0)
- return;
hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
if (sk_inet->is_icsk && hdr_delta > 0) {
@@ -2058,15 +2064,7 @@ void cipso_v4_sock_delattr(struct sock *sk)
*/
void cipso_v4_req_delattr(struct request_sock *req)
{
- struct ip_options_rcu *opt;
- struct inet_request_sock *req_inet;
-
- req_inet = inet_rsk(req);
- opt = req_inet->opt;
- if (!opt || opt->opt.cipso == 0)
- return;
-
- cipso_v4_delopt(&req_inet->opt);
+ cipso_v4_delopt(&inet_rsk(req)->ireq_opt);
}
/**
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 38d9af9b917c..a4573bccd6da 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
*/
struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
{
- u32 hash = inet_addr_hash(net, addr);
struct net_device *result = NULL;
struct in_ifaddr *ifa;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) {
- if (ifa->ifa_local == addr) {
- struct net_device *dev = ifa->ifa_dev->dev;
-
- if (!net_eq(dev_net(dev), net))
- continue;
- result = dev;
- break;
- }
- }
- if (!result) {
+ ifa = inet_lookup_ifaddr_rcu(net, addr);
+ if (!ifa) {
struct flowi4 fl4 = { .daddr = addr };
struct fib_result res = { 0 };
struct fib_table *local;
@@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
!fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
res.type == RTN_LOCAL)
result = FIB_RES_DEV(res);
+ } else {
+ result = ifa->ifa_dev->dev;
}
if (result && devref)
dev_hold(result);
@@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
}
EXPORT_SYMBOL(__ip_dev_find);
+/* called under RCU lock */
+struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr)
+{
+ u32 hash = inet_addr_hash(net, addr);
+ struct in_ifaddr *ifa;
+
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash)
+ if (ifa->ifa_local == addr &&
+ net_eq(dev_net(ifa->ifa_dev->dev), net))
+ return ifa;
+
+ return NULL;
+}
+
static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -438,7 +444,7 @@ static void check_lifetime(struct work_struct *work);
static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
- u32 portid)
+ u32 portid, struct netlink_ext_ack *extack)
{
struct in_device *in_dev = ifa->ifa_dev;
struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -483,6 +489,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
*/
ivi.ivi_addr = ifa->ifa_address;
ivi.ivi_dev = ifa->ifa_dev;
+ ivi.extack = extack;
ret = blocking_notifier_call_chain(&inetaddr_validator_chain,
NETDEV_UP, &ivi);
ret = notifier_to_errno(ret);
@@ -515,7 +522,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
static int inet_insert_ifa(struct in_ifaddr *ifa)
{
- return __inet_insert_ifa(ifa, NULL, 0);
+ return __inet_insert_ifa(ifa, NULL, 0, NULL);
}
static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
@@ -896,7 +903,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
return ret;
}
}
- return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
+ extack);
} else {
inet_free_ifa(ifa);
@@ -1516,6 +1524,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (inetdev_valid_mtu(dev->mtu))
break;
/* disable IP when MTU is not enough */
+ /* fall through */
case NETDEV_UNREGISTER:
inetdev_destroy(in_dev);
break;
@@ -1751,7 +1760,7 @@ static int inet_validate_link_af(const struct net_device *dev,
struct nlattr *a, *tb[IFLA_INET_MAX+1];
int err, rem;
- if (dev && !__in_dev_get_rtnl(dev))
+ if (dev && !__in_dev_get_rcu(dev))
return -EAFNOSUPPORT;
err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL);
@@ -1775,7 +1784,7 @@ static int inet_validate_link_af(const struct net_device *dev,
static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
{
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
struct nlattr *a, *tb[IFLA_INET_MAX+1];
int rem;
@@ -2491,9 +2500,9 @@ void __init devinet_init(void)
rtnl_af_register(&inet_af_ops);
- rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
- rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
- rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
+ rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0);
+ rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0);
+ rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0);
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
- inet_netconf_dump_devconf, NULL);
+ inet_netconf_dump_devconf, 0);
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 0cbee0a666ff..d57aa64fa7c7 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -258,7 +258,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
esp_output_udp_encap(x, skb, esp);
if (!skb_cloned(skb)) {
- if (tailen <= skb_availroom(skb)) {
+ if (tailen <= skb_tailroom(skb)) {
nfrags = 1;
trailer = skb;
tail = skb_tail_pointer(trailer);
@@ -292,8 +292,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
kunmap_atomic(vaddr);
- spin_unlock_bh(&x->lock);
-
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
@@ -301,6 +299,9 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
skb_shinfo(skb)->nr_frags = ++nfrags;
pfrag->offset = pfrag->offset + allocsize;
+
+ spin_unlock_bh(&x->lock);
+
nfrags++;
skb->len += tailen;
@@ -381,7 +382,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
(unsigned char *)esph - skb->data,
assoclen + ivlen + esp->clen + alen);
if (unlikely(err < 0))
- goto error;
+ goto error_free;
if (!esp->inplace) {
int allocsize;
@@ -392,7 +393,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
spin_lock_bh(&x->lock);
if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
spin_unlock_bh(&x->lock);
- goto error;
+ goto error_free;
}
skb_shinfo(skb)->nr_frags = 1;
@@ -409,7 +410,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
(unsigned char *)esph - skb->data,
assoclen + ivlen + esp->clen + alen);
if (unlikely(err < 0))
- goto error;
+ goto error_free;
}
if ((x->props.flags & XFRM_STATE_ESN))
@@ -431,7 +432,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
case -EINPROGRESS:
goto error;
- case -EBUSY:
+ case -ENOSPC:
err = NET_XMIT_DROP;
break;
@@ -442,8 +443,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
if (sg != dsg)
esp_ssg_unref(x, tmp);
- kfree(tmp);
+error_free:
+ kfree(tmp);
error:
return err;
}
@@ -499,18 +501,59 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
return esp_output_tail(x, skb, &esp);
}
+static inline int esp_remove_trailer(struct sk_buff *skb)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct crypto_aead *aead = x->data;
+ int alen, hlen, elen;
+ int padlen, trimlen;
+ __wsum csumdiff;
+ u8 nexthdr[2];
+ int ret;
+
+ alen = crypto_aead_authsize(aead);
+ hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ elen = skb->len - hlen;
+
+ if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
+ ret = xo->proto;
+ goto out;
+ }
+
+ if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
+ BUG();
+
+ ret = -EINVAL;
+ padlen = nexthdr[0];
+ if (padlen + 2 + alen >= elen) {
+ net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
+ padlen + 2, elen - alen);
+ goto out;
+ }
+
+ trimlen = alen + padlen + 2;
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
+ skb->csum = csum_block_sub(skb->csum, csumdiff,
+ skb->len - trimlen);
+ }
+ pskb_trim(skb, skb->len - trimlen);
+
+ ret = nexthdr[1];
+
+out:
+ return ret;
+}
+
int esp_input_done2(struct sk_buff *skb, int err)
{
const struct iphdr *iph;
struct xfrm_state *x = xfrm_input_state(skb);
struct xfrm_offload *xo = xfrm_offload(skb);
struct crypto_aead *aead = x->data;
- int alen = crypto_aead_authsize(aead);
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
- int elen = skb->len - hlen;
int ihl;
- u8 nexthdr[2];
- int padlen;
if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
kfree(ESP_SKB_CB(skb)->tmp);
@@ -518,16 +561,10 @@ int esp_input_done2(struct sk_buff *skb, int err)
if (unlikely(err))
goto out;
- if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
- BUG();
-
- err = -EINVAL;
- padlen = nexthdr[0];
- if (padlen + 2 + alen >= elen)
+ err = esp_remove_trailer(skb);
+ if (unlikely(err < 0))
goto out;
- /* ... check padding bits here. Silly. :-) */
-
iph = ip_hdr(skb);
ihl = iph->ihl * 4;
@@ -568,15 +605,12 @@ int esp_input_done2(struct sk_buff *skb, int err)
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
- pskb_trim(skb, skb->len - alen - padlen - 2);
- __skb_pull(skb, hlen);
+ skb_pull_rcsum(skb, hlen);
if (x->props.mode == XFRM_MODE_TUNNEL)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -ihl);
- err = nexthdr[1];
-
/* RFC4303: Drop dummy packets without any error */
if (err == IPPROTO_NONE)
err = -EINVAL;
@@ -695,8 +729,10 @@ skip_cow:
sg_init_table(sg, nfrags);
err = skb_to_sgvec(skb, sg, 0, skb->len);
- if (unlikely(err < 0))
+ if (unlikely(err < 0)) {
+ kfree(tmp);
goto out;
+ }
skb->ip_summed = CHECKSUM_NONE;
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index e0666016a764..f8b918c766b0 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -182,11 +182,13 @@ out:
static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
{
struct crypto_aead *aead = x->data;
+ struct xfrm_offload *xo = xfrm_offload(skb);
if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
return -EINVAL;
- skb->ip_summed = CHECKSUM_NONE;
+ if (!(xo->flags & CRYPTO_DONE))
+ skb->ip_summed = CHECKSUM_NONE;
return esp_input_done2(skb, 0);
}
@@ -257,7 +259,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
err = esp_output_tail(x, skb, &esp);
- if (err < 0)
+ if (err)
return err;
secpath_reset(skb);
@@ -303,3 +305,4 @@ module_init(esp4_offload_init);
module_exit(esp4_offload_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
+MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 044d2a159a3c..f52d27a422c3 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -73,6 +73,11 @@ fail:
fib_free_table(main_table);
return -ENOMEM;
}
+
+static bool fib4_has_custom_rules(struct net *net)
+{
+ return false;
+}
#else
struct fib_table *fib_new_table(struct net *net, u32 id)
@@ -128,6 +133,11 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
}
return NULL;
}
+
+static bool fib4_has_custom_rules(struct net *net)
+{
+ return net->ipv4.fib_has_custom_rules;
+}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
static void fib_replace_table(struct net *net, struct fib_table *old,
@@ -345,9 +355,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
if (res.type != RTN_UNICAST &&
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
goto e_inval;
- if (!rpf && !fib_num_tclassid_users(net) &&
- (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
- goto last_resort;
fib_combine_itag(itag, &res);
dev_match = false;
@@ -402,13 +409,28 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
struct in_device *idev, u32 *itag)
{
int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+ struct net *net = dev_net(dev);
- if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
- IN_DEV_ACCEPT_LOCAL(idev) &&
+ if (!r && !fib_num_tclassid_users(net) &&
(dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
+ if (IN_DEV_ACCEPT_LOCAL(idev))
+ goto ok;
+ /* with custom local routes in place, checking local addresses
+ * only will be too optimistic, with custom rules, checking
+ * local addresses only can be too strict, e.g. due to vrf
+ */
+ if (net->ipv4.fib_has_custom_local_routes ||
+ fib4_has_custom_rules(net))
+ goto full_check;
+ if (inet_lookup_ifaddr_rcu(net, src))
+ return -EINVAL;
+
+ok:
*itag = 0;
return 0;
}
+
+full_check:
return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
}
@@ -759,6 +781,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
}
err = fib_table_insert(net, tb, &cfg, extack);
+ if (!err && cfg.fc_type == RTN_LOCAL)
+ net->ipv4.fib_has_custom_local_routes = true;
errout:
return err;
}
@@ -1247,22 +1271,28 @@ static int __net_init ip_fib_net_init(struct net *net)
int err;
size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
- net->ipv4.fib_seq = 0;
+ err = fib4_notifier_init(net);
+ if (err)
+ return err;
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
- if (!net->ipv4.fib_table_hash)
- return -ENOMEM;
+ if (!net->ipv4.fib_table_hash) {
+ err = -ENOMEM;
+ goto err_table_hash_alloc;
+ }
err = fib4_rules_init(net);
if (err < 0)
- goto fail;
+ goto err_rules_init;
return 0;
-fail:
+err_rules_init:
kfree(net->ipv4.fib_table_hash);
+err_table_hash_alloc:
+ fib4_notifier_exit(net);
return err;
}
@@ -1292,6 +1322,7 @@ static void ip_fib_net_exit(struct net *net)
#endif
rtnl_unlock();
kfree(net->ipv4.fib_table_hash);
+ fib4_notifier_exit(net);
}
static int __net_init fib_net_init(struct net *net)
@@ -1341,7 +1372,7 @@ void __init ip_fib_init(void)
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
- rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
- rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
- rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
+ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
+ rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
+ rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 769ab87ebc4b..e6ff282bb7f4 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FIB_LOOKUP_H
#define _FIB_LOOKUP_H
@@ -32,6 +33,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
struct netlink_ext_ack *extack);
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack);
+bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
unsigned int);
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
index e0714d975947..b804ccbdb241 100644
--- a/net/ipv4/fib_notifier.c
+++ b/net/ipv4/fib_notifier.c
@@ -1,86 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
-#include <linux/rcupdate.h>
+#include <linux/socket.h>
#include <linux/kernel.h>
+#include <linux/export.h>
#include <net/net_namespace.h>
+#include <net/fib_notifier.h>
#include <net/netns/ipv4.h>
#include <net/ip_fib.h>
-static ATOMIC_NOTIFIER_HEAD(fib_chain);
-
-int call_fib_notifier(struct notifier_block *nb, struct net *net,
- enum fib_event_type event_type,
- struct fib_notifier_info *info)
+int call_fib4_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
{
- info->net = net;
- return nb->notifier_call(nb, event_type, info);
+ info->family = AF_INET;
+ return call_fib_notifier(nb, net, event_type, info);
}
-int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
- struct fib_notifier_info *info)
+int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
{
+ ASSERT_RTNL();
+
+ info->family = AF_INET;
net->ipv4.fib_seq++;
- info->net = net;
- return atomic_notifier_call_chain(&fib_chain, event_type, info);
+ return call_fib_notifiers(net, event_type, info);
}
-static unsigned int fib_seq_sum(void)
+static unsigned int fib4_seq_read(struct net *net)
{
- unsigned int fib_seq = 0;
- struct net *net;
-
- rtnl_lock();
- for_each_net(net)
- fib_seq += net->ipv4.fib_seq;
- rtnl_unlock();
+ ASSERT_RTNL();
- return fib_seq;
+ return net->ipv4.fib_seq + fib4_rules_seq_read(net);
}
-static bool fib_dump_is_consistent(struct notifier_block *nb,
- void (*cb)(struct notifier_block *nb),
- unsigned int fib_seq)
+static int fib4_dump(struct net *net, struct notifier_block *nb)
{
- atomic_notifier_chain_register(&fib_chain, nb);
- if (fib_seq == fib_seq_sum())
- return true;
- atomic_notifier_chain_unregister(&fib_chain, nb);
- if (cb)
- cb(nb);
- return false;
+ int err;
+
+ err = fib4_rules_dump(net, nb);
+ if (err)
+ return err;
+
+ fib_notify(net, nb);
+
+ return 0;
}
-#define FIB_DUMP_MAX_RETRIES 5
-int register_fib_notifier(struct notifier_block *nb,
- void (*cb)(struct notifier_block *nb))
-{
- int retries = 0;
+static const struct fib_notifier_ops fib4_notifier_ops_template = {
+ .family = AF_INET,
+ .fib_seq_read = fib4_seq_read,
+ .fib_dump = fib4_dump,
+ .owner = THIS_MODULE,
+};
- do {
- unsigned int fib_seq = fib_seq_sum();
- struct net *net;
+int __net_init fib4_notifier_init(struct net *net)
+{
+ struct fib_notifier_ops *ops;
- /* Mutex semantics guarantee that every change done to
- * FIB tries before we read the change sequence counter
- * is now visible to us.
- */
- rcu_read_lock();
- for_each_net_rcu(net) {
- fib_rules_notify(net, nb);
- fib_notify(net, nb);
- }
- rcu_read_unlock();
+ net->ipv4.fib_seq = 0;
- if (fib_dump_is_consistent(nb, cb, fib_seq))
- return 0;
- } while (++retries < FIB_DUMP_MAX_RETRIES);
+ ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ net->ipv4.notifier_ops = ops;
- return -EBUSY;
+ return 0;
}
-EXPORT_SYMBOL(register_fib_notifier);
-int unregister_fib_notifier(struct notifier_block *nb)
+void __net_exit fib4_notifier_exit(struct net *net)
{
- return atomic_notifier_chain_unregister(&fib_chain, nb);
+ fib_notifier_ops_unregister(net->ipv4.notifier_ops);
}
-EXPORT_SYMBOL(unregister_fib_notifier);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 778ecf977eb2..35d646a62ad4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -68,6 +68,16 @@ bool fib4_rule_default(const struct fib_rule *rule)
}
EXPORT_SYMBOL_GPL(fib4_rule_default);
+int fib4_rules_dump(struct net *net, struct notifier_block *nb)
+{
+ return fib_rules_dump(net, nb, AF_INET);
+}
+
+unsigned int fib4_rules_seq_read(struct net *net)
+{
+ return fib_rules_seq_read(net, AF_INET);
+}
+
int __fib_lookup(struct net *net, struct flowi4 *flp,
struct fib_result *res, unsigned int flags)
{
@@ -185,38 +195,6 @@ static struct fib_table *fib_empty_table(struct net *net)
return NULL;
}
-static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net,
- enum fib_event_type event_type,
- struct fib_rule *rule)
-{
- struct fib_rule_notifier_info info = {
- .rule = rule,
- };
-
- return call_fib_notifier(nb, net, event_type, &info.info);
-}
-
-static int call_fib_rule_notifiers(struct net *net,
- enum fib_event_type event_type,
- struct fib_rule *rule)
-{
- struct fib_rule_notifier_info info = {
- .rule = rule,
- };
-
- return call_fib_notifiers(net, event_type, &info.info);
-}
-
-/* Called with rcu_read_lock() */
-void fib_rules_notify(struct net *net, struct notifier_block *nb)
-{
- struct fib_rules_ops *ops = net->ipv4.rules_ops;
- struct fib_rule *rule;
-
- list_for_each_entry_rcu(rule, &ops->rules_list, list)
- call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule);
-}
-
static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
FRA_GENERIC_POLICY,
[FRA_FLOW] = { .type = NLA_U32 },
@@ -273,7 +251,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule);
err = 0;
errout:
@@ -295,7 +272,6 @@ static int fib4_rule_delete(struct fib_rule *rule)
net->ipv4.fib_num_tclassid_users--;
#endif
net->ipv4.fib_has_custom_rules = true;
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule);
errout:
return err;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 222100103808..f04d944f8abe 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -44,6 +44,7 @@
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
+#include <net/fib_notifier.h>
#include "fib_lookup.h"
@@ -219,7 +220,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
} endfor_nexthops(fi);
m = fi->fib_metrics;
- if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
+ if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
kfree(m);
kfree(fi);
}
@@ -600,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi)
atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
} endfor_nexthops(fi);
}
-
-static inline void fib_add_weight(struct fib_info *fi,
- const struct fib_nh *nh)
-{
- fi->fib_weight += nh->nh_weight;
-}
-
#else /* CONFIG_IP_ROUTE_MULTIPATH */
#define fib_rebalance(fi) do { } while (0)
-#define fib_add_weight(fi, nh) do { } while (0)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -695,6 +688,40 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
return 0;
}
+bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
+{
+ struct nlattr *nla;
+ int remaining;
+
+ if (!cfg->fc_mx)
+ return true;
+
+ nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+ int type = nla_type(nla);
+ u32 val;
+
+ if (!type)
+ continue;
+ if (type > RTAX_MAX)
+ return false;
+
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+ bool ecn_ca = false;
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
+ } else {
+ val = nla_get_u32(nla);
+ }
+
+ if (fi->fib_metrics->metrics[type - 1] != val)
+ return false;
+ }
+
+ return true;
+}
+
/*
* Picture
@@ -739,8 +766,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
* |
* |-> {local prefix} (terminal node)
*/
-static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
- struct fib_nh *nh, struct netlink_ext_ack *extack)
+static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
+ struct netlink_ext_ack *extack)
{
int err = 0;
struct net *net;
@@ -1003,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC)
return -EINVAL;
} else {
@@ -1083,15 +1110,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
if (!fi)
goto failure;
- fib_info_cnt++;
if (cfg->fc_mx) {
fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
- if (!fi->fib_metrics)
- goto failure;
- atomic_set(&fi->fib_metrics->refcnt, 1);
- } else
+ if (unlikely(!fi->fib_metrics)) {
+ kfree(fi);
+ return ERR_PTR(err);
+ }
+ refcount_set(&fi->fib_metrics->refcnt, 1);
+ } else {
fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
-
+ }
+ fib_info_cnt++;
fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
fi->fib_scope = cfg->fc_scope;
@@ -1221,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
int linkdown = 0;
change_nexthops(fi) {
- err = fib_check_nh(cfg, fi, nexthop_nh, extack);
+ err = fib_check_nh(cfg, nexthop_nh, extack);
if (err != 0)
goto failure;
if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
@@ -1238,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
- fib_add_weight(fi, nexthop_nh);
} endfor_nexthops(fi)
fib_rebalance(fi);
@@ -1328,8 +1356,6 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
goto nla_put_failure;
if (fi->fib_nhs == 1) {
- struct in_device *in_dev;
-
if (fi->fib_nh->nh_gw &&
nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
goto nla_put_failure;
@@ -1337,11 +1363,17 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
goto nla_put_failure;
if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
- in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev);
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev);
if (in_dev &&
IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
rtm->rtm_flags |= RTNH_F_DEAD;
+ rcu_read_unlock();
}
+ if (fi->fib_nh->nh_flags & RTNH_F_OFFLOAD)
+ rtm->rtm_flags |= RTNH_F_OFFLOAD;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (fi->fib_nh[0].nh_tclassid &&
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
@@ -1361,18 +1393,20 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
for_nexthops(fi) {
- struct in_device *in_dev;
-
rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
if (!rtnh)
goto nla_put_failure;
rtnh->rtnh_flags = nh->nh_flags & 0xFF;
if (nh->nh_flags & RTNH_F_LINKDOWN) {
- in_dev = __in_dev_get_rtnl(nh->nh_dev);
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(nh->nh_dev);
if (in_dev &&
IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
rtnh->rtnh_flags |= RTNH_F_DEAD;
+ rcu_read_unlock();
}
rtnh->rtnh_hops = nh->nh_weight - 1;
rtnh->rtnh_ifindex = nh->nh_oif;
@@ -1449,14 +1483,14 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
fib_nh->nh_flags & RTNH_F_LINKDOWN)
break;
- return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
- &info.info);
+ return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type,
+ &info.info);
case FIB_EVENT_NH_DEL:
- if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
(fib_nh->nh_flags & RTNH_F_DEAD))
- return call_fib_notifiers(dev_net(fib_nh->nh_dev),
- event_type, &info.info);
+ return call_fib4_notifiers(dev_net(fib_nh->nh_dev),
+ event_type, &info.info);
default:
break;
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 64668c69dda6..5ddc4aefff12 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -81,39 +81,40 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/fib_notifier.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"
static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type, u32 dst,
- int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id)
+ int dst_len, struct fib_alias *fa)
{
struct fib_entry_notifier_info info = {
.dst = dst,
.dst_len = dst_len,
- .fi = fi,
- .tos = tos,
- .type = type,
- .tb_id = tb_id,
+ .fi = fa->fa_info,
+ .tos = fa->fa_tos,
+ .type = fa->fa_type,
+ .tb_id = fa->tb_id,
};
- return call_fib_notifier(nb, net, event_type, &info.info);
+ return call_fib4_notifier(nb, net, event_type, &info.info);
}
static int call_fib_entry_notifiers(struct net *net,
enum fib_event_type event_type, u32 dst,
- int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id)
+ int dst_len, struct fib_alias *fa,
+ struct netlink_ext_ack *extack)
{
struct fib_entry_notifier_info info = {
+ .info.extack = extack,
.dst = dst,
.dst_len = dst_len,
- .fi = fi,
- .tos = tos,
- .type = type,
- .tb_id = tb_id,
+ .fi = fa->fa_info,
+ .tos = fa->fa_tos,
+ .type = fa->fa_type,
+ .tb_id = fa->tb_id,
};
- return call_fib_notifiers(net, event_type, &info.info);
+ return call_fib4_notifiers(net, event_type, &info.info);
}
#define MAX_STAT_DEPTH 32
@@ -1215,9 +1216,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_default = -1;
call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
- key, plen, fi,
- new_fa->fa_tos, cfg->fc_type,
- tb->tb_id);
+ key, plen, new_fa, extack);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, nlflags);
@@ -1272,8 +1271,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type,
- tb->tb_id);
+ call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
@@ -1562,7 +1560,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi, extack) == 0) {
+ fib_nh_match(cfg, fi, extack) == 0 &&
+ fib_metrics_match(cfg, fi)) {
fa_to_delete = fa;
break;
}
@@ -1572,8 +1571,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
return -ESRCH;
call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
- fa_to_delete->fa_info, tos,
- fa_to_delete->fa_type, tb->tb_id);
+ fa_to_delete, extack);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1890,9 +1888,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
n->key,
- KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
+ KEYLENGTH - fa->fa_slen, fa,
+ NULL);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -1930,8 +1927,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
continue;
call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key,
- KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
- fa->fa_type, fa->tb_id);
+ KEYLENGTH - fa->fa_slen, fa);
}
}
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 8e0257d01200..1540db65241a 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -450,6 +450,7 @@ out_unlock:
out:
NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_remcsum_cleanup(skb, &grc);
+ skb->remcsum_offload = 0;
return pp;
}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index d5cac99170b1..1859c473b21a 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
- bool need_csum, ufo, gso_partial;
+ bool need_csum, gso_partial;
if (!skb->encapsulation)
goto out;
@@ -47,20 +47,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
skb->encap_hdr_csum = need_csum;
- ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
-
features &= skb->dev->hw_enc_features;
- /* The only checksum offload we care about from here on out is the
- * outer one so strip the existing checksum feature flags based
- * on the fact that we will be computing our checksum in software.
- */
- if (ufo) {
- features &= ~NETIF_F_CSUM_MASK;
- if (!need_csum)
- features |= NETIF_F_HW_CSUM;
- }
-
/* segment inner packet. */
segs = skb_mac_gso_segment(skb, features);
if (IS_ERR_OR_NULL(segs)) {
@@ -98,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
greh = (struct gre_base_hdr *)skb_transport_header(skb);
pcsum = (__sum16 *)(greh + 1);
- if (gso_partial) {
+ if (gso_partial && skb_is_gso(skb)) {
unsigned int partial_adj;
/* Adjust checksum to account for the fact that
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c2be26b98b5f..1617604c9284 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -412,7 +412,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
int type = icmp_param->data.icmph.type;
int code = icmp_param->data.icmph.code;
- if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
+ if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
return;
/* Needed by both icmp_global_allow and icmp_xmit_lock */
@@ -694,7 +694,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
+ if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -782,7 +782,7 @@ static bool icmp_tag_validation(int proto)
}
/*
- * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and
* ICMP_PARAMETERPROB.
*/
@@ -810,7 +810,8 @@ static bool icmp_unreach(struct sk_buff *skb)
if (iph->ihl < 5) /* Mangled header, drop. */
goto out_err;
- if (icmph->type == ICMP_DEST_UNREACH) {
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
switch (icmph->code & 15) {
case ICMP_NET_UNREACH:
case ICMP_HOST_UNREACH:
@@ -846,8 +847,16 @@ static bool icmp_unreach(struct sk_buff *skb)
}
if (icmph->code > NR_ICMP_UNREACH)
goto out;
- } else if (icmph->type == ICMP_PARAMETERPROB)
+ break;
+ case ICMP_PARAMETERPROB:
info = ntohl(icmph->un.gateway) >> 24;
+ break;
+ case ICMP_TIME_EXCEEDED:
+ __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS);
+ if (icmph->code == ICMP_EXC_FRAGTIME)
+ goto out;
+ break;
+ }
/*
* Throw it at our lower layers
@@ -959,8 +968,9 @@ static bool icmp_timestamp(struct sk_buff *skb)
*/
icmp_param.data.times[1] = inet_current_timestamp();
icmp_param.data.times[2] = icmp_param.data.times[1];
- if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
- BUG();
+
+ BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4));
+
icmp_param.data.icmph = *icmp_hdr(skb);
icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
icmp_param.data.icmph.code = 0;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 28f14afd0dd3..d1f8f302dbf3 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -752,18 +752,18 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
return ip_local_out(net, skb->sk, skb);
}
-static void igmp_gq_timer_expire(unsigned long data)
+static void igmp_gq_timer_expire(struct timer_list *t)
{
- struct in_device *in_dev = (struct in_device *)data;
+ struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer);
in_dev->mr_gq_running = 0;
igmpv3_send_report(in_dev, NULL);
in_dev_put(in_dev);
}
-static void igmp_ifc_timer_expire(unsigned long data)
+static void igmp_ifc_timer_expire(struct timer_list *t)
{
- struct in_device *in_dev = (struct in_device *)data;
+ struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer);
igmpv3_send_cr(in_dev);
if (in_dev->mr_ifc_count) {
@@ -784,9 +784,9 @@ static void igmp_ifc_event(struct in_device *in_dev)
}
-static void igmp_timer_expire(unsigned long data)
+static void igmp_timer_expire(struct timer_list *t)
{
- struct ip_mc_list *im = (struct ip_mc_list *)data;
+ struct ip_mc_list *im = from_timer(im, t, timer);
struct in_device *in_dev = im->interface;
spin_lock(&im->lock);
@@ -1007,10 +1007,18 @@ int igmp_rcv(struct sk_buff *skb)
{
/* This basically follows the spec line by line -- see RFC1112 */
struct igmphdr *ih;
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
int len = skb->len;
bool dropped = true;
+ if (netif_is_l3_master(dev)) {
+ dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
+ if (!dev)
+ goto drop;
+ }
+
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto drop;
@@ -1377,7 +1385,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
refcount_set(&im->refcnt, 1);
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
- setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
+ timer_setup(&im->timer, igmp_timer_expire, 0);
im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
#endif
@@ -1687,10 +1695,8 @@ void ip_mc_init_dev(struct in_device *in_dev)
ASSERT_RTNL();
#ifdef CONFIG_IP_MULTICAST
- setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
- (unsigned long)in_dev);
- setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
- (unsigned long)in_dev);
+ timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
+ timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
#endif
@@ -2549,7 +2555,8 @@ done:
/*
* check if a multicast source filter allows delivery for a given <src,dst,intf>
*/
-int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
+int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
+ int dif, int sdif)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *pmc;
@@ -2564,7 +2571,8 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
rcu_read_lock();
for_each_pmc_rcu(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
- pmc->multi.imr_ifindex == dif)
+ (pmc->multi.imr_ifindex == dif ||
+ (sdif && pmc->multi.imr_ifindex == sdif)))
break;
}
ret = inet->mc_all;
@@ -2974,12 +2982,6 @@ static int __net_init igmp_net_init(struct net *net)
goto out_sock;
}
- /* Sysctl initialization */
- net->ipv4.sysctl_igmp_max_memberships = 20;
- net->ipv4.sysctl_igmp_max_msf = 10;
- /* IGMP reports for link-local multicast groups are enabled by default */
- net->ipv4.sysctl_igmp_llm_reports = 1;
- net->ipv4.sysctl_igmp_qrv = 2;
return 0;
out_sock:
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4089c013cb03..4ca46dc08e63 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -39,11 +39,11 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
* IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
* and 0.0.0.0 equals to 0.0.0.0 only
*/
-static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
- const struct in6_addr *sk2_rcv_saddr6,
- __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
- bool sk1_ipv6only, bool sk2_ipv6only,
- bool match_wildcard)
+static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+ const struct in6_addr *sk2_rcv_saddr6,
+ __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk1_ipv6only, bool sk2_ipv6only,
+ bool match_wildcard)
{
int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
@@ -52,29 +52,29 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
if (!sk2_ipv6only) {
if (sk1_rcv_saddr == sk2_rcv_saddr)
- return 1;
+ return true;
if (!sk1_rcv_saddr || !sk2_rcv_saddr)
return match_wildcard;
}
- return 0;
+ return false;
}
if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
- return 1;
+ return true;
if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
!(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
- return 1;
+ return true;
if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
!(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
- return 1;
+ return true;
if (sk2_rcv_saddr6 &&
ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
- return 1;
+ return true;
- return 0;
+ return false;
}
#endif
@@ -82,20 +82,20 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
* match_wildcard == false: addresses must be exactly the same, i.e.
* 0.0.0.0 only equals to 0.0.0.0
*/
-static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
- bool sk2_ipv6only, bool match_wildcard)
+static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk2_ipv6only, bool match_wildcard)
{
if (!sk2_ipv6only) {
if (sk1_rcv_saddr == sk2_rcv_saddr)
- return 1;
+ return true;
if (!sk1_rcv_saddr || !sk2_rcv_saddr)
return match_wildcard;
}
- return 0;
+ return false;
}
-int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
- bool match_wildcard)
+bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+ bool match_wildcard)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
@@ -266,7 +266,7 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
#if IS_ENABLED(CONFIG_IPV6)
if (tb->fast_sk_family == AF_INET6)
return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
- &sk->sk_v6_rcv_saddr,
+ inet6_rcv_saddr(sk),
tb->fast_rcv_saddr,
sk->sk_rcv_saddr,
tb->fast_ipv6_only,
@@ -321,13 +321,14 @@ tb_found:
goto fail_unlock;
}
success:
- if (!hlist_empty(&tb->owners)) {
+ if (hlist_empty(&tb->owners)) {
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = FASTREUSEPORT_ANY;
tb->fastuid = uid;
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
+ tb->fast_sk_family = sk->sk_family;
#if IS_ENABLED(CONFIG_IPV6)
tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
#endif
@@ -354,6 +355,7 @@ success:
tb->fastuid = uid;
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
+ tb->fast_sk_family = sk->sk_family;
#if IS_ENABLED(CONFIG_IPV6)
tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
#endif
@@ -473,6 +475,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
}
spin_unlock_bh(&queue->fastopenq.lock);
}
+ mem_cgroup_sk_alloc(newsk);
out:
release_sock(sk);
if (req)
@@ -492,17 +495,15 @@ EXPORT_SYMBOL(inet_csk_accept);
* to optimize.
*/
void inet_csk_init_xmit_timers(struct sock *sk,
- void (*retransmit_handler)(unsigned long),
- void (*delack_handler)(unsigned long),
- void (*keepalive_handler)(unsigned long))
+ void (*retransmit_handler)(struct timer_list *t),
+ void (*delack_handler)(struct timer_list *t),
+ void (*keepalive_handler)(struct timer_list *t))
{
struct inet_connection_sock *icsk = inet_csk(sk);
- setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
- (unsigned long)sk);
- setup_timer(&icsk->icsk_delack_timer, delack_handler,
- (unsigned long)sk);
- setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
+ timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
+ timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
+ timer_setup(&sk->sk_timer, keepalive_handler, 0);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
EXPORT_SYMBOL(inet_csk_init_xmit_timers);
@@ -537,9 +538,11 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct net *net = read_pnet(&ireq->ireq_net);
- struct ip_options_rcu *opt = ireq->opt;
+ struct ip_options_rcu *opt;
struct rtable *rt;
+ opt = ireq_opt_deref(ireq);
+
flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol, inet_sk_flowi_flags(sk),
@@ -573,10 +576,9 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct flowi4 *fl4;
struct rtable *rt;
+ opt = rcu_dereference(ireq->ireq_opt);
fl4 = &newinet->cork.fl.u.ip4;
- rcu_read_lock();
- opt = rcu_dereference(newinet->inet_opt);
flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol, inet_sk_flowi_flags(sk),
@@ -589,13 +591,11 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
goto no_route;
if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
- rcu_read_unlock();
return &rt->dst;
route_err:
ip_rt_put(rt);
no_route:
- rcu_read_unlock();
__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
@@ -674,9 +674,9 @@ void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
-static void reqsk_timer_handler(unsigned long data)
+static void reqsk_timer_handler(struct timer_list *t)
{
- struct request_sock *req = (struct request_sock *)data;
+ struct request_sock *req = from_timer(req, t, rsk_timer);
struct sock *sk_listener = req->rsk_listener;
struct net *net = sock_net(sk_listener);
struct inet_connection_sock *icsk = inet_csk(sk_listener);
@@ -747,8 +747,7 @@ static void reqsk_queue_hash_req(struct request_sock *req,
req->num_timeout = 0;
req->sk = NULL;
- setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler,
- (unsigned long)req);
+ timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
mod_timer(&req->rsk_timer, jiffies + timeout);
inet_ehash_insert(req_to_sk(req), NULL);
@@ -916,7 +915,6 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
tcp_sk(child)->fastopen_rsk = NULL;
}
inet_csk_destroy_sock(child);
- reqsk_put(req);
}
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
@@ -987,6 +985,7 @@ void inet_csk_listen_stop(struct sock *sk)
sock_hold(child);
inet_child_forget(sk, req, child);
+ reqsk_put(req);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 3828b3a805cd..c9c35b61a027 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -93,8 +93,17 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
}
EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
-static size_t inet_sk_attr_size(void)
+static size_t inet_sk_attr_size(struct sock *sk,
+ const struct inet_diag_req_v2 *req,
+ bool net_admin)
{
+ const struct inet_diag_handler *handler;
+ size_t aux = 0;
+
+ handler = inet_diag_table[req->sdiag_protocol];
+ if (handler && handler->idiag_get_aux_size)
+ aux = handler->idiag_get_aux_size(sk, net_admin);
+
return nla_total_size(sizeof(struct tcp_info))
+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ nla_total_size(1) /* INET_DIAG_TOS */
@@ -105,6 +114,7 @@ static size_t inet_sk_attr_size(void)
+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+ nla_total_size(TCP_CA_NAME_MAX)
+ nla_total_size(sizeof(struct tcpvegas_info))
+ + aux
+ 64;
}
@@ -260,6 +270,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
handler->idiag_get_info(sk, r, info);
+ if (ext & (1 << (INET_DIAG_INFO - 1)) && handler->idiag_get_aux)
+ if (handler->idiag_get_aux(sk, net_admin, skb) < 0)
+ goto errout;
+
if (sk->sk_state < TCP_TIME_WAIT) {
union tcp_cc_info info;
size_t sz = 0;
@@ -274,6 +288,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto errout;
}
+ if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) {
+ u32 classid = 0;
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+#endif
+
+ if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
+ goto errout;
+ }
+
out:
nlmsg_end(skb, nlh);
return 0;
@@ -438,6 +463,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
const struct nlmsghdr *nlh,
const struct inet_diag_req_v2 *req)
{
+ bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
struct net *net = sock_net(in_skb->sk);
struct sk_buff *rep;
struct sock *sk;
@@ -447,7 +473,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
if (IS_ERR(sk))
return PTR_ERR(sk);
- rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL);
+ rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL);
if (!rep) {
err = -ENOMEM;
goto out;
@@ -456,8 +482,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
err = sk_diag_fill(sk, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh,
- netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ nlh->nlmsg_seq, 0, nlh, net_admin);
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 96e95e83cc61..26a3d0315728 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -147,7 +147,7 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
spin_unlock(&hb->chain_lock);
hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
- f->frag_expire((unsigned long) fq);
+ f->frag_expire(&fq->timer);
return evicted;
}
@@ -164,7 +164,7 @@ static void inet_frag_worker(struct work_struct *work)
local_bh_disable();
- for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+ for (i = READ_ONCE(f->next_bucket); budget; --budget) {
evicted += inet_evict_bucket(f, &f->hash[i]);
i = (i + 1) & (INETFRAGS_HASHSZ - 1);
if (evicted > INETFRAGS_EVICT_MAX)
@@ -234,10 +234,8 @@ evict_again:
cond_resched();
if (read_seqretry(&f->rnd_seqlock, seq) ||
- percpu_counter_sum(&nf->mem))
+ sum_frag_mem_limit(nf))
goto evict_again;
-
- percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);
@@ -368,7 +366,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
f->constructor(q, arg);
add_frag_mem_limit(nf, f->qsize);
- setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
+ timer_setup(&q->timer, f->frag_expire, 0);
spin_lock_init(&q->lock);
refcount_set(&q->refcnt, 1);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2e3389d614d1..e7d15fb0d94d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -170,7 +170,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port);
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
- const int dif, bool exact_dif)
+ const int dif, const int sdif, bool exact_dif)
{
int score = -1;
struct inet_sock *inet = inet_sk(sk);
@@ -185,9 +185,13 @@ static inline int compute_score(struct sock *sk, struct net *net,
score += 4;
}
if (sk->sk_bound_dev_if || exact_dif) {
- if (sk->sk_bound_dev_if != dif)
+ bool dev_match = (sk->sk_bound_dev_if == dif ||
+ sk->sk_bound_dev_if == sdif);
+
+ if (exact_dif && !dev_match)
return -1;
- score += 4;
+ if (sk->sk_bound_dev_if && dev_match)
+ score += 4;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
@@ -208,7 +212,7 @@ struct sock *__inet_lookup_listener(struct net *net,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
- const int dif)
+ const int dif, const int sdif)
{
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
@@ -218,7 +222,8 @@ struct sock *__inet_lookup_listener(struct net *net,
u32 phash = 0;
sk_for_each_rcu(sk, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
+ score = compute_score(sk, net, hnum, daddr,
+ dif, sdif, exact_dif);
if (score > hiscore) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -268,7 +273,7 @@ struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
- const int dif)
+ const int dif, const int sdif)
{
INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
@@ -286,11 +291,12 @@ begin:
if (sk->sk_hash != hash)
continue;
if (likely(INET_MATCH(sk, net, acookie,
- saddr, daddr, ports, dif))) {
+ saddr, daddr, ports, dif, sdif))) {
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
goto out;
if (unlikely(!INET_MATCH(sk, net, acookie,
- saddr, daddr, ports, dif))) {
+ saddr, daddr, ports,
+ dif, sdif))) {
sock_gen_put(sk);
goto begin;
}
@@ -321,9 +327,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
__be32 daddr = inet->inet_rcv_saddr;
__be32 saddr = inet->inet_daddr;
int dif = sk->sk_bound_dev_if;
+ struct net *net = sock_net(sk);
+ int sdif = l3mdev_master_ifindex_by_index(net, dif);
INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
- struct net *net = sock_net(sk);
unsigned int hash = inet_ehashfn(net, daddr, lport,
saddr, inet->inet_dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
@@ -339,7 +346,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
continue;
if (likely(INET_MATCH(sk2, net, acookie,
- saddr, daddr, ports, dif))) {
+ saddr, daddr, ports, dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
if (twsk_unique(sk, sk2, twp))
@@ -449,10 +456,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
return reuseport_add_sock(sk, sk2);
}
- /* Initial allocation may have already happened via setsockopt */
- if (!rcu_access_pointer(sk->sk_reuseport_cb))
- return reuseport_alloc(sk);
- return 0;
+ return reuseport_alloc(sk);
}
int __inet_hash(struct sock *sk, struct sock *osk)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 5b039159e67a..c690cd0d9b3f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -9,7 +9,6 @@
*/
#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/inet_hashtables.h>
@@ -142,9 +141,9 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
-static void tw_timer_handler(unsigned long data)
+static void tw_timer_handler(struct timer_list *t)
{
- struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
+ struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
if (tw->tw_kill)
__NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
@@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
if (tw) {
const struct inet_sock *inet = inet_sk(sk);
- kmemcheck_annotate_bitfield(tw, flags);
-
tw->tw_dr = dr;
/* Give us an identity. */
tw->tw_daddr = inet->inet_daddr;
@@ -188,8 +185,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
- setup_pinned_timer(&tw->tw_timer, tw_timer_handler,
- (unsigned long)tw);
+ timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index c5a117cc6619..914d56928578 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -33,7 +33,7 @@
* also be removed if the pool is overloaded i.e. if the total amount of
* entries is greater-or-equal than the threshold.
*
- * Node pool is organised as an AVL tree.
+ * Node pool is organised as an RB tree.
* Such an implementation has been chosen not just for fun. It's a way to
* prevent easy and efficient DoS attacks by creating hash collisions. A huge
* amount of long living nodes in a single hash slot would significantly delay
@@ -45,7 +45,7 @@
* AND reference count being 0.
* 3. Global variable peer_total is modified under the pool lock.
* 4. struct inet_peer fields modification:
- * avl_left, avl_right, avl_parent, avl_height: pool lock
+ * rb_node: pool lock
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
* daddr: unchangeable
@@ -53,30 +53,15 @@
static struct kmem_cache *peer_cachep __read_mostly;
-static LIST_HEAD(gc_list);
-static const int gc_delay = 60 * HZ;
-static struct delayed_work gc_work;
-static DEFINE_SPINLOCK(gc_lock);
-
-#define node_height(x) x->avl_height
-
-#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
-#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
-static const struct inet_peer peer_fake_node = {
- .avl_left = peer_avl_empty_rcu,
- .avl_right = peer_avl_empty_rcu,
- .avl_height = 0
-};
-
void inet_peer_base_init(struct inet_peer_base *bp)
{
- bp->root = peer_avl_empty_rcu;
+ bp->rb_root = RB_ROOT;
seqlock_init(&bp->lock);
bp->total = 0;
}
EXPORT_SYMBOL_GPL(inet_peer_base_init);
-#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+#define PEER_MAX_GC 32
/* Exported for sysctl_net_ipv4. */
int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
@@ -84,53 +69,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
-static void inetpeer_gc_worker(struct work_struct *work)
-{
- struct inet_peer *p, *n, *c;
- struct list_head list;
-
- spin_lock_bh(&gc_lock);
- list_replace_init(&gc_list, &list);
- spin_unlock_bh(&gc_lock);
-
- if (list_empty(&list))
- return;
-
- list_for_each_entry_safe(p, n, &list, gc_list) {
-
- if (need_resched())
- cond_resched();
-
- c = rcu_dereference_protected(p->avl_left, 1);
- if (c != peer_avl_empty) {
- list_add_tail(&c->gc_list, &list);
- p->avl_left = peer_avl_empty_rcu;
- }
-
- c = rcu_dereference_protected(p->avl_right, 1);
- if (c != peer_avl_empty) {
- list_add_tail(&c->gc_list, &list);
- p->avl_right = peer_avl_empty_rcu;
- }
-
- n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
-
- if (refcount_read(&p->refcnt) == 1) {
- list_del(&p->gc_list);
- kmem_cache_free(peer_cachep, p);
- }
- }
-
- if (list_empty(&list))
- return;
-
- spin_lock_bh(&gc_lock);
- list_splice(&list, &gc_list);
- spin_unlock_bh(&gc_lock);
-
- schedule_delayed_work(&gc_work, gc_delay);
-}
-
/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
{
@@ -153,225 +91,65 @@ void __init inet_initpeers(void)
sizeof(struct inet_peer),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
-
- INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
}
-#define rcu_deref_locked(X, BASE) \
- rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
-
-/*
- * Called with local BH disabled and the pool lock held.
- */
-#define lookup(_daddr, _stack, _base) \
-({ \
- struct inet_peer *u; \
- struct inet_peer __rcu **v; \
- \
- stackptr = _stack; \
- *stackptr++ = &_base->root; \
- for (u = rcu_deref_locked(_base->root, _base); \
- u != peer_avl_empty;) { \
- int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \
- if (cmp == 0) \
- break; \
- if (cmp == -1) \
- v = &u->avl_left; \
- else \
- v = &u->avl_right; \
- *stackptr++ = v; \
- u = rcu_deref_locked(*v, _base); \
- } \
- u; \
-})
-
-/*
- * Called with rcu_read_lock()
- * Because we hold no lock against a writer, its quite possible we fall
- * in an endless loop.
- * But every pointer we follow is guaranteed to be valid thanks to RCU.
- * We exit from this function if number of links exceeds PEER_MAXDEPTH
- */
-static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
- struct inet_peer_base *base)
+/* Called with rcu_read_lock() or base->lock held */
+static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
+ struct inet_peer_base *base,
+ unsigned int seq,
+ struct inet_peer *gc_stack[],
+ unsigned int *gc_cnt,
+ struct rb_node **parent_p,
+ struct rb_node ***pp_p)
{
- struct inet_peer *u = rcu_dereference(base->root);
- int count = 0;
+ struct rb_node **pp, *parent, *next;
+ struct inet_peer *p;
- while (u != peer_avl_empty) {
- int cmp = inetpeer_addr_cmp(daddr, &u->daddr);
+ pp = &base->rb_root.rb_node;
+ parent = NULL;
+ while (1) {
+ int cmp;
+
+ next = rcu_dereference_raw(*pp);
+ if (!next)
+ break;
+ parent = next;
+ p = rb_entry(parent, struct inet_peer, rb_node);
+ cmp = inetpeer_addr_cmp(daddr, &p->daddr);
if (cmp == 0) {
- /* Before taking a reference, check if this entry was
- * deleted (refcnt=0)
- */
- if (!refcount_inc_not_zero(&u->refcnt)) {
- u = NULL;
- }
- return u;
+ if (!refcount_inc_not_zero(&p->refcnt))
+ break;
+ return p;
+ }
+ if (gc_stack) {
+ if (*gc_cnt < PEER_MAX_GC)
+ gc_stack[(*gc_cnt)++] = p;
+ } else if (unlikely(read_seqretry(&base->lock, seq))) {
+ break;
}
if (cmp == -1)
- u = rcu_dereference(u->avl_left);
+ pp = &next->rb_left;
else
- u = rcu_dereference(u->avl_right);
- if (unlikely(++count == PEER_MAXDEPTH))
- break;
+ pp = &next->rb_right;
}
+ *parent_p = parent;
+ *pp_p = pp;
return NULL;
}
-/* Called with local BH disabled and the pool lock held. */
-#define lookup_rightempty(start, base) \
-({ \
- struct inet_peer *u; \
- struct inet_peer __rcu **v; \
- *stackptr++ = &start->avl_left; \
- v = &start->avl_left; \
- for (u = rcu_deref_locked(*v, base); \
- u->avl_right != peer_avl_empty_rcu;) { \
- v = &u->avl_right; \
- *stackptr++ = v; \
- u = rcu_deref_locked(*v, base); \
- } \
- u; \
-})
-
-/* Called with local BH disabled and the pool lock held.
- * Variable names are the proof of operation correctness.
- * Look into mm/map_avl.c for more detail description of the ideas.
- */
-static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
- struct inet_peer __rcu ***stackend,
- struct inet_peer_base *base)
-{
- struct inet_peer __rcu **nodep;
- struct inet_peer *node, *l, *r;
- int lh, rh;
-
- while (stackend > stack) {
- nodep = *--stackend;
- node = rcu_deref_locked(*nodep, base);
- l = rcu_deref_locked(node->avl_left, base);
- r = rcu_deref_locked(node->avl_right, base);
- lh = node_height(l);
- rh = node_height(r);
- if (lh > rh + 1) { /* l: RH+2 */
- struct inet_peer *ll, *lr, *lrl, *lrr;
- int lrh;
- ll = rcu_deref_locked(l->avl_left, base);
- lr = rcu_deref_locked(l->avl_right, base);
- lrh = node_height(lr);
- if (lrh <= node_height(ll)) { /* ll: RH+1 */
- RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
- RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
- node->avl_height = lrh + 1; /* RH+1 or RH+2 */
- RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
- RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
- l->avl_height = node->avl_height + 1;
- RCU_INIT_POINTER(*nodep, l);
- } else { /* ll: RH, lr: RH+1 */
- lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
- lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
- RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
- RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
- node->avl_height = rh + 1; /* node: RH+1 */
- RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
- RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
- l->avl_height = rh + 1; /* l: RH+1 */
- RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
- RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
- lr->avl_height = rh + 2;
- RCU_INIT_POINTER(*nodep, lr);
- }
- } else if (rh > lh + 1) { /* r: LH+2 */
- struct inet_peer *rr, *rl, *rlr, *rll;
- int rlh;
- rr = rcu_deref_locked(r->avl_right, base);
- rl = rcu_deref_locked(r->avl_left, base);
- rlh = node_height(rl);
- if (rlh <= node_height(rr)) { /* rr: LH+1 */
- RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
- RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
- node->avl_height = rlh + 1; /* LH+1 or LH+2 */
- RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
- RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
- r->avl_height = node->avl_height + 1;
- RCU_INIT_POINTER(*nodep, r);
- } else { /* rr: RH, rl: RH+1 */
- rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
- rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
- RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
- RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
- node->avl_height = lh + 1; /* node: LH+1 */
- RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
- RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
- r->avl_height = lh + 1; /* r: LH+1 */
- RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
- RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
- rl->avl_height = lh + 2;
- RCU_INIT_POINTER(*nodep, rl);
- }
- } else {
- node->avl_height = (lh > rh ? lh : rh) + 1;
- }
- }
-}
-
-/* Called with local BH disabled and the pool lock held. */
-#define link_to_pool(n, base) \
-do { \
- n->avl_height = 1; \
- n->avl_left = peer_avl_empty_rcu; \
- n->avl_right = peer_avl_empty_rcu; \
- /* lockless readers can catch us now */ \
- rcu_assign_pointer(**--stackptr, n); \
- peer_avl_rebalance(stack, stackptr, base); \
-} while (0)
-
static void inetpeer_free_rcu(struct rcu_head *head)
{
kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}
-static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
- struct inet_peer __rcu **stack[PEER_MAXDEPTH])
-{
- struct inet_peer __rcu ***stackptr, ***delp;
-
- if (lookup(&p->daddr, stack, base) != p)
- BUG();
- delp = stackptr - 1; /* *delp[0] == p */
- if (p->avl_left == peer_avl_empty_rcu) {
- *delp[0] = p->avl_right;
- --stackptr;
- } else {
- /* look for a node to insert instead of p */
- struct inet_peer *t;
- t = lookup_rightempty(p, base);
- BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
- **--stackptr = t->avl_left;
- /* t is removed, t->daddr > x->daddr for any
- * x in p->avl_left subtree.
- * Put t in the old place of p. */
- RCU_INIT_POINTER(*delp[0], t);
- t->avl_left = p->avl_left;
- t->avl_right = p->avl_right;
- t->avl_height = p->avl_height;
- BUG_ON(delp[1] != &p->avl_left);
- delp[1] = &t->avl_left; /* was &p->avl_left */
- }
- peer_avl_rebalance(stack, stackptr, base);
- base->total--;
- call_rcu(&p->rcu, inetpeer_free_rcu);
-}
-
/* perform garbage collect on all items stacked during a lookup */
-static int inet_peer_gc(struct inet_peer_base *base,
- struct inet_peer __rcu **stack[PEER_MAXDEPTH],
- struct inet_peer __rcu ***stackptr)
+static void inet_peer_gc(struct inet_peer_base *base,
+ struct inet_peer *gc_stack[],
+ unsigned int gc_cnt)
{
- struct inet_peer *p, *gchead = NULL;
+ struct inet_peer *p;
__u32 delta, ttl;
- int cnt = 0;
+ int i;
if (base->total >= inet_peer_threshold)
ttl = 0; /* be aggressive */
@@ -379,43 +157,38 @@ static int inet_peer_gc(struct inet_peer_base *base,
ttl = inet_peer_maxttl
- (inet_peer_maxttl - inet_peer_minttl) / HZ *
base->total / inet_peer_threshold * HZ;
- stackptr--; /* last stack slot is peer_avl_empty */
- while (stackptr > stack) {
- stackptr--;
- p = rcu_deref_locked(**stackptr, base);
- if (refcount_read(&p->refcnt) == 1) {
- smp_rmb();
- delta = (__u32)jiffies - p->dtime;
- if (delta >= ttl && refcount_dec_if_one(&p->refcnt)) {
- p->gc_next = gchead;
- gchead = p;
- }
- }
+ for (i = 0; i < gc_cnt; i++) {
+ p = gc_stack[i];
+ delta = (__u32)jiffies - p->dtime;
+ if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
+ gc_stack[i] = NULL;
}
- while ((p = gchead) != NULL) {
- gchead = p->gc_next;
- cnt++;
- unlink_from_pool(p, base, stack);
+ for (i = 0; i < gc_cnt; i++) {
+ p = gc_stack[i];
+ if (p) {
+ rb_erase(&p->rb_node, &base->rb_root);
+ base->total--;
+ call_rcu(&p->rcu, inetpeer_free_rcu);
+ }
}
- return cnt;
}
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
const struct inetpeer_addr *daddr,
int create)
{
- struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
- struct inet_peer *p;
- unsigned int sequence;
- int invalidated, gccnt = 0;
+ struct inet_peer *p, *gc_stack[PEER_MAX_GC];
+ struct rb_node **pp, *parent;
+ unsigned int gc_cnt, seq;
+ int invalidated;
/* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
rcu_read_lock();
- sequence = read_seqbegin(&base->lock);
- p = lookup_rcu(daddr, base);
- invalidated = read_seqretry(&base->lock, sequence);
+ seq = read_seqbegin(&base->lock);
+ p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
+ invalidated = read_seqretry(&base->lock, seq);
rcu_read_unlock();
if (p)
@@ -428,36 +201,31 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
/* retry an exact lookup, taking the lock before.
* At least, nodes should be hot in our cache.
*/
+ parent = NULL;
write_seqlock_bh(&base->lock);
-relookup:
- p = lookup(daddr, stack, base);
- if (p != peer_avl_empty) {
- refcount_inc(&p->refcnt);
- write_sequnlock_bh(&base->lock);
- return p;
- }
- if (!gccnt) {
- gccnt = inet_peer_gc(base, stack, stackptr);
- if (gccnt && create)
- goto relookup;
- }
- p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
- if (p) {
- p->daddr = *daddr;
- refcount_set(&p->refcnt, 2);
- atomic_set(&p->rid, 0);
- p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
- p->rate_tokens = 0;
- /* 60*HZ is arbitrary, but chosen enough high so that the first
- * calculation of tokens is at its maximum.
- */
- p->rate_last = jiffies - 60*HZ;
- INIT_LIST_HEAD(&p->gc_list);
-
- /* Link the node. */
- link_to_pool(p, base);
- base->total++;
+
+ gc_cnt = 0;
+ p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
+ if (!p && create) {
+ p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
+ if (p) {
+ p->daddr = *daddr;
+ refcount_set(&p->refcnt, 2);
+ atomic_set(&p->rid, 0);
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ /* 60*HZ is arbitrary, but chosen enough high so that the first
+ * calculation of tokens is at its maximum.
+ */
+ p->rate_last = jiffies - 60*HZ;
+
+ rb_link_node(&p->rb_node, parent, pp);
+ rb_insert_color(&p->rb_node, &base->rb_root);
+ base->total++;
+ }
}
+ if (gc_cnt)
+ inet_peer_gc(base, gc_stack, gc_cnt);
write_sequnlock_bh(&base->lock);
return p;
@@ -467,8 +235,9 @@ EXPORT_SYMBOL_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
p->dtime = (__u32)jiffies;
- smp_mb__before_atomic();
- refcount_dec(&p->refcnt);
+
+ if (refcount_dec_and_test(&p->refcnt))
+ call_rcu(&p->rcu, inetpeer_free_rcu);
}
EXPORT_SYMBOL_GPL(inet_putpeer);
@@ -513,30 +282,19 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);
-static void inetpeer_inval_rcu(struct rcu_head *head)
-{
- struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
-
- spin_lock_bh(&gc_lock);
- list_add_tail(&p->gc_list, &gc_list);
- spin_unlock_bh(&gc_lock);
-
- schedule_delayed_work(&gc_work, gc_delay);
-}
-
void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
- struct inet_peer *root;
+ struct rb_node *p = rb_first(&base->rb_root);
- write_seqlock_bh(&base->lock);
+ while (p) {
+ struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node);
- root = rcu_deref_locked(base->root, base);
- if (root != peer_avl_empty) {
- base->root = peer_avl_empty_rcu;
- base->total = 0;
- call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
+ p = rb_next(p);
+ rb_erase(&peer->rb_node, &base->rb_root);
+ inet_putpeer(peer);
+ cond_resched();
}
- write_sequnlock_bh(&base->lock);
+ base->total = 0;
}
EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 9f0a7b96646f..2dd21c3281a1 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9a8cfac503dc..bbf1b94942c0 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -190,12 +191,13 @@ static bool frag_expire_skip_icmp(u32 user)
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
-static void ip_expire(unsigned long arg)
+static void ip_expire(struct timer_list *t)
{
+ struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct ipq *qp;
struct net *net;
- qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+ qp = container_of(frag, struct ipq, q);
net = container_of(qp->q.net, struct net, ipv4.frags);
rcu_read_lock();
@@ -844,8 +846,6 @@ static void __init ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
- int res;
-
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@@ -871,13 +871,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
net->ipv4.frags.max_dist = 64;
- res = inet_frags_init_net(&net->ipv4.frags);
- if (res)
- return res;
- res = ip4_frags_ns_ctl_register(net);
- if (res)
- inet_frags_uninit_net(&net->ipv4.frags);
- return res;
+ inet_frags_init_net(&net->ipv4.frags);
+
+ return ip4_frags_ns_ctl_register(net);
}
static void __net_exit ipv4_frags_exit_net(struct net *net)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 7a7829e839c2..bb6239169b1a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -48,6 +48,7 @@
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
+#include <net/erspan.h>
/*
Problems & solutions
@@ -112,9 +113,12 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
+static void erspan_build_header(struct sk_buff *skb,
+ __be32 id, u32 index, bool truncate);
static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
+static unsigned int erspan_net_id __read_mostly;
static void ipgre_err(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi)
@@ -246,6 +250,79 @@ static void gre_err(struct sk_buff *skb, u32 info)
ipgre_err(skb, info, &tpi);
}
+static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ int gre_hdr_len)
+{
+ struct net *net = dev_net(skb->dev);
+ struct metadata_dst *tun_dst = NULL;
+ struct ip_tunnel_net *itn;
+ struct ip_tunnel *tunnel;
+ struct erspanhdr *ershdr;
+ const struct iphdr *iph;
+ __be32 index;
+ int len;
+
+ itn = net_generic(net, erspan_net_id);
+ len = gre_hdr_len + sizeof(*ershdr);
+
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
+ iph = ip_hdr(skb);
+ ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
+
+ /* The original GRE header does not have key field,
+ * Use ERSPAN 10-bit session ID as key.
+ */
+ tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
+ index = ershdr->md.index;
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
+ tpi->flags | TUNNEL_KEY,
+ iph->saddr, iph->daddr, tpi->key);
+
+ if (tunnel) {
+ if (__iptunnel_pull_header(skb,
+ gre_hdr_len + sizeof(*ershdr),
+ htons(ETH_P_TEB),
+ false, false) < 0)
+ goto drop;
+
+ if (tunnel->collect_md) {
+ struct ip_tunnel_info *info;
+ struct erspan_metadata *md;
+ __be64 tun_id;
+ __be16 flags;
+
+ tpi->flags |= TUNNEL_KEY;
+ flags = tpi->flags;
+ tun_id = key32_to_tunnel_id(tpi->key);
+
+ tun_dst = ip_tun_rx_dst(skb, flags,
+ tun_id, sizeof(*md));
+ if (!tun_dst)
+ return PACKET_REJECT;
+
+ md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
+ if (!md)
+ return PACKET_REJECT;
+
+ md->index = index;
+ info = &tun_dst->u.tun_info;
+ info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ info->options_len = sizeof(*md);
+ } else {
+ tunnel->index = ntohl(index);
+ }
+
+ skb_reset_mac_header(skb);
+ ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+ return PACKET_RCVD;
+ }
+drop:
+ kfree_skb(skb);
+ return PACKET_RCVD;
+}
+
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
@@ -328,6 +405,11 @@ static int gre_rcv(struct sk_buff *skb)
if (hdr_len < 0)
goto drop;
+ if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+ if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
+ return 0;
+ }
+
if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
return 0;
@@ -376,39 +458,33 @@ static struct rtable *gre_get_rt(struct sk_buff *skb,
return ip_route_output_key(net, fl);
}
-static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
- __be16 proto)
+static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi4 *fl,
+ int tunnel_hlen)
{
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
struct rtable *rt = NULL;
- struct flowi4 fl;
int min_headroom;
- int tunnel_hlen;
- __be16 df, flags;
bool use_cache;
int err;
tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET))
- goto err_free_skb;
-
key = &tun_info->key;
use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+
if (use_cache)
- rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr);
+ rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
if (!rt) {
- rt = gre_get_rt(skb, dev, &fl, key);
+ rt = gre_get_rt(skb, dev, fl, key);
if (IS_ERR(rt))
- goto err_free_skb;
+ goto err_free_skb;
if (use_cache)
dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
- fl.saddr);
+ fl->saddr);
}
- tunnel_hlen = gre_calc_hlen(key->tun_flags);
-
min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ tunnel_hlen + sizeof(struct iphdr);
if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
@@ -420,6 +496,37 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
if (unlikely(err))
goto err_free_rt;
}
+ return rt;
+
+err_free_rt:
+ ip_rt_put(rt);
+err_free_skb:
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NULL;
+}
+
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
+ __be16 proto)
+{
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ struct rtable *rt = NULL;
+ struct flowi4 fl;
+ int tunnel_hlen;
+ __be16 df, flags;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto err_free_skb;
+
+ key = &tun_info->key;
+ tunnel_hlen = gre_calc_hlen(key->tun_flags);
+
+ rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
+ if (!rt)
+ return;
/* Push Tunnel header. */
if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
@@ -442,6 +549,64 @@ err_free_skb:
dev->stats.tx_dropped++;
}
+static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
+ __be16 proto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ struct erspan_metadata *md;
+ struct rtable *rt = NULL;
+ bool truncate = false;
+ struct flowi4 fl;
+ int tunnel_hlen;
+ __be16 df;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto err_free_skb;
+
+ key = &tun_info->key;
+
+ /* ERSPAN has fixed 8 byte GRE header */
+ tunnel_hlen = 8 + sizeof(struct erspanhdr);
+
+ rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
+ if (!rt)
+ return;
+
+ if (gre_handle_offloads(skb, false))
+ goto err_free_rt;
+
+ if (skb->len > dev->mtu + dev->hard_header_len) {
+ pskb_trim(skb, dev->mtu + dev->hard_header_len);
+ truncate = true;
+ }
+
+ md = ip_tunnel_info_opts(tun_info);
+ if (!md)
+ goto err_free_rt;
+
+ erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
+ ntohl(md->index), truncate);
+
+ gre_build_header(skb, 8, TUNNEL_SEQ,
+ htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
+
+ df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+
+ iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
+ key->tos, key->ttl, df, false);
+ return;
+
+err_free_rt:
+ ip_rt_put(rt);
+err_free_skb:
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+}
+
static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
struct ip_tunnel_info *info = skb_tunnel_info(skb);
@@ -503,6 +668,86 @@ free_skb:
return NETDEV_TX_OK;
}
+static inline u8 tos_to_cos(u8 tos)
+{
+ u8 dscp, cos;
+
+ dscp = tos >> 2;
+ cos = dscp >> 3;
+ return cos;
+}
+
+static void erspan_build_header(struct sk_buff *skb,
+ __be32 id, u32 index, bool truncate)
+{
+ struct iphdr *iphdr = ip_hdr(skb);
+ struct ethhdr *eth = eth_hdr(skb);
+ enum erspan_encap_type enc_type;
+ struct erspanhdr *ershdr;
+ struct qtag_prefix {
+ __be16 eth_type;
+ __be16 tci;
+ } *qp;
+ u16 vlan_tci = 0;
+
+ enc_type = ERSPAN_ENCAP_NOVLAN;
+
+ /* If mirrored packet has vlan tag, extract tci and
+ * perserve vlan header in the mirrored frame.
+ */
+ if (eth->h_proto == htons(ETH_P_8021Q)) {
+ qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
+ vlan_tci = ntohs(qp->tci);
+ enc_type = ERSPAN_ENCAP_INFRAME;
+ }
+
+ skb_push(skb, sizeof(*ershdr));
+ ershdr = (struct erspanhdr *)skb->data;
+ memset(ershdr, 0, sizeof(*ershdr));
+
+ ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
+ (ERSPAN_VERSION << VER_OFFSET));
+ ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
+ ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
+ (enc_type << EN_OFFSET & EN_MASK) |
+ ((truncate << T_OFFSET) & T_MASK));
+ ershdr->md.index = htonl(index & INDEX_MASK);
+}
+
+static netdev_tx_t erspan_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ bool truncate = false;
+
+ if (tunnel->collect_md) {
+ erspan_fb_xmit(skb, dev, skb->protocol);
+ return NETDEV_TX_OK;
+ }
+
+ if (gre_handle_offloads(skb, false))
+ goto free_skb;
+
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
+
+ if (skb->len > dev->mtu + dev->hard_header_len) {
+ pskb_trim(skb, dev->mtu + dev->hard_header_len);
+ truncate = true;
+ }
+
+ /* Push ERSPAN header */
+ erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
+ tunnel->parms.o_flags &= ~TUNNEL_KEY;
+ __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
+ return NETDEV_TX_OK;
+
+free_skb:
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+
static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
struct net_device *dev)
{
@@ -528,20 +773,46 @@ free_skb:
return NETDEV_TX_OK;
}
+static void ipgre_link_update(struct net_device *dev, bool set_mtu)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int len;
+
+ len = tunnel->tun_hlen;
+ tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+ len = tunnel->tun_hlen - len;
+ tunnel->hlen = tunnel->hlen + len;
+
+ dev->needed_headroom = dev->needed_headroom + len;
+ if (set_mtu)
+ dev->mtu = max_t(int, dev->mtu - len, 68);
+
+ if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
+ if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
+ tunnel->encap.type == TUNNEL_ENCAP_NONE) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
+ dev->features |= NETIF_F_LLTX;
+ }
+}
+
static int ipgre_tunnel_ioctl(struct net_device *dev,
struct ifreq *ifr, int cmd)
{
- int err;
struct ip_tunnel_parm p;
+ int err;
if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
return -EFAULT;
+
if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
- ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+ p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
+ ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
}
+
p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
@@ -549,11 +820,22 @@ static int ipgre_tunnel_ioctl(struct net_device *dev,
if (err)
return err;
+ if (cmd == SIOCCHGTUNNEL) {
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ t->parms.i_flags = p.i_flags;
+ t->parms.o_flags = p.o_flags;
+
+ if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
+ ipgre_link_update(dev, true);
+ }
+
p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);
if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
return -EFAULT;
+
return 0;
}
@@ -766,15 +1048,14 @@ static int __net_init ipgre_init_net(struct net *net)
return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}
-static void __net_exit ipgre_exit_net(struct net *net)
+static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
{
- struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
- ip_tunnel_delete_net(itn, &ipgre_link_ops);
+ ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
}
static struct pernet_operations ipgre_net_ops = {
.init = ipgre_init_net,
- .exit = ipgre_exit_net,
+ .exit_batch = ipgre_exit_batch_net,
.id = &ipgre_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -828,6 +1109,42 @@ out:
return ipgre_tunnel_validate(tb, data, extack);
}
+static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ __be16 flags = 0;
+ int ret;
+
+ if (!data)
+ return 0;
+
+ ret = ipgre_tap_validate(tb, data, extack);
+ if (ret)
+ return ret;
+
+ /* ERSPAN should only have GRE sequence and key flag */
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (!data[IFLA_GRE_COLLECT_METADATA] &&
+ flags != (GRE_SEQ | GRE_KEY))
+ return -EINVAL;
+
+ /* ERSPAN Session ID only has 10-bit. Since we reuse
+ * 32-bit key field as ID, check it's range.
+ */
+ if (data[IFLA_GRE_IKEY] &&
+ (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+ return -EINVAL;
+
+ if (data[IFLA_GRE_OKEY] &&
+ (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+ return -EINVAL;
+
+ return 0;
+}
+
static int ipgre_netlink_parms(struct net_device *dev,
struct nlattr *data[],
struct nlattr *tb[],
@@ -892,6 +1209,13 @@ static int ipgre_netlink_parms(struct net_device *dev,
if (data[IFLA_GRE_FWMARK])
*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+ if (data[IFLA_GRE_ERSPAN_INDEX]) {
+ t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+
+ if (t->index & ~INDEX_MASK)
+ return -EINVAL;
+ }
+
return 0;
}
@@ -933,6 +1257,7 @@ static int gre_tap_init(struct net_device *dev)
{
__gre_tunnel_init(dev);
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netif_keep_dst(dev);
return ip_tunnel_init(dev);
}
@@ -949,6 +1274,39 @@ static const struct net_device_ops gre_tap_netdev_ops = {
.ndo_fill_metadata_dst = gre_fill_metadata_dst,
};
+static int erspan_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen;
+
+ tunnel->tun_hlen = 8;
+ tunnel->parms.iph.protocol = IPPROTO_GRE;
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+ sizeof(struct erspanhdr);
+ t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
+ dev->mtu = ETH_DATA_LEN - t_hlen - 4;
+ dev->features |= GRE_FEATURES;
+ dev->hw_features |= GRE_FEATURES;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netif_keep_dst(dev);
+
+ return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops erspan_netdev_ops = {
+ .ndo_init = erspan_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = erspan_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_fill_metadata_dst = gre_fill_metadata_dst,
+};
+
static void ipgre_tap_setup(struct net_device *dev)
{
ether_setup(dev);
@@ -986,9 +1344,9 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
__u32 fwmark = t->fwmark;
+ struct ip_tunnel_parm p;
int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
@@ -1001,7 +1359,18 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
return err;
- return ip_tunnel_changelink(dev, tb, &p, fwmark);
+
+ err = ip_tunnel_changelink(dev, tb, &p, fwmark);
+ if (err < 0)
+ return err;
+
+ t->parms.i_flags = p.i_flags;
+ t->parms.o_flags = p.o_flags;
+
+ if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
+ ipgre_link_update(dev, !tb[IFLA_MTU]);
+
+ return 0;
}
static size_t ipgre_get_size(const struct net_device *dev)
@@ -1041,6 +1410,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(1) +
/* IFLA_GRE_FWMARK */
nla_total_size(4) +
+ /* IFLA_GRE_ERSPAN_INDEX */
+ nla_total_size(4) +
0;
}
@@ -1083,12 +1454,25 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
goto nla_put_failure;
}
+ if (t->index)
+ if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
return -EMSGSIZE;
}
+static void erspan_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+ dev->netdev_ops = &erspan_netdev_ops;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ ip_tunnel_setup(dev, erspan_net_id);
+}
+
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_LINK] = { .type = NLA_U32 },
[IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
@@ -1107,6 +1491,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
[IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 },
[IFLA_GRE_FWMARK] = { .type = NLA_U32 },
+ [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
@@ -1139,6 +1524,21 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
.get_link_net = ip_tunnel_get_link_net,
};
+static struct rtnl_link_ops erspan_link_ops __read_mostly = {
+ .kind = "erspan",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ipgre_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = erspan_setup,
+ .validate = erspan_validate,
+ .newlink = ipgre_newlink,
+ .changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipgre_get_size,
+ .fill_info = ipgre_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
u8 name_assign_type)
{
@@ -1189,19 +1589,36 @@ static int __net_init ipgre_tap_init_net(struct net *net)
return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}
-static void __net_exit ipgre_tap_exit_net(struct net *net)
+static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
{
- struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
- ip_tunnel_delete_net(itn, &ipgre_tap_ops);
+ ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
}
static struct pernet_operations ipgre_tap_net_ops = {
.init = ipgre_tap_init_net,
- .exit = ipgre_tap_exit_net,
+ .exit_batch = ipgre_tap_exit_batch_net,
.id = &gre_tap_net_id,
.size = sizeof(struct ip_tunnel_net),
};
+static int __net_init erspan_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, erspan_net_id,
+ &erspan_link_ops, "erspan0");
+}
+
+static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
+{
+ ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
+}
+
+static struct pernet_operations erspan_net_ops = {
+ .init = erspan_init_net,
+ .exit_batch = erspan_exit_batch_net,
+ .id = &erspan_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
static int __init ipgre_init(void)
{
int err;
@@ -1214,7 +1631,11 @@ static int __init ipgre_init(void)
err = register_pernet_device(&ipgre_tap_net_ops);
if (err < 0)
- goto pnet_tap_faied;
+ goto pnet_tap_failed;
+
+ err = register_pernet_device(&erspan_net_ops);
+ if (err < 0)
+ goto pnet_erspan_failed;
err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) {
@@ -1230,15 +1651,23 @@ static int __init ipgre_init(void)
if (err < 0)
goto tap_ops_failed;
+ err = rtnl_link_register(&erspan_link_ops);
+ if (err < 0)
+ goto erspan_link_failed;
+
return 0;
+erspan_link_failed:
+ rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
+ unregister_pernet_device(&erspan_net_ops);
+pnet_erspan_failed:
unregister_pernet_device(&ipgre_tap_net_ops);
-pnet_tap_faied:
+pnet_tap_failed:
unregister_pernet_device(&ipgre_net_ops);
return err;
}
@@ -1247,9 +1676,11 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
+ rtnl_link_unregister(&erspan_link_ops);
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops);
+ unregister_pernet_device(&erspan_net_ops);
}
module_init(ipgre_init);
@@ -1257,5 +1688,7 @@ module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
+MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
+MODULE_ALIAS_NETDEV("erspan0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fa2dc8f692c6..57fc13c6ab2b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -311,9 +311,10 @@ drop:
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
- struct rtable *rt;
+ int (*edemux)(struct sk_buff *skb);
struct net_device *dev = skb->dev;
- void (*edemux)(struct sk_buff *skb);
+ struct rtable *rt;
+ int err;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
- edemux(skb);
+ err = edemux(skb);
+ if (unlikely(err))
+ goto drop_error;
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
@@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
* how the packet travels inside Linux networking.
*/
if (!skb_valid_dst(skb)) {
- int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, dev);
- if (unlikely(err)) {
- if (err == -EXDEV)
- __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
- goto drop;
- }
+ err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, dev);
+ if (unlikely(err))
+ goto drop_error;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
drop:
kfree_skb(skb);
return NET_RX_DROP;
+
+drop_error:
+ if (err == -EXDEV)
+ __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+ goto drop;
}
/*
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 93157f2f4758..ed194d46c00e 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -86,8 +87,8 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
* NOTE: dopt cannot point to skb.
*/
-int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
- const struct ip_options *sopt)
+int __ip_options_echo(struct net *net, struct ip_options *dopt,
+ struct sk_buff *skb, const struct ip_options *sopt)
{
unsigned char *sptr, *dptr;
int soffset, doffset;
@@ -140,7 +141,7 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
__be32 addr;
memcpy(&addr, dptr+soffset-1, 4);
- if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
+ if (inet_addr_type(net, addr) != RTN_UNICAST) {
dopt->ts_needtime = 1;
soffset += 8;
}
@@ -174,9 +175,6 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
doffset -= 4;
}
if (doffset > 3) {
- __be32 daddr = fib_compute_spec_dst(skb);
-
- memcpy(&start[doffset-1], &daddr, 4);
dopt->faddr = faddr;
dptr[0] = start[0];
dptr[1] = doffset+3;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 50c74cd890bc..e8e675be60ec 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -853,61 +853,6 @@ csum_page(struct page *page, int offset, int copy)
return csum;
}
-static inline int ip_ufo_append_data(struct sock *sk,
- struct sk_buff_head *queue,
- int getfrag(void *from, char *to, int offset, int len,
- int odd, struct sk_buff *skb),
- void *from, int length, int hh_len, int fragheaderlen,
- int transhdrlen, int maxfraglen, unsigned int flags)
-{
- struct sk_buff *skb;
- int err;
-
- /* There is support for UDP fragmentation offload by network
- * device, so create one single skb packet containing complete
- * udp datagram
- */
- skb = skb_peek_tail(queue);
- if (!skb) {
- skb = sock_alloc_send_skb(sk,
- hh_len + fragheaderlen + transhdrlen + 20,
- (flags & MSG_DONTWAIT), &err);
-
- if (!skb)
- return err;
-
- /* reserve space for Hardware header */
- skb_reserve(skb, hh_len);
-
- /* create space for UDP/IP header */
- skb_put(skb, fragheaderlen + transhdrlen);
-
- /* initialize network header pointer */
- skb_reset_network_header(skb);
-
- /* initialize protocol header pointer */
- skb->transport_header = skb->network_header + fragheaderlen;
-
- skb->csum = 0;
-
- if (flags & MSG_CONFIRM)
- skb_set_dst_pending_confirm(skb, 1);
-
- __skb_queue_tail(queue, skb);
- } else if (skb_is_gso(skb)) {
- goto append;
- }
-
- skb->ip_summed = CHECKSUM_PARTIAL;
- /* specify the length of each IP datagram fragment */
- skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
-
-append:
- return skb_append_datato_frags(sk, skb, getfrag, from,
- (length - transhdrlen));
-}
-
static int __ip_append_data(struct sock *sk,
struct flowi4 *fl4,
struct sk_buff_head *queue,
@@ -965,18 +910,6 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
cork->length += length;
- if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) ||
- (skb && skb_is_gso(skb))) &&
- (sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
- (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
- err = ip_ufo_append_data(sk, queue, getfrag, from, length,
- hh_len, fragheaderlen, transhdrlen,
- maxfraglen, flags);
- if (err)
- goto error;
- return 0;
- }
/* So, what's going on in the loop below?
*
@@ -1287,27 +1220,14 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
if (!skb)
return -EINVAL;
- if ((size + skb->len > mtu) &&
- (sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO)) {
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- return -EOPNOTSUPP;
-
- skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
- }
cork->length += size;
while (size > 0) {
- if (skb_is_gso(skb)) {
- len = size;
- } else {
+ /* Check if the remaining data fits into current packet. */
+ len = mtu - skb->len;
+ if (len < size)
+ len = maxfraglen - skb->len;
- /* Check if the remaining data fits into current packet. */
- len = mtu - skb->len;
- if (len < size)
- len = maxfraglen - skb->len;
- }
if (len <= 0) {
struct sk_buff *skb_prev;
int alloclen;
@@ -1601,7 +1521,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
int err;
int oif;
- if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
+ if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
return;
ipc.addr = daddr;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ecc4b4a2413e..60fb1eb7d7d8 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -80,7 +81,8 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
}
-static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg,
+ struct sk_buff *skb)
{
unsigned char optbuf[sizeof(struct ip_options) + 40];
struct ip_options *opt = (struct ip_options *)optbuf;
@@ -88,7 +90,7 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
if (IPCB(skb)->opt.optlen == 0)
return;
- if (ip_options_echo(opt, skb)) {
+ if (ip_options_echo(net, opt, skb)) {
msg->msg_flags |= MSG_CTRUNC;
return;
}
@@ -204,7 +206,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
}
if (flags & IP_CMSG_RETOPTS) {
- ip_cmsg_recv_retopts(msg, skb);
+ ip_cmsg_recv_retopts(sock_net(sk), msg, skb);
flags &= ~IP_CMSG_RETOPTS;
if (!flags)
@@ -1219,22 +1221,20 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
* (e.g., process binds socket to eth0 for Tx which is
* redirected to loopback in the rtable/dst).
*/
+ struct rtable *rt = skb_rtable(skb);
+ bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
+
if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
pktinfo->ipi_ifindex = inet_iif(skb);
+ else if (l3slave && rt && rt->rt_iif)
+ pktinfo->ipi_ifindex = rt->rt_iif;
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
pktinfo->ipi_ifindex = 0;
pktinfo->ipi_spec_dst.s_addr = 0;
}
- /* We need to keep the dst for __ip_options_echo()
- * We could restrict the test to opt.ts_needtime || opt.srr,
- * but the following is good enough as IP options are not often used.
- */
- if (unlikely(IPCB(skb)->opt.optlen))
- skb_dst_force(skb);
- else
- skb_dst_drop(skb);
+ skb_dst_drop(skb);
}
int ip_setsockopt(struct sock *sk, int level,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 129d1a3616f8..fe6fee728ce4 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -176,7 +176,7 @@ skip_key_lookup:
return cand;
t = rcu_dereference(itn->collect_md_tun);
- if (t)
+ if (t && t->dev->flags & IFF_UP)
return t;
if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
@@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
ip_rt_put(rt);
goto tx_dropped;
}
- iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
- key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)));
return;
tx_error:
dev->stats.tx_errors++;
@@ -1061,16 +1061,22 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
}
}
-void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
+void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
+ struct rtnl_link_ops *ops)
{
+ struct ip_tunnel_net *itn;
+ struct net *net;
LIST_HEAD(list);
rtnl_lock();
- ip_tunnel_destroy(itn, &list, ops);
+ list_for_each_entry(net, net_list, exit_list) {
+ itn = net_generic(net, id);
+ ip_tunnel_destroy(itn, &list, ops);
+ }
unregister_netdevice_many(&list);
rtnl_unlock();
}
-EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
struct ip_tunnel_parm *p, __u32 fwmark)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 0192c255e508..949f432a5f04 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
struct ip_tunnel_parm *parms = &tunnel->parms;
struct dst_entry *dst = skb_dst(skb);
struct net_device *tdev; /* Device to other host */
+ int pkt_len = skb->len;
int err;
int mtu;
@@ -197,15 +198,6 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
- if (tunnel->err_count > 0) {
- if (time_before(jiffies,
- tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
- tunnel->err_count--;
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
- }
-
mtu = dst_mtu(dst);
if (skb->len > mtu) {
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
@@ -229,7 +221,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
- err = skb->len;
+ err = pkt_len;
iptunnel_xmit_stats(dev, err);
return NETDEV_TX_OK;
@@ -452,15 +444,14 @@ static int __net_init vti_init_net(struct net *net)
return 0;
}
-static void __net_exit vti_exit_net(struct net *net)
+static void __net_exit vti_exit_batch_net(struct list_head *list_net)
{
- struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
- ip_tunnel_delete_net(itn, &vti_link_ops);
+ ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops);
}
static struct pernet_operations vti_net_ops = {
.init = vti_init_net,
- .exit = vti_exit_net,
+ .exit_batch = vti_exit_batch_net,
.id = &vti_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -584,33 +575,6 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
.get_link_net = ip_tunnel_get_link_net,
};
-static bool is_vti_tunnel(const struct net_device *dev)
-{
- return dev->netdev_ops == &vti_netdev_ops;
-}
-
-static int vti_device_event(struct notifier_block *unused,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct ip_tunnel *tunnel = netdev_priv(dev);
-
- if (!is_vti_tunnel(dev))
- return NOTIFY_DONE;
-
- switch (event) {
- case NETDEV_DOWN:
- if (!net_eq(tunnel->net, dev_net(dev)))
- xfrm_garbage_collect(tunnel->net);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block vti_notifier_block __read_mostly = {
- .notifier_call = vti_device_event,
-};
-
static int __init vti_init(void)
{
const char *msg;
@@ -618,8 +582,6 @@ static int __init vti_init(void)
pr_info("IPv4 over IPsec tunneling driver\n");
- register_netdevice_notifier(&vti_notifier_block);
-
msg = "tunnel device";
err = register_pernet_device(&vti_net_ops);
if (err < 0)
@@ -652,7 +614,6 @@ xfrm_proto_ah_failed:
xfrm_proto_esp_failed:
unregister_pernet_device(&vti_net_ops);
pernet_dev_failed:
- unregister_netdevice_notifier(&vti_notifier_block);
pr_err("vti init: failed to register %s\n", msg);
return err;
}
@@ -664,7 +625,6 @@ static void __exit vti_fini(void)
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti_net_ops);
- unregister_netdevice_notifier(&vti_notifier_block);
}
module_init(vti_init);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 4c5dfe6bd34d..abdebca848c9 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
* user-supplied information to configure own IP address and routes.
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index fb1ad22b5e29..c891235b4966 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -128,43 +128,68 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly;
static int ipip_err(struct sk_buff *skb, u32 info)
{
-
-/* All the routers (except for Linux) return only
- 8 bytes of packet payload. It means, that precise relaying of
- ICMP in the real Internet is absolutely infeasible.
- */
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ */
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
const struct iphdr *iph = (const struct iphdr *)skb->data;
- struct ip_tunnel *t;
- int err;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ struct ip_tunnel *t;
+ int err = 0;
+
+ switch (type) {
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ /* Impossible event. */
+ goto out;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ * rfc2003 contains "deep thoughts" about NET_UNREACH,
+ * I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
+
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ goto out;
+ break;
+
+ case ICMP_REDIRECT:
+ break;
+
+ default:
+ goto out;
+ }
- err = -ENOENT;
t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
iph->daddr, iph->saddr, 0);
- if (!t)
+ if (!t) {
+ err = -ENOENT;
goto out;
+ }
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- t->parms.link, 0, iph->protocol, 0);
- err = 0;
+ ipv4_update_pmtu(skb, net, info, t->parms.link, 0,
+ iph->protocol, 0);
goto out;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
- iph->protocol, 0);
- err = 0;
+ ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0);
goto out;
}
- if (t->parms.iph.daddr == 0)
+ if (t->parms.iph.daddr == 0) {
+ err = -ENOENT;
goto out;
+ }
- err = 0;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
@@ -634,15 +659,14 @@ static int __net_init ipip_init_net(struct net *net)
return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
}
-static void __net_exit ipip_exit_net(struct net *net)
+static void __net_exit ipip_exit_batch_net(struct list_head *list_net)
{
- struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
- ip_tunnel_delete_net(itn, &ipip_link_ops);
+ ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops);
}
static struct pernet_operations ipip_net_ops = {
.init = ipip_init_net,
- .exit = ipip_exit_net,
+ .exit_batch = ipip_exit_batch_net,
.id = &ipip_net_id,
.size = sizeof(struct ip_tunnel_net),
};
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 06863ea3fc5b..fd5f19c988e4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -67,6 +67,7 @@
#include <net/fib_rules.h>
#include <linux/netconf.h>
#include <net/nexthop.h>
+#include <net/switchdev.h>
struct ipmr_rule {
struct fib_rule common;
@@ -111,7 +112,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
static void mroute_clean_tables(struct mr_table *mrt, bool all);
-static void ipmr_expire_process(unsigned long arg);
+static void ipmr_expire_process(struct timer_list *t);
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
#define ipmr_for_each_table(mrt, net) \
@@ -264,6 +265,22 @@ static void __net_exit ipmr_rules_exit(struct net *net)
fib_rules_unregister(net->ipv4.mr_rules_ops);
rtnl_unlock();
}
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+ return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
+}
+
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+ return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
+}
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+ return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
#else
#define ipmr_for_each_table(mrt, net) \
for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
@@ -298,6 +315,22 @@ static void __net_exit ipmr_rules_exit(struct net *net)
net->ipv4.mrt = NULL;
rtnl_unlock();
}
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+ return 0;
+}
+
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+ return 0;
+}
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+ return true;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
#endif
static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -342,8 +375,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
INIT_LIST_HEAD(&mrt->mfc_cache_list);
INIT_LIST_HEAD(&mrt->mfc_unres_queue);
- setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
- (unsigned long)mrt);
+ timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0);
mrt->mroute_reg_vif_num = -1;
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -587,6 +619,82 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
}
#endif
+static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
+ struct net *net,
+ enum fib_event_type event_type,
+ struct vif_device *vif,
+ vifi_t vif_index, u32 tb_id)
+{
+ struct vif_entry_notifier_info info = {
+ .info = {
+ .family = RTNL_FAMILY_IPMR,
+ .net = net,
+ },
+ .dev = vif->dev,
+ .vif_index = vif_index,
+ .vif_flags = vif->flags,
+ .tb_id = tb_id,
+ };
+
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_ipmr_vif_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct vif_device *vif,
+ vifi_t vif_index, u32 tb_id)
+{
+ struct vif_entry_notifier_info info = {
+ .info = {
+ .family = RTNL_FAMILY_IPMR,
+ .net = net,
+ },
+ .dev = vif->dev,
+ .vif_index = vif_index,
+ .vif_flags = vif->flags,
+ .tb_id = tb_id,
+ };
+
+ ASSERT_RTNL();
+ net->ipv4.ipmr_seq++;
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
+static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
+ struct net *net,
+ enum fib_event_type event_type,
+ struct mfc_cache *mfc, u32 tb_id)
+{
+ struct mfc_entry_notifier_info info = {
+ .info = {
+ .family = RTNL_FAMILY_IPMR,
+ .net = net,
+ },
+ .mfc = mfc,
+ .tb_id = tb_id
+ };
+
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_ipmr_mfc_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct mfc_cache *mfc, u32 tb_id)
+{
+ struct mfc_entry_notifier_info info = {
+ .info = {
+ .family = RTNL_FAMILY_IPMR,
+ .net = net,
+ },
+ .mfc = mfc,
+ .tb_id = tb_id
+ };
+
+ ASSERT_RTNL();
+ net->ipv4.ipmr_seq++;
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
/**
* vif_delete - Delete a VIF entry
* @notify: Set to 1, if the caller is a notifier_call
@@ -594,6 +702,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
struct list_head *head)
{
+ struct net *net = read_pnet(&mrt->net);
struct vif_device *v;
struct net_device *dev;
struct in_device *in_dev;
@@ -603,6 +712,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
v = &mrt->vif_table[vifi];
+ if (VIF_EXISTS(mrt, vifi))
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
+ mrt->id);
+
write_lock_bh(&mrt_lock);
dev = v->dev;
v->dev = NULL;
@@ -652,10 +765,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head)
kmem_cache_free(mrt_cachep, c);
}
-static inline void ipmr_cache_free(struct mfc_cache *c)
+void ipmr_cache_free(struct mfc_cache *c)
{
call_rcu(&c->rcu, ipmr_cache_free_rcu);
}
+EXPORT_SYMBOL(ipmr_cache_free);
/* Destroy an unresolved cache entry, killing queued skbs
* and reporting error to netlink readers.
@@ -689,9 +803,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
}
/* Timer process for the unresolved queue. */
-static void ipmr_expire_process(unsigned long arg)
+static void ipmr_expire_process(struct timer_list *t)
{
- struct mr_table *mrt = (struct mr_table *)arg;
+ struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
unsigned long now;
unsigned long expires;
struct mfc_cache *c, *next;
@@ -754,6 +868,9 @@ static int vif_add(struct net *net, struct mr_table *mrt,
struct vifctl *vifc, int mrtsock)
{
int vifi = vifc->vifc_vifi;
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ };
struct vif_device *v = &mrt->vif_table[vifi];
struct net_device *dev;
struct in_device *in_dev;
@@ -828,6 +945,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
/* Fill in the VIF structures */
+ attr.orig_dev = dev;
+ if (!switchdev_port_attr_get(dev, &attr)) {
+ memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len);
+ v->dev_parent_id.id_len = attr.u.ppid.id_len;
+ } else {
+ v->dev_parent_id.id_len = 0;
+ }
v->rate_limit = vifc->vifc_rate_limit;
v->local = vifc->vifc_lcl_addr.s_addr;
v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -851,6 +975,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
if (vifi+1 > mrt->maxvif)
mrt->maxvif = vifi+1;
write_unlock_bh(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
return 0;
}
@@ -949,6 +1074,7 @@ static struct mfc_cache *ipmr_cache_alloc(void)
if (c) {
c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
c->mfc_un.res.minvif = MAXVIFS;
+ refcount_set(&c->mfc_un.res.refcount, 1);
}
return c;
}
@@ -1150,6 +1276,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
+ struct net *net = read_pnet(&mrt->net);
struct mfc_cache *c;
/* The entries are added/deleted only under RTNL */
@@ -1161,8 +1288,9 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
return -ENOENT;
rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
list_del_rcu(&c->list);
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
+ ipmr_cache_put(c);
return 0;
}
@@ -1189,6 +1317,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
write_unlock_bh(&mrt_lock);
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
+ mrt->id);
mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
@@ -1238,6 +1368,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
ipmr_cache_resolve(net, mrt, uc, c);
ipmr_cache_free(uc);
}
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id);
mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
@@ -1245,6 +1376,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
/* Close the multicast socket, and clear the vif tables etc */
static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
+ struct net *net = read_pnet(&mrt->net);
struct mfc_cache *c, *tmp;
LIST_HEAD(list);
int i;
@@ -1263,8 +1395,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
continue;
rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
list_del_rcu(&c->list);
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c,
+ mrt->id);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
+ ipmr_cache_put(c);
}
if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
@@ -1393,6 +1527,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
case MRT_ADD_MFC:
case MRT_DEL_MFC:
parent = -1;
+ /* fall through */
case MRT_ADD_MFC_PROXY:
case MRT_DEL_MFC_PROXY:
if (optlen != sizeof(mfc)) {
@@ -1724,10 +1859,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
return dst_output(net, sk, skb);
}
+#ifdef CONFIG_NET_SWITCHDEV
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+ int in_vifi, int out_vifi)
+{
+ struct vif_device *out_vif = &mrt->vif_table[out_vifi];
+ struct vif_device *in_vif = &mrt->vif_table[in_vifi];
+
+ if (!skb->offload_mr_fwd_mark)
+ return false;
+ if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
+ return false;
+ return netdev_phys_item_id_same(&out_vif->dev_parent_id,
+ &in_vif->dev_parent_id);
+}
+#else
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+ int in_vifi, int out_vifi)
+{
+ return false;
+}
+#endif
+
/* Processing handlers for ipmr_forward */
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *c, int vifi)
+ int in_vifi, struct sk_buff *skb,
+ struct mfc_cache *c, int vifi)
{
const struct iphdr *iph = ip_hdr(skb);
struct vif_device *vif = &mrt->vif_table[vifi];
@@ -1748,6 +1906,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
goto out_free;
}
+ if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
+ goto out_free;
+
if (vif->flags & VIFF_TUNNEL) {
rt = ip_route_output_ports(net, &fl4, NULL,
vif->remote, vif->local,
@@ -1925,8 +2086,8 @@ forward:
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(net, mrt, skb2, cache,
- psend);
+ ipmr_queue_xmit(net, mrt, true_vifi,
+ skb2, cache, psend);
}
psend = ct;
}
@@ -1937,9 +2098,10 @@ last_forward:
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(net, mrt, skb2, cache, psend);
+ ipmr_queue_xmit(net, mrt, true_vifi, skb2,
+ cache, psend);
} else {
- ipmr_queue_xmit(net, mrt, skb, cache, psend);
+ ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend);
return;
}
}
@@ -2156,6 +2318,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
return -EMSGSIZE;
+ if (c->mfc_flags & MFC_OFFLOAD)
+ rtm->rtm_flags |= RTNH_F_OFFLOAD;
+
if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
return -EMSGSIZE;
@@ -3048,14 +3213,87 @@ static const struct net_protocol pim_protocol = {
};
#endif
+static unsigned int ipmr_seq_read(struct net *net)
+{
+ ASSERT_RTNL();
+
+ return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
+}
+
+static int ipmr_dump(struct net *net, struct notifier_block *nb)
+{
+ struct mr_table *mrt;
+ int err;
+
+ err = ipmr_rules_dump(net, nb);
+ if (err)
+ return err;
+
+ ipmr_for_each_table(mrt, net) {
+ struct vif_device *v = &mrt->vif_table[0];
+ struct mfc_cache *mfc;
+ int vifi;
+
+ /* Notifiy on table VIF entries */
+ read_lock(&mrt_lock);
+ for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
+ if (!v->dev)
+ continue;
+
+ call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
+ v, vifi, mrt->id);
+ }
+ read_unlock(&mrt_lock);
+
+ /* Notify on table MFC entries */
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+ call_ipmr_mfc_entry_notifier(nb, net,
+ FIB_EVENT_ENTRY_ADD, mfc,
+ mrt->id);
+ }
+
+ return 0;
+}
+
+static const struct fib_notifier_ops ipmr_notifier_ops_template = {
+ .family = RTNL_FAMILY_IPMR,
+ .fib_seq_read = ipmr_seq_read,
+ .fib_dump = ipmr_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ipmr_notifier_init(struct net *net)
+{
+ struct fib_notifier_ops *ops;
+
+ net->ipv4.ipmr_seq = 0;
+
+ ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ net->ipv4.ipmr_notifier_ops = ops;
+
+ return 0;
+}
+
+static void __net_exit ipmr_notifier_exit(struct net *net)
+{
+ fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
+ net->ipv4.ipmr_notifier_ops = NULL;
+}
+
/* Setup for IP multicast routing */
static int __net_init ipmr_net_init(struct net *net)
{
int err;
+ err = ipmr_notifier_init(net);
+ if (err)
+ goto ipmr_notifier_fail;
+
err = ipmr_rules_init(net);
if (err < 0)
- goto fail;
+ goto ipmr_rules_fail;
#ifdef CONFIG_PROC_FS
err = -ENOMEM;
@@ -3072,7 +3310,9 @@ proc_cache_fail:
proc_vif_fail:
ipmr_rules_exit(net);
#endif
-fail:
+ipmr_rules_fail:
+ ipmr_notifier_exit(net);
+ipmr_notifier_fail:
return err;
}
@@ -3082,6 +3322,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
remove_proc_entry("ip_mr_cache", net->proc_net);
remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
+ ipmr_notifier_exit(net);
ipmr_rules_exit(net);
}
@@ -3114,14 +3355,14 @@ int __init ip_mr_init(void)
}
#endif
rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
- ipmr_rtm_getroute, ipmr_rtm_dumproute, NULL);
+ ipmr_rtm_getroute, ipmr_rtm_dumproute, 0);
rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
- ipmr_rtm_route, NULL, NULL);
+ ipmr_rtm_route, NULL, 0);
rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
- ipmr_rtm_route, NULL, NULL);
+ ipmr_rtm_route, NULL, 0);
rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK,
- NULL, ipmr_rtm_dumplink, NULL);
+ NULL, ipmr_rtm_dumplink, 0);
return 0;
#ifdef CONFIG_IP_PIMSM_V2
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index f462fee66ac8..adcdae358365 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the netfilter modules on top of IPv4.
#
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 0bc3c3d73e61..f88221aebc9d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -268,14 +268,14 @@ unsigned int arpt_do_table(struct sk_buff *skb,
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
- /* Target might have changed stuff. */
- arp = arp_hdr(skb);
-
- if (verdict == XT_CONTINUE)
+ if (verdict == XT_CONTINUE) {
+ /* Target might have changed stuff. */
+ arp = arp_hdr(skb);
e = arpt_next_entry(e);
- else
+ } else {
/* Verdict */
break;
+ }
} while (!acpar.hotdrop);
xt_write_recseq_end(addend);
local_bh_enable();
@@ -629,7 +629,27 @@ static void get_counters(const struct xt_table_info *t,
ADD_COUNTER(counters[i], bcnt, pcnt);
++i;
+ cond_resched();
+ }
+ }
+}
+
+static void get_old_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct arpt_entry *iter;
+ unsigned int cpu, i;
+
+ for_each_possible_cpu(cpu) {
+ i = 0;
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt);
+ ++i;
}
+ cond_resched();
}
}
@@ -909,8 +929,7 @@ static int __do_replace(struct net *net, const char *name,
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
- /* Get the old counters, and synchronize with replace */
- get_counters(oldinfo, counters);
+ get_old_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries;
@@ -1117,7 +1136,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
struct xt_table_info *newinfo, unsigned char *base)
{
struct xt_entry_target *t;
- struct xt_target *target;
struct arpt_entry *de;
unsigned int origsize;
int h;
@@ -1132,7 +1150,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
de->target_offset = e->target_offset - (origsize - *size);
t = compat_arpt_get_target(e);
- target = t->u.kernel.target;
xt_compat_target_from_user(t, dstptr, size);
de->next_offset = e->next_offset - (origsize - *size);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 2a55a40211cb..4cbe5e80f3bf 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -35,12 +35,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv4 packet filter");
-#ifdef CONFIG_NETFILTER_DEBUG
-#define IP_NF_ASSERT(x) WARN_ON(!(x))
-#else
-#define IP_NF_ASSERT(x)
-#endif
-
void *ipt_alloc_initial_table(const struct xt_table *info)
{
return xt_alloc_initial_table(ipt, IPT);
@@ -151,7 +145,7 @@ static const char *const comments[] = {
[NF_IP_TRACE_COMMENT_POLICY] = "policy",
};
-static struct nf_loginfo trace_loginfo = {
+static const struct nf_loginfo trace_loginfo = {
.type = NF_LOG_TYPE_LOG,
.u = {
.log = {
@@ -263,7 +257,7 @@ ipt_do_table(struct sk_buff *skb,
acpar.hotdrop = false;
acpar.state = state;
- IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+ WARN_ON(!(table->valid_hooks & (1 << hook)));
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
@@ -293,7 +287,7 @@ ipt_do_table(struct sk_buff *skb,
const struct xt_entry_match *ematch;
struct xt_counters *counter;
- IP_NF_ASSERT(e);
+ WARN_ON(!e);
if (!ip_packet_match(ip, indev, outdev,
&e->ip, acpar.fragoff)) {
no_match:
@@ -312,7 +306,7 @@ ipt_do_table(struct sk_buff *skb,
ADD_COUNTER(*counter, skb->len, 1);
t = ipt_get_target(e);
- IP_NF_ASSERT(t->u.kernel.target);
+ WARN_ON(!t->u.kernel.target);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
@@ -352,13 +346,14 @@ ipt_do_table(struct sk_buff *skb,
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
- /* Target might have changed stuff. */
- ip = ip_hdr(skb);
- if (verdict == XT_CONTINUE)
+ if (verdict == XT_CONTINUE) {
+ /* Target might have changed stuff. */
+ ip = ip_hdr(skb);
e = ipt_next_entry(e);
- else
+ } else {
/* Verdict */
break;
+ }
} while (!acpar.hotdrop);
xt_write_recseq_end(addend);
@@ -781,10 +776,31 @@ get_counters(const struct xt_table_info *t,
ADD_COUNTER(counters[i], bcnt, pcnt);
++i; /* macro does multi eval of i */
+ cond_resched();
}
}
}
+static void get_old_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct ipt_entry *iter;
+ unsigned int cpu, i;
+
+ for_each_possible_cpu(cpu) {
+ i = 0;
+ xt_entry_foreach(iter, t->entries, t->size) {
+ const struct xt_counters *tmp;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt);
+ ++i; /* macro does multi eval of i */
+ }
+
+ cond_resched();
+ }
+}
+
static struct xt_counters *alloc_counters(const struct xt_table *table)
{
unsigned int countersize;
@@ -1074,8 +1090,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
- /* Get the old counters, and synchronize with replace */
- get_counters(oldinfo, counters);
+ get_old_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
@@ -1355,7 +1370,6 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
struct xt_table_info *newinfo, unsigned char *base)
{
struct xt_entry_target *t;
- struct xt_target *target;
struct ipt_entry *de;
unsigned int origsize;
int h;
@@ -1374,7 +1388,6 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
de->target_offset = e->target_offset - (origsize - *size);
t = compat_ipt_get_target(e);
- target = t->u.kernel.target;
xt_compat_target_from_user(t, dstptr, size);
de->next_offset = e->next_offset - (origsize - *size);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 7d72decb80f9..17b4ca562944 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -117,7 +117,8 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
* functions are also incrementing the refcount on their own,
* so it's safe to remove the entry even if it's in use. */
#ifdef CONFIG_PROC_FS
- proc_remove(c->pde);
+ if (cn->procdir)
+ proc_remove(c->pde);
#endif
return;
}
@@ -624,7 +625,7 @@ arp_mangle(void *priv,
return NF_ACCEPT;
}
-static struct nf_hook_ops cip_arp_ops __read_mostly = {
+static const struct nf_hook_ops cip_arp_ops = {
.hook = arp_mangle,
.pf = NFPROTO_ARP,
.hooknum = NF_ARP_OUT,
@@ -815,6 +816,7 @@ static void clusterip_net_exit(struct net *net)
#ifdef CONFIG_PROC_FS
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
proc_remove(cn->procdir);
+ cn->procdir = NULL;
#endif
nf_unregister_net_hook(net, &cip_arp_ops);
}
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index f1528f7175a8..f75fc6b53115 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv,
if (synproxy == NULL)
return NF_ACCEPT;
- if (nf_is_loopback_packet(skb))
+ if (nf_is_loopback_packet(skb) ||
+ ip_hdr(skb)->protocol != IPPROTO_TCP)
return NF_ACCEPT;
thoff = ip_hdrlen(skb);
@@ -416,7 +417,7 @@ static unsigned int ipv4_synproxy_hook(void *priv,
return NF_ACCEPT;
}
-static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
+static const struct nf_hook_ops ipv4_synproxy_ops[] = {
{
.hook = ipv4_synproxy_hook,
.pf = NFPROTO_IPV4,
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 138a24bc76ad..a1a07b338ccf 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -67,7 +67,7 @@ static unsigned int iptable_nat_ipv4_local_fn(void *priv,
return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
}
-static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
+static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
/* Before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_in,
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 2e14ed11a35c..89af9d88ca21 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -63,13 +63,6 @@ static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
return true;
}
-static void ipv4_print_tuple(struct seq_file *s,
- const struct nf_conntrack_tuple *tuple)
-{
- seq_printf(s, "src=%pI4 dst=%pI4 ",
- &tuple->src.u3.ip, &tuple->dst.u3.ip);
-}
-
static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
unsigned int *dataoff, u_int8_t *protonum)
{
@@ -174,7 +167,7 @@ static unsigned int ipv4_conntrack_local(void *priv,
/* Connection tracking may drop packets, but never alters them, so
make it the first hook. */
-static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
+static const struct nf_hook_ops ipv4_conntrack_ops[] = {
{
.hook = ipv4_conntrack_in,
.pf = NFPROTO_IPV4,
@@ -303,11 +296,6 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
return 0;
}
-
-static int ipv4_nlattr_tuple_size(void)
-{
- return nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1);
-}
#endif
static struct nf_sockopt_ops so_getorigdst = {
@@ -356,18 +344,17 @@ static void ipv4_hooks_unregister(struct net *net)
mutex_unlock(&register_ipv4_hooks);
}
-struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
+const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
.l3proto = PF_INET,
- .name = "ipv4",
.pkt_to_tuple = ipv4_pkt_to_tuple,
.invert_tuple = ipv4_invert_tuple,
- .print_tuple = ipv4_print_tuple,
.get_l4proto = ipv4_get_l4proto,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = ipv4_tuple_to_nlattr,
- .nlattr_tuple_size = ipv4_nlattr_tuple_size,
.nlattr_to_tuple = ipv4_nlattr_to_tuple,
.nla_policy = ipv4_nla_policy,
+ .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */
+ NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */
#endif
.net_ns_get = ipv4_hooks_register,
.net_ns_put = ipv4_hooks_unregister,
@@ -398,24 +385,12 @@ static struct nf_conntrack_l4proto *builtin_l4proto4[] = {
static int ipv4_net_init(struct net *net)
{
- int ret = 0;
-
- ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- if (ret < 0)
- return ret;
- ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: pernet registration failed\n");
- nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- }
- return ret;
+ return nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
}
static void ipv4_net_exit(struct net *net)
{
- nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
ARRAY_SIZE(builtin_l4proto4));
}
@@ -433,6 +408,11 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
need_conntrack();
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) !=
+ nf_conntrack_l3proto_ipv4.nla_size))
+ return -EINVAL;
+#endif
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
pr_err("Unable to register netfilter socket option\n");
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 73c591d8a9a8..1849fedd9b81 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -71,16 +71,6 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
return true;
}
-/* Print out the per-protocol part of the tuple. */
-static void icmp_print_tuple(struct seq_file *s,
- const struct nf_conntrack_tuple *tuple)
-{
- seq_printf(s, "type=%u code=%u id=%u ",
- tuple->dst.u.icmp.type,
- tuple->dst.u.icmp.code,
- ntohs(tuple->src.u.icmp.id));
-}
-
static unsigned int *icmp_get_timeouts(struct net *net)
{
return &icmp_pernet(net)->timeout;
@@ -91,8 +81,6 @@ static int icmp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- u_int8_t pf,
- unsigned int hooknum,
unsigned int *timeout)
{
/* Do not immediately delete the connection after the first
@@ -137,7 +125,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
- NF_CT_ASSERT(!skb_nfct(skb));
+ WARN_ON(skb_nfct(skb));
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */
@@ -176,6 +164,12 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
return NF_ACCEPT;
}
+static void icmp_error_log(const struct sk_buff *skb, struct net *net,
+ u8 pf, const char *msg)
+{
+ nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg);
+}
+
/* Small and modified version of icmp_rcv */
static int
icmp_error(struct net *net, struct nf_conn *tmpl,
@@ -188,18 +182,14 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
/* Not enough header? */
icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
if (icmph == NULL) {
- if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(net, PF_INET, 0, skb, NULL, NULL,
- NULL, "nf_ct_icmp: short packet ");
+ icmp_error_log(skb, net, pf, "short packet");
return -NF_ACCEPT;
}
/* See ip_conntrack_proto_tcp.c */
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_ip_checksum(skb, hooknum, dataoff, 0)) {
- if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: bad HW ICMP checksum ");
+ icmp_error_log(skb, net, pf, "bad hw icmp checksum");
return -NF_ACCEPT;
}
@@ -210,9 +200,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
* discarded.
*/
if (icmph->type > NR_ICMP_TYPES) {
- if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: invalid ICMP type ");
+ icmp_error_log(skb, net, pf, "invalid icmp type");
return -NF_ACCEPT;
}
@@ -270,9 +258,14 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[],
return 0;
}
-static int icmp_nlattr_tuple_size(void)
+static unsigned int icmp_nlattr_tuple_size(void)
{
- return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1);
+ static unsigned int size __read_mostly;
+
+ if (!size)
+ size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1);
+
+ return size;
}
#endif
@@ -362,10 +355,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
{
.l3proto = PF_INET,
.l4proto = IPPROTO_ICMP,
- .name = "icmp",
.pkt_to_tuple = icmp_pkt_to_tuple,
.invert_tuple = icmp_invert_tuple,
- .print_tuple = icmp_print_tuple,
.packet = icmp_packet,
.get_timeouts = icmp_get_timeouts,
.new = icmp_new,
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 346bf7ccac08..37fe1616ca0b 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -90,7 +90,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
return NF_ACCEPT;
}
-static struct nf_hook_ops ipv4_defrag_ops[] = {
+static const struct nf_hook_ops ipv4_defrag_ops[] = {
{
.hook = ipv4_conntrack_defrag,
.pf = NFPROTO_IPV4,
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index 2f3895ddc275..df5c2a2061a4 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -25,7 +25,7 @@
#include <linux/netfilter/xt_LOG.h>
#include <net/netfilter/nf_log.h>
-static struct nf_loginfo default_loginfo = {
+static const struct nf_loginfo default_loginfo = {
.type = NF_LOG_TYPE_LOG,
.u = {
.log = {
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index c83a9963269b..4388de0e5380 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -24,7 +24,7 @@
#include <linux/netfilter/xt_LOG.h>
#include <net/netfilter/nf_log.h>
-static struct nf_loginfo default_loginfo = {
+static const struct nf_loginfo default_loginfo = {
.type = NF_LOG_TYPE_LOG,
.u = {
.log = {
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 574f7ebba0b6..ac8342dcb55e 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -252,16 +252,16 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
if (set_h245_addr(skb, protoff, data, dataoff, taddr,
&ct->tuplehash[!dir].tuple.dst.u3,
htons((port & htons(1)) ? nated_port + 1 :
- nated_port)) == 0) {
- /* Save ports */
- info->rtp_port[i][dir] = rtp_port;
- info->rtp_port[i][!dir] = htons(nated_port);
- } else {
+ nated_port))) {
nf_ct_unexpect_related(rtp_exp);
nf_ct_unexpect_related(rtcp_exp);
return -1;
}
+ /* Save ports */
+ info->rtp_port[i][dir] = rtp_port;
+ info->rtp_port[i][!dir] = htons(nated_port);
+
/* Success */
pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n",
&rtp_exp->tuple.src.u3.ip,
@@ -370,15 +370,15 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
/* Modify signal */
if (set_h225_addr(skb, protoff, data, dataoff, taddr,
&ct->tuplehash[!dir].tuple.dst.u3,
- htons(nated_port)) == 0) {
- /* Save ports */
- info->sig_port[dir] = port;
- info->sig_port[!dir] = htons(nated_port);
- } else {
+ htons(nated_port))) {
nf_ct_unexpect_related(exp);
return -1;
}
+ /* Save ports */
+ info->sig_port[dir] = port;
+ info->sig_port[!dir] = htons(nated_port);
+
pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n",
&exp->tuple.src.u3.ip,
ntohs(exp->tuple.src.u.tcp.port),
@@ -462,24 +462,27 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
/* Modify signal */
if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
&ct->tuplehash[!dir].tuple.dst.u3,
- htons(nated_port)) == 0) {
- /* Save ports */
- info->sig_port[dir] = port;
- info->sig_port[!dir] = htons(nated_port);
-
- /* Fix for Gnomemeeting */
- if (idx > 0 &&
- get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
- (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
- set_h225_addr(skb, protoff, data, 0, &taddr[0],
- &ct->tuplehash[!dir].tuple.dst.u3,
- info->sig_port[!dir]);
- }
- } else {
+ htons(nated_port))) {
nf_ct_unexpect_related(exp);
return -1;
}
+ /* Save ports */
+ info->sig_port[dir] = port;
+ info->sig_port[!dir] = htons(nated_port);
+
+ /* Fix for Gnomemeeting */
+ if (idx > 0 &&
+ get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
+ (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
+ if (set_h225_addr(skb, protoff, data, 0, &taddr[0],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ info->sig_port[!dir])) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+ }
+
/* Success */
pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n",
&exp->tuple.src.u3.ip,
@@ -550,9 +553,9 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
}
/* Modify signal */
- if (!set_h225_addr(skb, protoff, data, dataoff, taddr,
- &ct->tuplehash[!dir].tuple.dst.u3,
- htons(nated_port)) == 0) {
+ if (set_h225_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port))) {
nf_ct_unexpect_related(exp);
return -1;
}
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index feedd759ca80..0443ca4120b0 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -190,7 +190,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
struct nf_conntrack_tuple target;
unsigned long statusbit;
- NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
+ WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
return 0;
@@ -276,7 +276,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
else
return NF_ACCEPT;
}
- /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ /* Only ICMPs can be IP_CT_IS_REPLY: */
+ /* fall through */
case IP_CT_NEW:
/* Seen it before? This can happen for loopback, retrans,
* or local packets.
@@ -306,8 +307,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
default:
/* ESTABLISHED */
- NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
- ctinfo == IP_CT_ESTABLISHED_REPLY);
+ WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY);
if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
goto oif_changed;
}
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index f39037fca923..0c366aad89cb 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -34,12 +34,12 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
const struct rtable *rt;
__be32 newsrc, nh;
- NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
+ WARN_ON(hooknum != NF_INET_POST_ROUTING);
ct = nf_ct_get(skb, &ctinfo);
- NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
- ctinfo == IP_CT_RELATED_REPLY));
+ WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY)));
/* Source address is 0.0.0.0 - locally generated packet that is
* probably not supposed to be masqueraded.
@@ -96,7 +96,7 @@ static int masq_device_event(struct notifier_block *this,
* conntracks which were associated with that device,
* and forget them.
*/
- NF_CT_ASSERT(dev->ifindex != 0);
+ WARN_ON(dev->ifindex == 0);
nf_ct_iterate_cleanup_net(net, device_cmp,
(void *)(long)dev->ifindex, 0, 0);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index eeacbdaf7cdf..5cd06ba3535d 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -132,6 +132,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
+ niph = ip_hdr(nskb);
+
/* "Never happens" */
if (nskb->len > dst_mtu(skb_dst(nskb)))
goto free_nskb;
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index de3681df2ce7..e50976e3c213 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -32,9 +32,10 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_fib *priv = nft_expr_priv(expr);
+ int noff = skb_network_offset(pkt->skb);
u32 *dst = &regs->data[priv->dreg];
const struct net_device *dev = NULL;
- const struct iphdr *iph;
+ struct iphdr *iph, _iph;
__be32 addr;
if (priv->flags & NFTA_FIB_F_IIF)
@@ -42,7 +43,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
else if (priv->flags & NFTA_FIB_F_OIF)
dev = nft_out(pkt);
- iph = ip_hdr(pkt->skb);
+ iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
+ if (!iph) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
if (priv->flags & NFTA_FIB_F_DADDR)
addr = iph->daddr;
else
@@ -61,8 +67,9 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_fib *priv = nft_expr_priv(expr);
+ int noff = skb_network_offset(pkt->skb);
u32 *dest = &regs->data[priv->dreg];
- const struct iphdr *iph;
+ struct iphdr *iph, _iph;
struct fib_result res;
struct flowi4 fl4 = {
.flowi4_scope = RT_SCOPE_UNIVERSE,
@@ -95,7 +102,12 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
return;
}
- iph = ip_hdr(pkt->skb);
+ iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
+ if (!iph) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
if (ipv4_is_zeronet(iph->saddr)) {
if (ipv4_is_lbcast(iph->daddr) ||
ipv4_is_local_multicast(iph->daddr)) {
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 43eb6567b3a0..9f37c4727861 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -206,18 +206,12 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
- SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
- SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
- SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
- SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
- SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
- SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
@@ -230,14 +224,12 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
- SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
- SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index b0bb5d0a30bd..33b70bfd1122 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -122,7 +122,8 @@ void raw_unhash_sk(struct sock *sk)
EXPORT_SYMBOL_GPL(raw_unhash_sk);
struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
- unsigned short num, __be32 raddr, __be32 laddr, int dif)
+ unsigned short num, __be32 raddr, __be32 laddr,
+ int dif, int sdif)
{
sk_for_each_from(sk) {
struct inet_sock *inet = inet_sk(sk);
@@ -130,7 +131,8 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
!(inet->inet_daddr && inet->inet_daddr != raddr) &&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+ sk->sk_bound_dev_if != sdif))
goto found; /* gotcha */
}
sk = NULL;
@@ -171,6 +173,7 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
*/
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
{
+ int sdif = inet_sdif(skb);
struct sock *sk;
struct hlist_head *head;
int delivered = 0;
@@ -184,13 +187,13 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
net = dev_net(skb->dev);
sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
iph->saddr, iph->daddr,
- skb->dev->ifindex);
+ skb->dev->ifindex, sdif);
while (sk) {
delivered = 1;
if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
- skb->dev->ifindex)) {
+ skb->dev->ifindex, sdif)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
@@ -199,7 +202,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
}
sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
iph->saddr, iph->daddr,
- skb->dev->ifindex);
+ skb->dev->ifindex, sdif);
}
out:
read_unlock(&raw_v4_hashinfo.lock);
@@ -297,12 +300,15 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
read_lock(&raw_v4_hashinfo.lock);
raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
if (raw_sk) {
+ int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
+
iph = (const struct iphdr *)skb->data;
net = dev_net(skb->dev);
while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
iph->daddr, iph->saddr,
- skb->dev->ifindex)) != NULL) {
+ dif, sdif)) != NULL) {
raw_err(raw_sk, skb, info);
raw_sk = sk_next(raw_sk);
iph = (const struct iphdr *)skb->data;
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index e1a51ca68d23..c200065ef9a5 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -46,13 +46,13 @@ static struct sock *raw_lookup(struct net *net, struct sock *from,
sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
r->id.idiag_dst[0],
r->id.idiag_src[0],
- r->id.idiag_if);
+ r->id.idiag_if, 0);
#if IS_ENABLED(CONFIG_IPV6)
else
sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
(const struct in6_addr *)r->id.idiag_src,
(const struct in6_addr *)r->id.idiag_dst,
- r->id.idiag_if);
+ r->id.idiag_if, 0);
#endif
return sk;
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0383e66f59bc..43b69af242e1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -495,7 +495,7 @@ u32 ip_idents_reserve(u32 hash, int segs)
{
u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
- u32 old = ACCESS_ONCE(*p_tstamp);
+ u32 old = READ_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
u32 new, delta = 0;
@@ -651,9 +651,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe;
struct rtable *rt;
+ u32 genid, hval;
unsigned int i;
int depth;
- u32 hval = fnhe_hashfun(daddr);
+
+ genid = fnhe_genid(dev_net(nh->nh_dev));
+ hval = fnhe_hashfun(daddr);
spin_lock_bh(&fnhe_lock);
@@ -676,12 +679,13 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
}
if (fnhe) {
+ if (fnhe->fnhe_genid != genid)
+ fnhe->fnhe_genid = genid;
if (gw)
fnhe->fnhe_gw = gw;
- if (pmtu) {
+ if (pmtu)
fnhe->fnhe_pmtu = pmtu;
- fnhe->fnhe_expires = max(1UL, expires);
- }
+ fnhe->fnhe_expires = max(1UL, expires);
/* Update all cached dsts too */
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt)
@@ -700,7 +704,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_next = hash->chain;
rcu_assign_pointer(hash->chain, fnhe);
}
- fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
+ fnhe->fnhe_genid = genid;
fnhe->fnhe_daddr = daddr;
fnhe->fnhe_gw = gw;
fnhe->fnhe_pmtu = pmtu;
@@ -1250,7 +1254,7 @@ static void set_class_tag(struct rtable *rt, u32 tag)
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
- unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
+ unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
ip_rt_min_advmss);
return min(advmss, IPV4_MAX_PMTU - header_size);
@@ -1267,7 +1271,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
if (mtu)
return mtu;
- mtu = dst->dev->mtu;
+ mtu = READ_ONCE(dst->dev->mtu);
if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
if (rt->rt_uses_gateway && mtu > 576)
@@ -1398,7 +1402,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
struct rtable *rt = (struct rtable *) dst;
- if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
+ if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
kfree(p);
if (!list_empty(&rt->rt_uncached)) {
@@ -1456,7 +1460,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
if (fi->fib_metrics != &dst_default_metrics) {
rt->dst._metrics |= DST_METRICS_REFCOUNTED;
- atomic_inc(&fi->fib_metrics->refcnt);
+ refcount_inc(&fi->fib_metrics->refcnt);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
@@ -1520,43 +1524,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
EXPORT_SYMBOL(rt_dst_alloc);
/* called in rcu_read_lock() section */
-static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev, int our)
+int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ struct in_device *in_dev, u32 *itag)
{
- struct rtable *rth;
- struct in_device *in_dev = __in_dev_get_rcu(dev);
- unsigned int flags = RTCF_MULTICAST;
- u32 itag = 0;
int err;
/* Primary sanity checks. */
-
if (!in_dev)
return -EINVAL;
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
skb->protocol != htons(ETH_P_IP))
- goto e_inval;
+ return -EINVAL;
if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
- goto e_inval;
+ return -EINVAL;
if (ipv4_is_zeronet(saddr)) {
if (!ipv4_is_local_multicast(daddr))
- goto e_inval;
+ return -EINVAL;
} else {
err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
- in_dev, &itag);
+ in_dev, itag);
if (err < 0)
- goto e_err;
+ return err;
}
+ return 0;
+}
+
+/* called in rcu_read_lock() section */
+static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev, int our)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ unsigned int flags = RTCF_MULTICAST;
+ struct rtable *rth;
+ u32 itag = 0;
+ int err;
+
+ err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
+ if (err)
+ return err;
+
if (our)
flags |= RTCF_LOCAL;
rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
if (!rth)
- goto e_nobufs;
+ return -ENOBUFS;
#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
@@ -1572,13 +1589,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
skb_dst_set(skb, &rth->dst);
return 0;
-
-e_nobufs:
- return -ENOBUFS;
-e_inval:
- return -EINVAL;
-e_err:
- return err;
}
@@ -2236,7 +2246,7 @@ add:
if (!rth)
return ERR_PTR(-ENOBUFS);
- rth->rt_iif = orig_oif ? : 0;
+ rth->rt_iif = orig_oif;
if (res->table)
rth->rt_table_id = res->table->tb_id;
@@ -2439,6 +2449,12 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
/* L3 master device is the loopback for that domain */
dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
net->loopback_dev;
+
+ /* make sure orig_oif points to fib result device even
+ * though packet rx/tx happens over loopback or l3mdev
+ */
+ orig_oif = FIB_RES_OIF(*res);
+
fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
@@ -2501,7 +2517,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
struct rtable *ort = (struct rtable *) dst_orig;
struct rtable *rt;
- rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
+ rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
if (rt) {
struct dst_entry *new = &rt->dst;
@@ -2750,26 +2766,34 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
err = 0;
if (IS_ERR(rt))
err = PTR_ERR(rt);
+ else
+ skb_dst_set(skb, &rt->dst);
}
if (err)
goto errout_free;
- skb_dst_set(skb, &rt->dst);
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
table_id = rt->rt_table_id;
- if (rtm->rtm_flags & RTM_F_FIB_MATCH)
+ if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+ if (!res.fi) {
+ err = fib_props[res.type].error;
+ if (!err)
+ err = -EHOSTUNREACH;
+ goto errout_free;
+ }
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
rt->rt_type, res.prefix, res.prefixlen,
fl4.flowi4_tos, res.fi, 0);
- else
+ } else {
err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+ }
if (err < 0)
goto errout_free;
@@ -3018,7 +3042,6 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
int __init ip_rt_init(void)
{
- int rc = 0;
int cpu;
ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
@@ -3067,14 +3090,15 @@ int __init ip_rt_init(void)
xfrm_init();
xfrm4_init();
#endif
- rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED);
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
#endif
register_pernet_subsys(&rt_genid_ops);
register_pernet_subsys(&ipv4_inetpeer_ops);
- return rc;
+ return 0;
}
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 03ad8778c395..fda37f2862c9 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -355,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
*/
- ireq->opt = tcp_v4_save_options(skb);
+ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb));
if (security_inet_conn_request(sk, skb, req)) {
reqsk_free(req);
@@ -385,7 +385,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
/* Try to redo what tcp_v4_send_synack did. */
req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
- tcp_select_initial_window(tcp_full_space(sk), req->mss,
+ tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 9bf809726066..93e172118a94 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
*
@@ -25,6 +26,7 @@
#include <net/inet_frag.h>
#include <net/ping.h>
#include <net/protocol.h>
+#include <net/netevent.h>
static int zero;
static int one = 1;
@@ -45,6 +47,9 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+/* obsolete */
+static int sysctl_tcp_low_latency __read_mostly;
+
/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
@@ -196,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ struct net *net = container_of(ctl->data, struct net,
+ ipv4.tcp_congestion_control);
char val[TCP_CA_NAME_MAX];
struct ctl_table tbl = {
.data = val,
@@ -203,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
};
int ret;
- tcp_get_default_congestion_control(val);
+ tcp_get_default_congestion_control(net, val);
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (write && ret == 0)
- ret = tcp_set_default_congestion_control(val);
+ ret = tcp_set_default_congestion_control(net, val);
return ret;
}
@@ -248,10 +255,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
return ret;
}
-static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
+static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_fastopen);
struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
struct tcp_fastopen_context *ctxt;
int ret;
@@ -262,7 +271,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
return -ENOMEM;
rcu_read_lock();
- ctxt = rcu_dereference(tcp_fastopen_ctx);
+ ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
if (ctxt)
memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
else
@@ -279,12 +288,8 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
ret = -EINVAL;
goto bad_key;
}
- /* Generate a dummy secret but don't publish it. This
- * is needed so we don't regenerate a new key on the
- * first invocation of tcp_fastopen_cookie_gen
- */
- tcp_fastopen_init_key_once(false);
- tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+ tcp_fastopen_reset_cipher(net, NULL, user_key,
+ TCP_FASTOPEN_KEY_LENGTH);
}
bad_key:
@@ -355,11 +360,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_fastopen_blackhole_timeout);
int ret;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (write && ret == 0)
- tcp_fastopen_active_timeout_reset();
+ atomic_set(&net->ipv4.tfo_active_disable_times, 0);
return ret;
}
@@ -382,15 +389,25 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl,
return ret;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_fib_multipath_hash_policy);
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net);
+
+ return ret;
+}
+#endif
+
static struct ctl_table ipv4_table[] = {
{
- .procname = "tcp_retrans_collapse",
- .data = &sysctl_tcp_retrans_collapse,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_max_orphans",
.data = &sysctl_tcp_max_orphans,
.maxlen = sizeof(int),
@@ -398,48 +415,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_fastopen",
- .data = &sysctl_tcp_fastopen,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tcp_fastopen_key",
- .mode = 0600,
- .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
- .proc_handler = proc_tcp_fastopen_key,
- },
- {
- .procname = "tcp_fastopen_blackhole_timeout_sec",
- .data = &sysctl_tcp_fastopen_blackhole_timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_tfo_blackhole_detect_timeout,
- .extra1 = &zero,
- },
- {
- .procname = "tcp_abort_on_overflow",
- .data = &sysctl_tcp_abort_on_overflow,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_stdurg",
- .data = &sysctl_tcp_stdurg,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_rfc1337",
- .data = &sysctl_tcp_rfc1337,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,
.maxlen = sizeof(int),
@@ -461,34 +436,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
{
- .procname = "tcp_fack",
- .data = &sysctl_tcp_fack,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_recovery",
- .data = &sysctl_tcp_recovery,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tcp_max_reordering",
- .data = &sysctl_tcp_max_reordering,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_dsack",
- .data = &sysctl_tcp_dsack,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_mem",
.maxlen = sizeof(sysctl_tcp_mem),
.data = &sysctl_tcp_mem,
@@ -496,113 +443,12 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_doulongvec_minmax,
},
{
- .procname = "tcp_wmem",
- .data = &sysctl_tcp_wmem,
- .maxlen = sizeof(sysctl_tcp_wmem),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- },
- {
- .procname = "tcp_rmem",
- .data = &sysctl_tcp_rmem,
- .maxlen = sizeof(sysctl_tcp_rmem),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- },
- {
- .procname = "tcp_app_win",
- .data = &sysctl_tcp_app_win,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_adv_win_scale",
- .data = &sysctl_tcp_adv_win_scale,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &tcp_adv_win_scale_min,
- .extra2 = &tcp_adv_win_scale_max,
- },
- {
- .procname = "tcp_frto",
- .data = &sysctl_tcp_frto,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_min_rtt_wlen",
- .data = &sysctl_tcp_min_rtt_wlen,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
- {
- .procname = "tcp_no_metrics_save",
- .data = &sysctl_tcp_nometrics_save,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tcp_moderate_rcvbuf",
- .data = &sysctl_tcp_moderate_rcvbuf,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tcp_tso_win_divisor",
- .data = &sysctl_tcp_tso_win_divisor,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tcp_congestion_control",
- .mode = 0644,
- .maxlen = TCP_CA_NAME_MAX,
- .proc_handler = proc_tcp_congestion_control,
- },
- {
- .procname = "tcp_workaround_signed_windows",
- .data = &sysctl_tcp_workaround_signed_windows,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_limit_output_bytes",
- .data = &sysctl_tcp_limit_output_bytes,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_challenge_ack_limit",
- .data = &sysctl_tcp_challenge_ack_limit,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_slow_start_after_idle",
- .data = &sysctl_tcp_slow_start_after_idle,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
#ifdef CONFIG_NETLABEL
{
.procname = "cipso_cache_enable",
@@ -646,65 +492,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_allowed_congestion_control,
},
{
- .procname = "tcp_thin_linear_timeouts",
- .data = &sysctl_tcp_thin_linear_timeouts,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_early_retrans",
- .data = &sysctl_tcp_early_retrans,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &four,
- },
- {
- .procname = "tcp_min_tso_segs",
- .data = &sysctl_tcp_min_tso_segs,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &gso_max_segs,
- },
- {
- .procname = "tcp_pacing_ss_ratio",
- .data = &sysctl_tcp_pacing_ss_ratio,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &thousand,
- },
- {
- .procname = "tcp_pacing_ca_ratio",
- .data = &sysctl_tcp_pacing_ca_ratio,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &thousand,
- },
- {
- .procname = "tcp_autocorking",
- .data = &sysctl_tcp_autocorking,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
- },
- {
- .procname = "tcp_invalid_ratelimit",
- .data = &sysctl_tcp_invalid_ratelimit,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
- },
- {
.procname = "tcp_available_ulp",
.maxlen = TCP_ULP_BUF_MAX,
.mode = 0444,
@@ -973,6 +760,13 @@ static struct ctl_table ipv4_net_table[] = {
},
#endif
{
+ .procname = "tcp_congestion_control",
+ .data = &init_net.ipv4.tcp_congestion_control,
+ .mode = 0644,
+ .maxlen = TCP_CA_NAME_MAX,
+ .proc_handler = proc_tcp_congestion_control,
+ },
+ {
.procname = "tcp_keepalive_time",
.data = &init_net.ipv4.sysctl_tcp_keepalive_time,
.maxlen = sizeof(int),
@@ -1082,6 +876,28 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "tcp_fastopen",
+ .data = &init_net.ipv4.sysctl_tcp_fastopen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fastopen_key",
+ .mode = 0600,
+ .data = &init_net.ipv4.sysctl_tcp_fastopen,
+ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+ .proc_handler = proc_tcp_fastopen_key,
+ },
+ {
+ .procname = "tcp_fastopen_blackhole_timeout_sec",
+ .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_tfo_blackhole_detect_timeout,
+ .extra1 = &zero,
+ },
#ifdef CONFIG_IP_ROUTE_MULTIPATH
{
.procname = "fib_multipath_use_neigh",
@@ -1097,7 +913,7 @@ static struct ctl_table ipv4_net_table[] = {
.data = &init_net.ipv4.sysctl_fib_multipath_hash_policy,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_fib_multipath_hash_policy,
.extra1 = &zero,
.extra2 = &one,
},
@@ -1141,6 +957,216 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "tcp_early_retrans",
+ .data = &init_net.ipv4.sysctl_tcp_early_retrans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &four,
+ },
+ {
+ .procname = "tcp_recovery",
+ .data = &init_net.ipv4.sysctl_tcp_recovery,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_thin_linear_timeouts",
+ .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_slow_start_after_idle",
+ .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_retrans_collapse",
+ .data = &init_net.ipv4.sysctl_tcp_retrans_collapse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_stdurg",
+ .data = &init_net.ipv4.sysctl_tcp_stdurg,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_rfc1337",
+ .data = &init_net.ipv4.sysctl_tcp_rfc1337,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_abort_on_overflow",
+ .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_fack",
+ .data = &init_net.ipv4.sysctl_tcp_fack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_reordering",
+ .data = &init_net.ipv4.sysctl_tcp_max_reordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_dsack",
+ .data = &init_net.ipv4.sysctl_tcp_dsack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_app_win",
+ .data = &init_net.ipv4.sysctl_tcp_app_win,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_adv_win_scale",
+ .data = &init_net.ipv4.sysctl_tcp_adv_win_scale,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_adv_win_scale_min,
+ .extra2 = &tcp_adv_win_scale_max,
+ },
+ {
+ .procname = "tcp_frto",
+ .data = &init_net.ipv4.sysctl_tcp_frto,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_no_metrics_save",
+ .data = &init_net.ipv4.sysctl_tcp_nometrics_save,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_moderate_rcvbuf",
+ .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_tso_win_divisor",
+ .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_workaround_signed_windows",
+ .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_limit_output_bytes",
+ .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_challenge_ack_limit",
+ .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_min_tso_segs",
+ .data = &init_net.ipv4.sysctl_tcp_min_tso_segs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &gso_max_segs,
+ },
+ {
+ .procname = "tcp_min_rtt_wlen",
+ .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_autocorking",
+ .data = &init_net.ipv4.sysctl_tcp_autocorking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "tcp_invalid_ratelimit",
+ .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "tcp_pacing_ss_ratio",
+ .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &thousand,
+ },
+ {
+ .procname = "tcp_pacing_ca_ratio",
+ .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &thousand,
+ },
+ {
+ .procname = "tcp_wmem",
+ .data = &init_net.ipv4.sysctl_tcp_wmem,
+ .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+ {
+ .procname = "tcp_rmem",
+ .data = &init_net.ipv4.sysctl_tcp_rmem,
+ .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71ce33decd97..bf97317e6c97 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,6 +269,8 @@
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
+#include <linux/errqueue.h>
+#include <linux/static_key.h>
#include <net/icmp.h>
#include <net/inet_common.h>
@@ -281,24 +283,22 @@
#include <asm/ioctls.h>
#include <net/busy_poll.h>
-int sysctl_tcp_min_tso_segs __read_mostly = 2;
-
-int sysctl_tcp_autocorking __read_mostly = 1;
+#include <trace/events/tcp.h>
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
-int sysctl_tcp_wmem[3] __read_mostly;
-int sysctl_tcp_rmem[3] __read_mostly;
-
EXPORT_SYMBOL(sysctl_tcp_mem);
-EXPORT_SYMBOL(sysctl_tcp_rmem);
-EXPORT_SYMBOL(sysctl_tcp_wmem);
atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
+#if IS_ENABLED(CONFIG_SMC)
+DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
+EXPORT_SYMBOL(tcp_have_smc);
+#endif
+
/*
* Current number of TCP sockets.
*/
@@ -388,6 +388,19 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
return period;
}
+static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
+{
+ u32 rate = READ_ONCE(tp->rate_delivered);
+ u32 intv = READ_ONCE(tp->rate_interval_us);
+ u64 rate64 = 0;
+
+ if (rate && intv) {
+ rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+ do_div(rate64, intv);
+ }
+ return rate64;
+}
+
/* Address-family independent initialization for a tcp_sock.
*
* NOTE: A lot of things set to zero explicitly by call to
@@ -399,9 +412,10 @@ void tcp_init_sock(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tp->out_of_order_queue = RB_ROOT;
+ sk->tcp_rtx_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
- tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);
+ INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
@@ -428,6 +442,7 @@ void tcp_init_sock(struct sock *sk)
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
+ tp->rack.reo_wnd_steps = 1;
sk->sk_state = TCP_CLOSE;
@@ -436,15 +451,29 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_sync_mss = tcp_sync_mss;
- sk->sk_sndbuf = sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
sk_sockets_allocated_inc(sk);
}
EXPORT_SYMBOL(tcp_init_sock);
-static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
+void tcp_init_transfer(struct sock *sk, int bpf_op)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_mtup_init(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_init_metrics(sk);
+ tcp_call_bpf(sk, bpf_op);
+ tcp_init_congestion_control(sk);
+ tcp_init_buffer_space(sk);
+}
+
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
+{
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+
if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -662,7 +691,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
int size_goal)
{
return skb->len < size_goal &&
- sysctl_tcp_autocorking &&
+ sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
skb != tcp_write_queue_head(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
@@ -673,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- if (!tcp_send_head(sk))
- return;
-
skb = tcp_write_queue_tail(sk);
+ if (!skb)
+ return;
if (!(flags & MSG_MORE) || forced_push(tp))
tcp_mark_push(tp, skb);
@@ -856,6 +884,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
* available to the caller, no more, no less.
*/
skb->reserved_tailroom = skb->end - skb->tail - size;
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
return skb;
}
__kfree_skb(skb);
@@ -935,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
int copy, i;
bool can_coalesce;
- if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+ if (!skb || (copy = size_goal - skb->len) <= 0 ||
!tcp_skb_can_collapse_to(skb)) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- skb_queue_empty(&sk->sk_write_queue));
+ tcp_rtx_and_write_queues_empty(sk));
if (!skb)
goto wait_for_memory;
@@ -1014,7 +1043,7 @@ wait_for_memory:
out:
if (copied) {
- tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+ tcp_tx_timestamp(sk, sk->sk_tsflags);
if (!(flags & MSG_SENDPAGE_NOTLAST))
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
@@ -1034,23 +1063,29 @@ out_err:
}
EXPORT_SYMBOL_GPL(do_tcp_sendpages);
-int tcp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
{
- ssize_t res;
-
if (!(sk->sk_route_caps & NETIF_F_SG) ||
!sk_check_csum_caps(sk))
- return sock_no_sendpage(sk->sk_socket, page, offset, size,
- flags);
-
- lock_sock(sk);
+ return sock_no_sendpage_locked(sk, page, offset, size, flags);
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
- res = do_tcp_sendpages(sk, page, offset, size, flags);
+ return do_tcp_sendpages(sk, page, offset, size, flags);
+}
+EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
+
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_sendpage_locked(sk, page, offset, size, flags);
release_sock(sk);
- return res;
+
+ return ret;
}
EXPORT_SYMBOL(tcp_sendpage);
@@ -1107,7 +1142,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
struct sockaddr *uaddr = msg->msg_name;
int err, flags;
- if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+ if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
(uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
uaddr->sa_family == AF_UNSPEC))
return -EOPNOTSUPP;
@@ -1144,9 +1179,10 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
return err;
}
-int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct sockcm_cookie sockc;
int flags, err, copied = 0;
@@ -1155,9 +1191,25 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
bool sg;
long timeo;
- lock_sock(sk);
-
flags = msg->msg_flags;
+
+ if (flags & MSG_ZEROCOPY && size) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ skb = tcp_write_queue_tail(sk);
+ uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
+ if (!uarg) {
+ err = -ENOBUFS;
+ goto out_err;
+ }
+
+ if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+ uarg->zerocopy = 0;
+ }
+
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
@@ -1223,7 +1275,7 @@ restart:
int max = size_goal;
skb = tcp_write_queue_tail(sk);
- if (tcp_send_head(sk)) {
+ if (skb) {
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
copy = max - skb->len;
@@ -1243,7 +1295,7 @@ new_segment:
process_backlog = false;
goto restart;
}
- first_skb = skb_queue_empty(&sk->sk_write_queue);
+ first_skb = tcp_rtx_and_write_queues_empty(sk);
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg, first_skb),
sk->sk_allocation,
@@ -1281,7 +1333,7 @@ new_segment:
err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
if (err)
goto do_fault;
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
@@ -1319,6 +1371,13 @@ new_segment:
page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
+ } else {
+ err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+ if (err == -EMSGSIZE || err == -EEXIST)
+ goto new_segment;
+ if (err < 0)
+ goto do_error;
+ copy = err;
}
if (!copied)
@@ -1361,11 +1420,11 @@ wait_for_memory:
out:
if (copied) {
- tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
+ tcp_tx_timestamp(sk, sockc.tsflags);
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
- release_sock(sk);
+ sock_zerocopy_put(uarg);
return copied + copied_syn;
do_fault:
@@ -1382,6 +1441,7 @@ do_error:
if (copied + copied_syn)
goto out;
out_err:
+ sock_zerocopy_put_abort(uarg);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
@@ -1389,9 +1449,20 @@ out_err:
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
- release_sock(sk);
return err;
}
+EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
+
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_sendmsg_locked(sk, msg, size);
+ release_sock(sk);
+
+ return ret;
+}
EXPORT_SYMBOL(tcp_sendmsg);
/*
@@ -1450,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
/* XXX -- need to support SO_PEEK_OFF */
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+ err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+ if (err)
+ return err;
+ copied += skb->len;
+ }
+
skb_queue_walk(&sk->sk_write_queue, skb) {
err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
if (err)
@@ -1525,20 +1603,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}
-static void tcp_prequeue_process(struct sock *sk)
-{
- struct sk_buff *skb;
- struct tcp_sock *tp = tcp_sk(sk);
-
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
-
- while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk_backlog_rcv(sk, skb);
-
- /* Clear memory counter. */
- tp->ucopy.memory = 0;
-}
-
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1652,6 +1716,61 @@ int tcp_peek_len(struct socket *sock)
}
EXPORT_SYMBOL(tcp_peek_len);
+static void tcp_update_recv_tstamps(struct sk_buff *skb,
+ struct scm_timestamping *tss)
+{
+ if (skb->tstamp)
+ tss->ts[0] = ktime_to_timespec(skb->tstamp);
+ else
+ tss->ts[0] = (struct timespec) {0};
+
+ if (skb_hwtstamps(skb)->hwtstamp)
+ tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
+ else
+ tss->ts[2] = (struct timespec) {0};
+}
+
+/* Similar to __sock_recv_timestamp, but does not require an skb */
+void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+ struct scm_timestamping *tss)
+{
+ struct timeval tv;
+ bool has_timestamping = false;
+
+ if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
+ if (sock_flag(sk, SOCK_RCVTSTAMP)) {
+ if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
+ sizeof(tss->ts[0]), &tss->ts[0]);
+ } else {
+ tv.tv_sec = tss->ts[0].tv_sec;
+ tv.tv_usec = tss->ts[0].tv_nsec / 1000;
+
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+ sizeof(tv), &tv);
+ }
+ }
+
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
+ has_timestamping = true;
+ else
+ tss->ts[0] = (struct timespec) {0};
+ }
+
+ if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
+ has_timestamping = true;
+ else
+ tss->ts[2] = (struct timespec) {0};
+ }
+
+ if (has_timestamping) {
+ tss->ts[1] = (struct timespec) {0};
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
+ sizeof(*tss), tss);
+ }
+}
+
/*
* This routine copies from a sock struct into the user buffer.
*
@@ -1671,9 +1790,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int err;
int target; /* Read at least this many bytes */
long timeo;
- struct task_struct *user_recv = NULL;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
+ struct scm_timestamping tss;
+ bool has_tss = false;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
@@ -1806,51 +1926,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
tcp_cleanup_rbuf(sk, copied);
- if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
- /* Install new reader */
- if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
- user_recv = current;
- tp->ucopy.task = user_recv;
- tp->ucopy.msg = msg;
- }
-
- tp->ucopy.len = len;
-
- WARN_ON(tp->copied_seq != tp->rcv_nxt &&
- !(flags & (MSG_PEEK | MSG_TRUNC)));
-
- /* Ugly... If prequeue is not empty, we have to
- * process it before releasing socket, otherwise
- * order will be broken at second iteration.
- * More elegant solution is required!!!
- *
- * Look: we have the following (pseudo)queues:
- *
- * 1. packets in flight
- * 2. backlog
- * 3. prequeue
- * 4. receive_queue
- *
- * Each queue can be processed only if the next ones
- * are empty. At this point we have empty receive_queue.
- * But prequeue _can_ be not empty after 2nd iteration,
- * when we jumped to start of loop because backlog
- * processing added something to receive_queue.
- * We cannot release_sock(), because backlog contains
- * packets arrived _after_ prequeued ones.
- *
- * Shortly, algorithm is clear --- to process all
- * the queues in order. We could make it more directly,
- * requeueing packets from backlog to prequeue, if
- * is not empty. It is more elegant, but eats cycles,
- * unfortunately.
- */
- if (!skb_queue_empty(&tp->ucopy.prequeue))
- goto do_prequeue;
-
- /* __ Set realtime policy in scheduler __ */
- }
-
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
@@ -1859,31 +1934,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
sk_wait_data(sk, &timeo, last);
}
- if (user_recv) {
- int chunk;
-
- /* __ Restore normal policy in scheduler __ */
-
- chunk = len - tp->ucopy.len;
- if (chunk != 0) {
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
- len -= chunk;
- copied += chunk;
- }
-
- if (tp->rcv_nxt == tp->copied_seq &&
- !skb_queue_empty(&tp->ucopy.prequeue)) {
-do_prequeue:
- tcp_prequeue_process(sk);
-
- chunk = len - tp->ucopy.len;
- if (chunk != 0) {
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
- len -= chunk;
- copied += chunk;
- }
- }
- }
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
@@ -1941,6 +1991,10 @@ skip_copy:
if (used + offset < skb->len)
continue;
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, &tss);
+ has_tss = true;
+ }
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
@@ -1955,29 +2009,13 @@ skip_copy:
break;
} while (len > 0);
- if (user_recv) {
- if (!skb_queue_empty(&tp->ucopy.prequeue)) {
- int chunk;
-
- tp->ucopy.len = copied > 0 ? len : 0;
-
- tcp_prequeue_process(sk);
-
- if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
- len -= chunk;
- copied += chunk;
- }
- }
-
- tp->ucopy.task = NULL;
- tp->ucopy.len = 0;
- }
-
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
+ if (has_tss)
+ tcp_recv_timestamp(msg, sk, &tss);
+
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
@@ -2002,6 +2040,8 @@ void tcp_set_state(struct sock *sk, int state)
{
int oldstate = sk->sk_state;
+ trace_tcp_set_state(sk, oldstate, state);
+
switch (state) {
case TCP_ESTABLISHED:
if (oldstate != TCP_ESTABLISHED)
@@ -2289,6 +2329,37 @@ static inline bool tcp_need_reset(int state)
TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+ struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+ /* Since we are deleting whole queue, no need to
+ * list_del(&skb->tcp_tsorted_anchor)
+ */
+ tcp_rtx_queue_unlink(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+ }
+}
+
+void tcp_write_queue_purge(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ tcp_skb_tsorted_anchor_cleanup(skb);
+ sk_wmem_free_skb(sk, skb);
+ }
+ tcp_rtx_queue_purge(sk);
+ INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
+ sk_mem_reclaim(sk);
+ tcp_clear_all_retrans_hints(tcp_sk(sk));
+}
+
int tcp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
@@ -2347,7 +2418,6 @@ int tcp_disconnect(struct sock *sk, int flags)
* issue in __tcp_select_window()
*/
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
- tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
dst_release(sk->sk_rx_dst);
@@ -2439,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk,
return -EINVAL;
tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
- if (sysctl_tcp_fack)
- tcp_enable_fack(tp);
break;
case TCPOPT_TIMESTAMP:
if (opt.opt_val != 0)
@@ -2481,7 +2549,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
name[val] = 0;
lock_sock(sk);
- err = tcp_set_congestion_control(sk, name, true);
+ err = tcp_set_congestion_control(sk, name, true, true);
release_sock(sk);
return err;
}
@@ -2503,6 +2571,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
release_sock(sk);
return err;
}
+ case TCP_FASTOPEN_KEY: {
+ __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+ if (optlen != sizeof(key))
+ return -EINVAL;
+
+ if (copy_from_user(key, optval, optlen))
+ return -EFAULT;
+
+ return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+ }
default:
/* fallthru */
break;
@@ -2734,7 +2813,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_FASTOPEN:
if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
TCPF_LISTEN))) {
- tcp_fastopen_init_key_once(true);
+ tcp_fastopen_init_key_once(net);
fastopen_queue_tune(sk, val);
} else {
@@ -2744,7 +2823,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_FASTOPEN_CONNECT:
if (val > 1 || val < 0) {
err = -EINVAL;
- } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
if (sk->sk_state == TCP_CLOSE)
tp->fastopen_connect = val;
else
@@ -2753,6 +2832,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EOPNOTSUPP;
}
break;
+ case TCP_FASTOPEN_NO_COOKIE:
+ if (val > 1 || val < 0)
+ err = -EINVAL;
+ else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ err = -EINVAL;
+ else
+ tp->fastopen_no_cookie = val;
+ break;
case TCP_TIMESTAMP:
if (!tp->repair)
err = -EPERM;
@@ -2823,7 +2910,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now, intv;
+ u32 now;
u64 rate64;
bool slow;
u32 rate;
@@ -2890,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out;
- info->tcpi_fackets = tp->fackets_out;
now = tcp_jiffies32;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
@@ -2922,13 +3008,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_data_segs_out = tp->data_segs_out;
info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
- rate = READ_ONCE(tp->rate_delivered);
- intv = READ_ONCE(tp->rate_interval_us);
- if (rate && intv) {
- rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
- do_div(rate64, intv);
+ rate64 = tcp_compute_delivery_rate(tp);
+ if (rate64)
info->tcpi_delivery_rate = rate64;
- }
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2938,8 +3020,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
struct tcp_info info;
+ u64 rate64;
+ u32 rate;
- stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+ stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
+ 3 * nla_total_size(sizeof(u32)) +
+ 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
if (!stats)
return NULL;
@@ -2954,6 +3040,20 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
tp->data_segs_out, TCP_NLA_PAD);
nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
tp->total_retrans, TCP_NLA_PAD);
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
+
+ rate64 = tcp_compute_delivery_rate(tp);
+ nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
+
+ nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
+ nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
+ nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+
+ nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
+ nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
return stats;
}
@@ -3075,6 +3175,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
+ case TCP_FASTOPEN_KEY: {
+ __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ struct tcp_fastopen_context *ctx;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
+ if (ctx)
+ memcpy(key, ctx->key, sizeof(key));
+ else
+ len = 0;
+ rcu_read_unlock();
+
+ len = min_t(unsigned int, len, sizeof(key));
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, key, len))
+ return -EFAULT;
+ return 0;
+ }
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
@@ -3137,6 +3259,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->fastopen_connect;
break;
+ case TCP_FASTOPEN_NO_COOKIE:
+ val = tp->fastopen_no_cookie;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp_raw() + tp->tsoffset;
break;
@@ -3502,13 +3628,13 @@ void __init tcp_init(void)
max_wshare = min(4UL*1024*1024, limit);
max_rshare = min(6UL*1024*1024, limit);
- sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
- sysctl_tcp_wmem[1] = 16*1024;
- sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+ init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+ init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
- sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
- sysctl_tcp_rmem[1] = 87380;
- sysctl_tcp_rmem[2] = max(87380, max_rshare);
+ init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
+ init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 609965f0e298..fc3614377413 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -49,7 +49,6 @@ MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wma
struct bictcp {
u32 cnt; /* increase cwnd by 1 after ACKs */
u32 last_max_cwnd; /* last maximum snd_cwnd */
- u32 loss_cwnd; /* congestion window at last loss */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
u32 epoch_start; /* beginning of an epoch */
@@ -72,7 +71,6 @@ static void bictcp_init(struct sock *sk)
struct bictcp *ca = inet_csk_ca(sk);
bictcp_reset(ca);
- ca->loss_cwnd = 0;
if (initial_ssthresh)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
@@ -172,22 +170,12 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
else
ca->last_max_cwnd = tp->snd_cwnd;
- ca->loss_cwnd = tp->snd_cwnd;
-
if (tp->snd_cwnd <= low_window)
return max(tp->snd_cwnd >> 1U, 2U);
else
return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
-static u32 bictcp_undo_cwnd(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- const struct bictcp *ca = inet_csk_ca(sk);
-
- return max(tp->snd_cwnd, ca->loss_cwnd);
-}
-
static void bictcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Loss)
@@ -214,7 +202,7 @@ static struct tcp_congestion_ops bictcp __read_mostly = {
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
.set_state = bictcp_state,
- .undo_cwnd = bictcp_undo_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.pkts_acked = bictcp_acked,
.owner = THIS_MODULE,
.name = "bic",
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 50a0f3e51d5b..06fbe102a425 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -85,7 +85,6 @@ struct cdg {
u8 state;
u8 delack;
u32 rtt_seq;
- u32 undo_cwnd;
u32 shadow_wnd;
u16 backoff_cnt;
u16 sample_cnt;
@@ -330,8 +329,6 @@ static u32 tcp_cdg_ssthresh(struct sock *sk)
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- ca->undo_cwnd = tp->snd_cwnd;
-
if (ca->state == CDG_BACKOFF)
return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10);
@@ -344,13 +341,6 @@ static u32 tcp_cdg_ssthresh(struct sock *sk)
return max(2U, tp->snd_cwnd >> 1);
}
-static u32 tcp_cdg_undo_cwnd(struct sock *sk)
-{
- struct cdg *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd);
-}
-
static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
struct cdg *ca = inet_csk_ca(sk);
@@ -399,11 +389,11 @@ static void tcp_cdg_release(struct sock *sk)
kfree(ca->gradients);
}
-struct tcp_congestion_ops tcp_cdg __read_mostly = {
+static struct tcp_congestion_ops tcp_cdg __read_mostly = {
.cong_avoid = tcp_cdg_cong_avoid,
.cwnd_event = tcp_cdg_cwnd_event,
.pkts_acked = tcp_cdg_acked,
- .undo_cwnd = tcp_cdg_undo_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.ssthresh = tcp_cdg_ssthresh,
.release = tcp_cdg_release,
.init = tcp_cdg_init,
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index fde983f6376b..bc6c02f16243 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
}
/* Must be called with rcu lock held */
-static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
+ const char *name)
{
- const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+ struct tcp_congestion_ops *ca = tcp_ca_find(name);
+
#ifdef CONFIG_MODULES
if (!ca && capable(CAP_NET_ADMIN)) {
rcu_read_unlock();
@@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
-u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
+u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
u32 key = TCP_CA_UNSPEC;
@@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
might_sleep();
rcu_read_lock();
- ca = __tcp_ca_find_autoload(name);
+ ca = tcp_ca_find_autoload(net, name);
if (ca) {
key = ca->key;
*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
@@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
+ struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_congestion_ops *ca;
+ const struct tcp_congestion_ops *ca;
rcu_read_lock();
- list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
- if (likely(try_module_get(ca->owner))) {
- icsk->icsk_ca_ops = ca;
- goto out;
- }
- /* Fallback to next available. The last really
- * guaranteed fallback is Reno from this list.
- */
- }
-out:
+ ca = rcu_dereference(net->ipv4.tcp_congestion_control);
+ if (unlikely(!try_module_get(ca->owner)))
+ ca = &tcp_reno;
+ icsk->icsk_ca_ops = ca;
rcu_read_unlock();
- memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
INET_ECN_xmit(sk);
else
@@ -189,8 +186,8 @@ void tcp_init_congestion_control(struct sock *sk)
INET_ECN_dontxmit(sk);
}
-void tcp_reinit_congestion_control(struct sock *sk,
- const struct tcp_congestion_ops *ca)
+static void tcp_reinit_congestion_control(struct sock *sk,
+ const struct tcp_congestion_ops *ca)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk)
}
/* Used by sysctl to change default congestion control */
-int tcp_set_default_congestion_control(const char *name)
+int tcp_set_default_congestion_control(struct net *net, const char *name)
{
struct tcp_congestion_ops *ca;
- int ret = -ENOENT;
-
- spin_lock(&tcp_cong_list_lock);
- ca = tcp_ca_find(name);
-#ifdef CONFIG_MODULES
- if (!ca && capable(CAP_NET_ADMIN)) {
- spin_unlock(&tcp_cong_list_lock);
+ const struct tcp_congestion_ops *prev;
+ int ret;
- request_module("tcp_%s", name);
- spin_lock(&tcp_cong_list_lock);
- ca = tcp_ca_find(name);
- }
-#endif
+ rcu_read_lock();
+ ca = tcp_ca_find_autoload(net, name);
+ if (!ca) {
+ ret = -ENOENT;
+ } else if (!try_module_get(ca->owner)) {
+ ret = -EBUSY;
+ } else {
+ prev = xchg(&net->ipv4.tcp_congestion_control, ca);
+ if (prev)
+ module_put(prev->owner);
- if (ca) {
- ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */
- list_move(&ca->list, &tcp_cong_list);
+ ca->flags |= TCP_CONG_NON_RESTRICTED;
ret = 0;
}
- spin_unlock(&tcp_cong_list_lock);
+ rcu_read_unlock();
return ret;
}
@@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name)
/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
- return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
+ return tcp_set_default_congestion_control(&init_net,
+ CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);
@@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
}
/* Get current default congestion control */
-void tcp_get_default_congestion_control(char *name)
+void tcp_get_default_congestion_control(struct net *net, char *name)
{
- struct tcp_congestion_ops *ca;
- /* We will always have reno... */
- BUG_ON(list_empty(&tcp_cong_list));
+ const struct tcp_congestion_ops *ca;
rcu_read_lock();
- ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
+ ca = rcu_dereference(net->ipv4.tcp_congestion_control);
strncpy(name, ca->name, TCP_CA_NAME_MAX);
rcu_read_unlock();
}
@@ -338,7 +332,7 @@ out:
* tcp_reinit_congestion_control (if the current congestion control was
* already initialized.
*/
-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_congestion_ops *ca;
@@ -351,18 +345,29 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
if (!load)
ca = tcp_ca_find(name);
else
- ca = __tcp_ca_find_autoload(name);
+ ca = tcp_ca_find_autoload(sock_net(sk), name);
+
/* No change asking for existing value */
if (ca == icsk->icsk_ca_ops) {
icsk->icsk_ca_setsockopt = 1;
goto out;
}
+
if (!ca) {
err = -ENOENT;
} else if (!load) {
- icsk->icsk_ca_ops = ca;
- if (!try_module_get(ca->owner))
+ const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;
+
+ if (try_module_get(ca->owner)) {
+ if (reinit) {
+ tcp_reinit_congestion_control(sk, ca);
+ } else {
+ icsk->icsk_ca_ops = ca;
+ module_put(old_ca->owner);
+ }
+ } else {
err = -EBUSY;
+ }
} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
err = -EPERM;
@@ -456,7 +461,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+ return max(tp->snd_cwnd, tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 57ae5b5ae643..78bfadfcf342 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -83,7 +83,6 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse
struct bictcp {
u32 cnt; /* increase cwnd by 1 after ACKs */
u32 last_max_cwnd; /* last maximum snd_cwnd */
- u32 loss_cwnd; /* congestion window at last loss */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
u32 bic_origin_point;/* origin point of bic function */
@@ -142,7 +141,6 @@ static void bictcp_init(struct sock *sk)
struct bictcp *ca = inet_csk_ca(sk);
bictcp_reset(ca);
- ca->loss_cwnd = 0;
if (hystart)
bictcp_hystart_reset(sk);
@@ -366,18 +364,9 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
else
ca->last_max_cwnd = tp->snd_cwnd;
- ca->loss_cwnd = tp->snd_cwnd;
-
return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
-static u32 bictcp_undo_cwnd(struct sock *sk)
-{
- struct bictcp *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
static void bictcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Loss) {
@@ -470,7 +459,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
.set_state = bictcp_state,
- .undo_cwnd = bictcp_undo_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = bictcp_cwnd_event,
.pkts_acked = bictcp_acked,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index a748c74aa8b7..abbf0edcf6c2 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -16,6 +16,7 @@
#include <linux/tcp.h>
+#include <net/netlink.h>
#include <net/tcp.h>
static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@@ -36,6 +37,100 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
tcp_get_info(sk, info);
}
+#ifdef CONFIG_TCP_MD5SIG
+static void tcp_diag_md5sig_fill(struct tcp_diag_md5sig *info,
+ const struct tcp_md5sig_key *key)
+{
+ info->tcpm_family = key->family;
+ info->tcpm_prefixlen = key->prefixlen;
+ info->tcpm_keylen = key->keylen;
+ memcpy(info->tcpm_key, key->key, key->keylen);
+
+ if (key->family == AF_INET)
+ info->tcpm_addr[0] = key->addr.a4.s_addr;
+ #if IS_ENABLED(CONFIG_IPV6)
+ else if (key->family == AF_INET6)
+ memcpy(&info->tcpm_addr, &key->addr.a6,
+ sizeof(info->tcpm_addr));
+ #endif
+}
+
+static int tcp_diag_put_md5sig(struct sk_buff *skb,
+ const struct tcp_md5sig_info *md5sig)
+{
+ const struct tcp_md5sig_key *key;
+ struct tcp_diag_md5sig *info;
+ struct nlattr *attr;
+ int md5sig_count = 0;
+
+ hlist_for_each_entry_rcu(key, &md5sig->head, node)
+ md5sig_count++;
+ if (md5sig_count == 0)
+ return 0;
+
+ attr = nla_reserve(skb, INET_DIAG_MD5SIG,
+ md5sig_count * sizeof(struct tcp_diag_md5sig));
+ if (!attr)
+ return -EMSGSIZE;
+
+ info = nla_data(attr);
+ memset(info, 0, md5sig_count * sizeof(struct tcp_diag_md5sig));
+ hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ tcp_diag_md5sig_fill(info++, key);
+ if (--md5sig_count == 0)
+ break;
+ }
+
+ return 0;
+}
+#endif
+
+static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
+ struct sk_buff *skb)
+{
+#ifdef CONFIG_TCP_MD5SIG
+ if (net_admin) {
+ struct tcp_md5sig_info *md5sig;
+ int err = 0;
+
+ rcu_read_lock();
+ md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
+ if (md5sig)
+ err = tcp_diag_put_md5sig(skb, md5sig);
+ rcu_read_unlock();
+ if (err < 0)
+ return err;
+ }
+#endif
+
+ return 0;
+}
+
+static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
+{
+ size_t size = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (net_admin && sk_fullsock(sk)) {
+ const struct tcp_md5sig_info *md5sig;
+ const struct tcp_md5sig_key *key;
+ size_t md5sig_count = 0;
+
+ rcu_read_lock();
+ md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
+ if (md5sig) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node)
+ md5sig_count++;
+ }
+ rcu_read_unlock();
+ size += nla_total_size(md5sig_count *
+ sizeof(struct tcp_diag_md5sig));
+ }
+#endif
+
+ return size;
+}
+
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
@@ -68,13 +163,15 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler tcp_diag_handler = {
- .dump = tcp_diag_dump,
- .dump_one = tcp_diag_dump_one,
- .idiag_get_info = tcp_diag_get_info,
- .idiag_type = IPPROTO_TCP,
- .idiag_info_size = sizeof(struct tcp_info),
+ .dump = tcp_diag_dump,
+ .dump_one = tcp_diag_dump_one,
+ .idiag_get_info = tcp_diag_get_info,
+ .idiag_get_aux = tcp_diag_get_aux,
+ .idiag_get_aux_size = tcp_diag_get_aux_size,
+ .idiag_type = IPPROTO_TCP,
+ .idiag_info_size = sizeof(struct tcp_info),
#ifdef CONFIG_INET_DIAG_DESTROY
- .destroy = tcp_diag_destroy,
+ .destroy = tcp_diag_destroy,
#endif
};
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ce9c7fef200f..78c192ee03a4 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/init.h>
@@ -9,15 +10,18 @@
#include <net/inetpeer.h>
#include <net/tcp.h>
-int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
-
-struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
-
-static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
-
-void tcp_fastopen_init_key_once(bool publish)
+void tcp_fastopen_init_key_once(struct net *net)
{
- static u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ struct tcp_fastopen_context *ctxt;
+
+ rcu_read_lock();
+ ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
+ if (ctxt) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
/* tcp_fastopen_reset_cipher publishes the new context
* atomically, so we allow this race happening here.
@@ -25,8 +29,8 @@ void tcp_fastopen_init_key_once(bool publish)
* All call sites of tcp_fastopen_cookie_gen also check
* for a valid cookie, so this is an acceptable risk.
*/
- if (net_get_random_once(key, sizeof(key)) && publish)
- tcp_fastopen_reset_cipher(key, sizeof(key));
+ get_random_bytes(key, sizeof(key));
+ tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key));
}
static void tcp_fastopen_ctx_free(struct rcu_head *head)
@@ -37,10 +41,37 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head)
kfree(ctx);
}
-int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+void tcp_fastopen_destroy_cipher(struct sock *sk)
+{
+ struct tcp_fastopen_context *ctx;
+
+ ctx = rcu_dereference_protected(
+ inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1);
+ if (ctx)
+ call_rcu(&ctx->rcu, tcp_fastopen_ctx_free);
+}
+
+void tcp_fastopen_ctx_destroy(struct net *net)
+{
+ struct tcp_fastopen_context *ctxt;
+
+ spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
+
+ ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
+ lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
+ rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL);
+ spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
+
+ if (ctxt)
+ call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
+}
+
+int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
+ void *key, unsigned int len)
{
- int err;
struct tcp_fastopen_context *ctx, *octx;
+ struct fastopen_queue *q;
+ int err;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -61,26 +92,37 @@ error: kfree(ctx);
}
memcpy(ctx->key, key, len);
- spin_lock(&tcp_fastopen_ctx_lock);
- octx = rcu_dereference_protected(tcp_fastopen_ctx,
- lockdep_is_held(&tcp_fastopen_ctx_lock));
- rcu_assign_pointer(tcp_fastopen_ctx, ctx);
- spin_unlock(&tcp_fastopen_ctx_lock);
+ spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
+ if (sk) {
+ q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+ octx = rcu_dereference_protected(q->ctx,
+ lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
+ rcu_assign_pointer(q->ctx, ctx);
+ } else {
+ octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
+ lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
+ rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
+ }
+ spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
if (octx)
call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
return err;
}
-static bool __tcp_fastopen_cookie_gen(const void *path,
+static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path,
struct tcp_fastopen_cookie *foc)
{
struct tcp_fastopen_context *ctx;
bool ok = false;
rcu_read_lock();
- ctx = rcu_dereference(tcp_fastopen_ctx);
+
+ ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
+ if (!ctx)
+ ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
+
if (ctx) {
crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
foc->len = TCP_FASTOPEN_COOKIE_SIZE;
@@ -96,7 +138,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path,
*
* XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
*/
-static bool tcp_fastopen_cookie_gen(struct request_sock *req,
+static bool tcp_fastopen_cookie_gen(struct sock *sk,
+ struct request_sock *req,
struct sk_buff *syn,
struct tcp_fastopen_cookie *foc)
{
@@ -104,7 +147,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
const struct iphdr *iph = ip_hdr(syn);
__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
- return __tcp_fastopen_cookie_gen(path, foc);
+ return __tcp_fastopen_cookie_gen(sk, path, foc);
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -112,13 +155,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
const struct ipv6hdr *ip6h = ipv6_hdr(syn);
struct tcp_fastopen_cookie tmp;
- if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
+ if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) {
struct in6_addr *buf = &tmp.addr;
int i;
for (i = 0; i < 4; i++)
buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
- return __tcp_fastopen_cookie_gen(buf, foc);
+ return __tcp_fastopen_cookie_gen(sk, buf, foc);
}
}
#endif
@@ -171,7 +214,6 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct sk_buff *skb,
- struct dst_entry *dst,
struct request_sock *req)
{
struct tcp_sock *tp;
@@ -217,12 +259,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
refcount_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
- inet_csk(child)->icsk_af_ops->rebuild_header(child);
- tcp_init_congestion_control(child);
- tcp_mtup_init(child);
- tcp_init_metrics(child);
- tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
- tcp_init_buffer_space(child);
+ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
@@ -272,6 +309,15 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
return true;
}
+static bool tcp_fastopen_no_cookie(const struct sock *sk,
+ const struct dst_entry *dst,
+ int flag)
+{
+ return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) ||
+ tcp_sk(sk)->fastopen_no_cookie ||
+ (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE));
+}
+
/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
* may be updated and return the client in the SYN-ACK later. E.g., Fast Open
* cookie request (foc->len == 0).
@@ -279,27 +325,29 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst)
+ const struct dst_entry *dst)
{
- struct tcp_fastopen_cookie valid_foc = { .len = -1 };
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+ int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
+ struct tcp_fastopen_cookie valid_foc = { .len = -1 };
struct sock *child;
if (foc->len == 0) /* Client requests a cookie */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
- if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+ if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
(syn_data || foc->len >= 0) &&
tcp_fastopen_queue_check(sk))) {
foc->len = -1;
return NULL;
}
- if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
+ if (syn_data &&
+ tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
goto fastopen;
if (foc->len >= 0 && /* Client presents or requests a cookie */
- tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
+ tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) &&
foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
foc->len == valid_foc.len &&
!memcmp(foc->val, valid_foc.val, foc->len)) {
@@ -312,7 +360,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
* data in SYN_RECV state.
*/
fastopen:
- child = tcp_fastopen_create_child(sk, skb, dst, req);
+ child = tcp_fastopen_create_child(sk, skb, req);
if (child) {
foc->len = -1;
NET_INC_STATS(sock_net(sk),
@@ -332,6 +380,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie)
{
unsigned long last_syn_loss = 0;
+ const struct dst_entry *dst;
int syn_loss = 0;
tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
@@ -349,7 +398,9 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
return false;
}
- if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+ dst = __sk_dst_get(sk);
+
+ if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) {
cookie->len = -1;
return true;
}
@@ -403,25 +454,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect);
* TFO connection with data exchanges.
*/
-/* Default to 1hr */
-unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60;
-static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0);
-static unsigned long tfo_active_disable_stamp __read_mostly;
-
/* Disable active TFO and record current jiffies and
* tfo_active_disable_times
*/
void tcp_fastopen_active_disable(struct sock *sk)
{
- atomic_inc(&tfo_active_disable_times);
- tfo_active_disable_stamp = jiffies;
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE);
-}
+ struct net *net = sock_net(sk);
-/* Reset tfo_active_disable_times to 0 */
-void tcp_fastopen_active_timeout_reset(void)
-{
- atomic_set(&tfo_active_disable_times, 0);
+ atomic_inc(&net->ipv4.tfo_active_disable_times);
+ net->ipv4.tfo_active_disable_stamp = jiffies;
+ NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
}
/* Calculate timeout for tfo active disable
@@ -430,17 +472,18 @@ void tcp_fastopen_active_timeout_reset(void)
*/
bool tcp_fastopen_active_should_disable(struct sock *sk)
{
- int tfo_da_times = atomic_read(&tfo_active_disable_times);
- int multiplier;
+ unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout;
+ int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
unsigned long timeout;
+ int multiplier;
if (!tfo_da_times)
return false;
/* Limit timout to max: 2^6 * initial timeout */
multiplier = 1 << min(tfo_da_times - 1, 6);
- timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ;
- if (time_before(jiffies, tfo_active_disable_stamp + timeout))
+ timeout = multiplier * tfo_bh_timeout * HZ;
+ if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout))
return true;
/* Mark check bit so we can check for successful active TFO
@@ -459,27 +502,25 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct rb_node *p;
- struct sk_buff *skb;
struct dst_entry *dst;
+ struct sk_buff *skb;
if (!tp->syn_fastopen)
return;
if (!tp->data_segs_in) {
- p = rb_first(&tp->out_of_order_queue);
- if (p && !rb_next(p)) {
- skb = rb_entry(p, struct sk_buff, rbnode);
+ skb = skb_rb_first(&tp->out_of_order_queue);
+ if (skb && !skb_rb_next(skb)) {
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
tcp_fastopen_active_disable(sk);
return;
}
}
} else if (tp->syn_fastopen_ch &&
- atomic_read(&tfo_active_disable_times)) {
+ atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
dst = sk_dst_get(sk);
if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
- tcp_fastopen_active_timeout_reset();
+ atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
dst_release(dst);
}
}
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 6d9879e93648..d1c33c91eadc 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -94,7 +94,6 @@ static const struct hstcp_aimd_val {
struct hstcp {
u32 ai;
- u32 loss_cwnd;
};
static void hstcp_init(struct sock *sk)
@@ -153,22 +152,14 @@ static u32 hstcp_ssthresh(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
- ca->loss_cwnd = tp->snd_cwnd;
/* Do multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
}
-static u32 hstcp_cwnd_undo(struct sock *sk)
-{
- const struct hstcp *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
- .undo_cwnd = hstcp_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = hstcp_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 3eb78cde6ff0..082d479462fa 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -66,7 +66,6 @@ static inline void htcp_reset(struct htcp *ca)
static u32 htcp_cwnd_undo(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
if (ca->undo_last_cong) {
@@ -76,7 +75,7 @@ static u32 htcp_cwnd_undo(struct sock *sk)
ca->undo_last_cong = 0;
}
- return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta);
+ return tcp_reno_undo_cwnd(sk);
}
static inline void measure_rtt(struct sock *sk, u32 srtt)
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 60352ff4f5a8..7c843578f233 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -48,7 +48,6 @@ struct illinois {
u32 end_seq; /* right edge of current RTT */
u32 alpha; /* Additive increase */
u32 beta; /* Muliplicative decrease */
- u32 loss_cwnd; /* cwnd on loss */
u16 acked; /* # packets acked by current ACK */
u8 rtt_above; /* average rtt has gone above threshold */
u8 rtt_low; /* # of rtts measurements below threshold */
@@ -297,18 +296,10 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
- ca->loss_cwnd = tp->snd_cwnd;
/* Multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
}
-static u32 tcp_illinois_cwnd_undo(struct sock *sk)
-{
- const struct illinois *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
/* Extract info for Tcp socket info provided via netlink. */
static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info)
@@ -336,7 +327,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
- .undo_cwnd = tcp_illinois_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_illinois_cong_avoid,
.set_state = tcp_illinois_state,
.get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2920e0cb09f8..734cfc8ff76e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -75,25 +76,10 @@
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>
+#include <trace/events/tcp.h>
+#include <linux/static_key.h>
-int sysctl_tcp_fack __read_mostly;
-int sysctl_tcp_max_reordering __read_mostly = 300;
-int sysctl_tcp_dsack __read_mostly = 1;
-int sysctl_tcp_app_win __read_mostly = 31;
-int sysctl_tcp_adv_win_scale __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
-
-/* rfc5961 challenge ack rate limiting */
-int sysctl_tcp_challenge_ack_limit = 1000;
-
-int sysctl_tcp_stdurg __read_mostly;
-int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
-int sysctl_tcp_frto __read_mostly = 2;
-int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
-int sysctl_tcp_early_retrans __read_mostly = 3;
-int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -107,13 +93,14 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
@@ -333,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
- sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+ sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -366,8 +353,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Optimize this! */
- int truesize = tcp_win_from_space(skb->truesize) >> 1;
- int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
+ int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
+ int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
@@ -392,7 +379,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
/* Check #2. Increase window, if skb with such overhead
* will fit to rcvbuf in future.
*/
- if (tcp_win_from_space(skb->truesize) <= skb->len)
+ if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
incr = 2 * tp->advmss;
else
incr = __tcp_grow_window(sk, skb);
@@ -418,11 +405,11 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
* Allow enough cushion so that sender is not limited by our window
*/
- if (sysctl_tcp_moderate_rcvbuf)
+ if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)
rcvmem <<= 2;
if (sk->sk_rcvbuf < rcvmem)
- sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
+ sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
}
/* 4. Try to fixup all. It is made immediately after connection enters
@@ -430,6 +417,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
*/
void tcp_init_buffer_space(struct sock *sk)
{
+ int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
@@ -448,14 +436,14 @@ void tcp_init_buffer_space(struct sock *sk)
if (tp->window_clamp >= maxwin) {
tp->window_clamp = maxwin;
- if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
+ if (tcp_app_win && maxwin > 4 * tp->advmss)
tp->window_clamp = max(maxwin -
- (maxwin >> sysctl_tcp_app_win),
+ (maxwin >> tcp_app_win),
4 * tp->advmss);
}
/* Force reservation of one segment. */
- if (sysctl_tcp_app_win &&
+ if (tcp_app_win &&
tp->window_clamp > 2 * tp->advmss &&
tp->window_clamp + tp->advmss > maxwin)
tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
@@ -469,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct net *net = sock_net(sk);
icsk->icsk_ack.quick = 0;
- if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- sysctl_tcp_rmem[2]);
+ net->ipv4.sysctl_tcp_rmem[2]);
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -608,7 +597,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
* <prev RTT . ><current RTT .. ><next RTT .... >
*/
- if (sysctl_tcp_moderate_rcvbuf &&
+ if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvwin, rcvmem, rcvbuf;
@@ -632,10 +621,11 @@ void tcp_rcv_space_adjust(struct sock *sk)
}
rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
- while (tcp_win_from_space(rcvmem) < tp->advmss)
+ while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
rcvmem += 128;
- rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+ rcvbuf = min(rcvwin / tp->advmss * rcvmem,
+ sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
sk->sk_rcvbuf = rcvbuf;
@@ -779,15 +769,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->srtt_us = max(1U, srtt);
}
-/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
- * Note: TCP stack does not yet implement pacing.
- * FQ packet scheduler can be used to implement cheap but effective
- * TCP pacing, to smooth the burst on large writes when packets
- * in flight is significantly lower than cwnd (or rwin)
- */
-int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
-int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
-
static void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -805,21 +786,21 @@ static void tcp_update_pacing_rate(struct sock *sk)
* end of slow start and should slow down.
*/
if (tp->snd_cwnd < tp->snd_ssthresh / 2)
- rate *= sysctl_tcp_pacing_ss_ratio;
+ rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
else
- rate *= sysctl_tcp_pacing_ca_ratio;
+ rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
rate *= max(tp->snd_cwnd, tp->packets_out);
if (likely(tp->srtt_us))
do_div(rate, tp->srtt_us);
- /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+ /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
* without any lock. We want to make sure compiler wont store
* intermediate values in this location.
*/
- ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
- sk->sk_max_pacing_rate);
+ WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
+ sk->sk_max_pacing_rate));
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -861,60 +842,46 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
-/*
- * Packet counting of FACK is based on in-order assumptions, therefore TCP
- * disables it when reordering is detected
- */
-void tcp_disable_fack(struct tcp_sock *tp)
-{
- /* RFC3517 uses different metric in lost marker => reset on change */
- if (tcp_is_fack(tp))
- tp->lost_skb_hint = NULL;
- tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
-}
-
/* Take a notice that peer is sending D-SACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
+ tp->rack.dsack_seen = 1;
}
-static void tcp_update_reordering(struct sock *sk, const int metric,
- const int ts)
+/* It's reordering when higher sequence was delivered (i.e. sacked) before
+ * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
+ * distance is approximated in full-mss packet distance ("reordering").
+ */
+static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
+ const int ts)
{
struct tcp_sock *tp = tcp_sk(sk);
- int mib_idx;
+ const u32 mss = tp->mss_cache;
+ u32 fack, metric;
- if (WARN_ON_ONCE(metric < 0))
+ fack = tcp_highest_sack_seq(tp);
+ if (!before(low_seq, fack))
return;
- if (metric > tp->reordering) {
- tp->reordering = min(sysctl_tcp_max_reordering, metric);
-
+ metric = fack - low_seq;
+ if ((metric > tp->reordering * mss) && mss) {
#if FASTRETRANS_DEBUG > 1
pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
tp->reordering,
- tp->fackets_out,
+ 0,
tp->sacked_out,
tp->undo_marker ? tp->undo_retrans : 0);
#endif
- tcp_disable_fack(tp);
+ tp->reordering = min_t(u32, (metric + mss - 1) / mss,
+ sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
}
tp->rack.reord = 1;
-
/* This exciting event is worth to be remembered. 8) */
- if (ts)
- mib_idx = LINUX_MIB_TCPTSREORDER;
- else if (tcp_is_reno(tp))
- mib_idx = LINUX_MIB_TCPRENOREORDER;
- else if (tcp_is_fack(tp))
- mib_idx = LINUX_MIB_TCPFACKREORDER;
- else
- mib_idx = LINUX_MIB_TCPSACKREORDER;
-
- NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk),
+ ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
/* This must be called before lost_out is incremented */
@@ -988,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
* 3. Loss detection event of two flavors:
* A. Scoreboard estimator decided the packet is lost.
* A'. Reno "three dupacks" marks head of queue lost.
- * A''. Its FACK modification, head until snd.fack is lost.
* B. SACK arrives sacking SND.NXT at the moment, when the
* segment was retransmitted.
* 4. D-SACK added new rule: D-SACK changes any tag to S.
@@ -1131,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
}
struct tcp_sacktag_state {
- int reord;
- int fack_count;
+ u32 reord;
/* Timestamps for earliest and latest never-retransmitted segment
* that was SACKed. RTO needs the earliest RTT to stay conservative,
* but congestion control should still get an accurate delay signal.
@@ -1141,6 +1106,7 @@ struct tcp_sacktag_state {
u64 last_sackt;
struct rate_sample *rate;
int flag;
+ unsigned int mss_now;
};
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1190,7 +1156,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
if (pkt_len >= skb->len && !in_sack)
return 0;
- err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+ err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ pkt_len, mss, GFP_ATOMIC);
if (err < 0)
return err;
}
@@ -1206,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk,
u64 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
- int fack_count = state->fack_count;
/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
if (tp->undo_marker && tp->undo_retrans > 0 &&
after(end_seq, tp->undo_marker))
tp->undo_retrans--;
- if (sacked & TCPCB_SACKED_ACKED)
- state->reord = min(fack_count, state->reord);
+ if ((sacked & TCPCB_SACKED_ACKED) &&
+ before(start_seq, state->reord))
+ state->reord = start_seq;
}
/* Nothing to do; acked frame is about to be dropped (was ACKed). */
@@ -1240,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
* which was in hole. It is reordering.
*/
if (before(start_seq,
- tcp_highest_sack_seq(tp)))
- state->reord = min(fack_count,
- state->reord);
+ tcp_highest_sack_seq(tp)) &&
+ before(start_seq, state->reord))
+ state->reord = start_seq;
+
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
if (state->first_sackt == 0)
@@ -1261,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
tp->sacked_out += pcount;
tp->delivered += pcount; /* Out-of-order packets delivered */
- fack_count += pcount;
-
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
- if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
+ if (tp->lost_skb_hint &&
before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += pcount;
-
- if (fack_count > tp->fackets_out)
- tp->fackets_out = fack_count;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1287,13 +1250,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
/* Shift newly-SACKed bytes from this skb to the immediately previous
* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
*/
-static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+ struct sk_buff *skb,
struct tcp_sacktag_state *state,
unsigned int pcount, int shifted, int mss,
bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
@@ -1362,8 +1325,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
- tcp_unlink_write_queue(skb, sk);
- sk_wmem_free_skb(sk, skb);
+ tcp_rtx_queue_unlink_and_free(skb, sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
@@ -1413,9 +1375,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
goto fallback;
/* Can only happen with delayed DSACK + discard craziness */
- if (unlikely(skb == tcp_write_queue_head(sk)))
+ prev = skb_rb_prev(skb);
+ if (!prev)
goto fallback;
- prev = tcp_write_queue_prev(sk, skb);
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
@@ -1494,18 +1456,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if (!skb_shift(prev, skb, len))
goto fallback;
- if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+ if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
goto out;
/* Hole filled allows collapsing with the next as well, this is very
* useful when hole on every nth skb pattern happens
*/
- if (prev == tcp_write_queue_tail(sk))
+ skb = skb_rb_next(prev);
+ if (!skb)
goto out;
- skb = tcp_write_queue_next(sk, prev);
if (!skb_can_shift(skb) ||
- (skb == tcp_send_head(sk)) ||
((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
(mss != tcp_skb_seglen(skb)))
goto out;
@@ -1513,11 +1474,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
len = skb->len;
if (skb_shift(prev, skb, len)) {
pcount += tcp_skb_pcount(skb);
- tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+ tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
+ len, mss, 0);
}
out:
- state->fack_count += pcount;
return prev;
noop:
@@ -1537,13 +1498,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *tmp;
- tcp_for_write_queue_from(skb, sk) {
+ skb_rbtree_walk_from(skb) {
int in_sack = 0;
bool dup_sack = dup_sack_in;
- if (skb == tcp_send_head(sk))
- break;
-
/* queue is in-order => we can short-circuit the walk early */
if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
@@ -1592,34 +1550,48 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
tcp_skb_pcount(skb),
skb->skb_mstamp);
tcp_rate_skb_delivered(sk, skb, state->rate);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ list_del_init(&skb->tcp_tsorted_anchor);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
tcp_advance_highest_sack(sk, skb);
}
-
- state->fack_count += tcp_skb_pcount(skb);
}
return skb;
}
-/* Avoid all extra work that is being done by sacktag while walking in
- * a normal way
- */
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
+ struct tcp_sacktag_state *state,
+ u32 seq)
+{
+ struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
+ struct sk_buff *skb;
+
+ while (*p) {
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (before(seq, TCP_SKB_CB(skb)->seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
+ p = &parent->rb_right;
+ continue;
+ }
+ return skb;
+ }
+ return NULL;
+}
+
static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
struct tcp_sacktag_state *state,
u32 skip_to_seq)
{
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
-
- if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
- break;
+ if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
+ return skb;
- state->fack_count += tcp_skb_pcount(skb);
- }
- return skb;
+ return tcp_sacktag_bsearch(sk, state, skip_to_seq);
}
static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
@@ -1664,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
int first_sack_index;
state->flag = 0;
- state->reord = tp->packets_out;
+ state->reord = tp->snd_nxt;
- if (!tp->sacked_out) {
- if (WARN_ON(tp->fackets_out))
- tp->fackets_out = 0;
+ if (!tp->sacked_out)
tcp_highest_sack_reset(sk);
- }
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
@@ -1741,8 +1710,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
}
- skb = tcp_write_queue_head(sk);
- state->fack_count = 0;
+ state->mss_now = tcp_current_mss(sk);
+ skb = NULL;
i = 0;
if (!tp->sacked_out) {
@@ -1799,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
skb = tcp_highest_sack(sk);
if (!skb)
break;
- state->fack_count = tp->fackets_out;
cache++;
goto walk;
}
@@ -1814,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
skb = tcp_highest_sack(sk);
if (!skb)
break;
- state->fack_count = tp->fackets_out;
}
skb = tcp_sacktag_skip(skb, sk, state, start_seq);
@@ -1834,9 +1801,8 @@ advance_sp:
for (j = 0; j < used_sacks; j++)
tp->recv_sack_cache[i++] = sp[j];
- if ((state->reord < tp->fackets_out) &&
- ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
- tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
+ tcp_check_sack_reordering(sk, state->reord, 0);
tcp_verify_left_out(tp);
out:
@@ -1874,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_limit_reno_sacked(tp))
- tcp_update_reordering(sk, tp->packets_out + addend, 0);
+
+ if (!tcp_limit_reno_sacked(tp))
+ return;
+
+ tp->reordering = min_t(u32, tp->packets_out + addend,
+ sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
@@ -1921,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp)
tp->lost_out = 0;
tp->undo_marker = 0;
tp->undo_retrans = -1;
- tp->fackets_out = 0;
tp->sacked_out = 0;
}
@@ -1951,6 +1921,7 @@ void tcp_enter_loss(struct sock *sk)
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_cwnd = tp->snd_cwnd;
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
@@ -1965,19 +1936,15 @@ void tcp_enter_loss(struct sock *sk)
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
if (is_reneg) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
- tp->fackets_out = 0;
}
tcp_clear_all_retrans_hints(tp);
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
-
+ skb_rbtree_walk_from(skb) {
mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
is_reneg);
if (mark_lost)
@@ -2011,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk)
* falsely raise the receive window, which results in repeated
* timeouts and stop-and-go behavior.
*/
- tp->frto = sysctl_tcp_frto &&
+ tp->frto = net->ipv4.sysctl_tcp_frto &&
(new_recovery || icsk->icsk_retransmits) &&
!inet_csk(sk)->icsk_mtup.probe_size;
}
@@ -2040,19 +2007,10 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag)
return false;
}
-static inline int tcp_fackets_out(const struct tcp_sock *tp)
-{
- return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
-}
-
/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
* counter when SACK is enabled (without SACK, sacked_out is used for
* that purpose).
*
- * Instead, with FACK TCP uses fackets_out that includes both SACKed
- * segments up to the highest received SACK block so far and holes in
- * between them.
- *
* With reordering, holes may still be in flight, so RFC3517 recovery
* uses pure sacked_out (total number of SACKed segments) even though
* it violates the RFC that uses duplicate ACKs, often these are equal
@@ -2062,10 +2020,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp)
*/
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
- return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
+ return tp->sacked_out + 1;
}
-/* Linux NewReno/SACK/FACK/ECN state machine.
+/* Linux NewReno/SACK/ECN state machine.
* --------------------------------------
*
* "Open" Normal state, no dubious events, fast path.
@@ -2130,16 +2088,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
* dynamically measured and adjusted. This is implemented in
* tcp_rack_mark_lost.
*
- * FACK (Disabled by default. Subsumbed by RACK):
- * It is the simplest heuristics. As soon as we decided
- * that something is lost, we decide that _all_ not SACKed
- * packets until the most forward SACK are lost. I.e.
- * lost_out = fackets_out - sacked_out and left_out = fackets_out.
- * It is absolutely correct estimate, if network does not reorder
- * packets. And it loses any connection to reality when reordering
- * takes place. We use FACK by default until reordering
- * is suspected on the path to this destination.
- *
* If the receiver does not support SACK:
*
* NewReno (RFC6582): in Recovery we assume that one segment
@@ -2188,7 +2136,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
}
/* Detect loss in event "A" above by marking head of queue up as lost.
- * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * For non-SACK(Reno) senders, the first "packets" number of segments
* are considered lost. For RFC3517 SACK, a segment is considered lost if it
* has at least tp->reordering SACKed seqments above it; "packets" refers to
* the maximum SACKed segments to pass before reaching this limit.
@@ -2203,20 +2151,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
WARN_ON(packets > tp->packets_out);
- if (tp->lost_skb_hint) {
- skb = tp->lost_skb_hint;
- cnt = tp->lost_cnt_hint;
+ skb = tp->lost_skb_hint;
+ if (skb) {
/* Head already handled? */
- if (mark_head && skb != tcp_write_queue_head(sk))
+ if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
return;
+ cnt = tp->lost_cnt_hint;
} else {
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
cnt = 0;
}
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
+ skb_rbtree_walk_from(skb) {
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
@@ -2226,12 +2172,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
break;
oldcnt = cnt;
- if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
+ if (tcp_is_reno(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
cnt += tcp_skb_pcount(skb);
if (cnt > packets) {
- if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ if (tcp_is_sack(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
(oldcnt >= packets))
break;
@@ -2240,7 +2186,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
/* If needed, chop off the prefix to mark as lost. */
lost = (packets - oldcnt) * mss;
if (lost < skb->len &&
- tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
+ tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ lost, mss, GFP_ATOMIC) < 0)
break;
cnt = packets;
}
@@ -2261,11 +2208,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
if (tcp_is_reno(tp)) {
tcp_mark_head_lost(sk, 1, 1);
- } else if (tcp_is_fack(tp)) {
- int lost = tp->fackets_out - tp->reordering;
- if (lost <= 0)
- lost = 1;
- tcp_mark_head_lost(sk, lost, 0);
} else {
int sacked_upto = tp->sacked_out - tp->reordering;
if (sacked_upto >= 0)
@@ -2324,16 +2266,16 @@ static bool tcp_any_retrans_done(const struct sock *sk)
if (tp->retrans_out)
return true;
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
return true;
return false;
}
-#if FASTRETRANS_DEBUG > 1
static void DBGUNDO(struct sock *sk, const char *msg)
{
+#if FASTRETRANS_DEBUG > 1
struct tcp_sock *tp = tcp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
@@ -2355,10 +2297,8 @@ static void DBGUNDO(struct sock *sk, const char *msg)
tp->packets_out);
}
#endif
-}
-#else
-#define DBGUNDO(x...) do { } while (0)
#endif
+}
static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
@@ -2367,9 +2307,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (unmark_loss) {
struct sk_buff *skb;
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
tp->lost_out = 0;
@@ -2414,6 +2352,8 @@ static bool tcp_try_undo_recovery(struct sock *sk)
mib_idx = LINUX_MIB_TCPFULLUNDO;
NET_INC_STATS(sock_net(sk), mib_idx);
+ } else if (tp->rack.reo_wnd_persist) {
+ tp->rack.reo_wnd_persist--;
}
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
@@ -2433,6 +2373,8 @@ static bool tcp_try_undo_dsack(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (tp->undo_marker && !tp->undo_retrans) {
+ tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
+ tp->rack.reo_wnd_persist + 1);
DBGUNDO(sk, "D-SACK");
tcp_undo_cwnd_reduction(sk, false);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
@@ -2520,8 +2462,8 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
return;
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
- (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+ if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
+ (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
tp->snd_cwnd = tp->snd_ssthresh;
tp->snd_cwnd_stamp = tcp_jiffies32;
}
@@ -2612,11 +2554,8 @@ void tcp_simple_retransmit(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int mss = tcp_current_mss(sk);
- u32 prior_lost = tp->lost_out;
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
if (tcp_skb_seglen(skb) > mss &&
!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
@@ -2629,7 +2568,7 @@ void tcp_simple_retransmit(struct sock *sk)
tcp_clear_retrans_hints_partial(tp);
- if (prior_lost == tp->lost_out)
+ if (!tp->lost_out)
return;
if (tcp_is_reno(tp))
@@ -2710,7 +2649,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
* is updated in tcp_ack()). Otherwise fall back to
* the conventional recovery.
*/
- if (tcp_send_head(sk) &&
+ if (!tcp_write_queue_empty(sk) &&
after(tcp_wnd_end(tp), tp->snd_nxt)) {
*rexmit = REXMIT_NEW;
return;
@@ -2737,15 +2676,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
}
/* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, const int acked)
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->undo_marker && tcp_packet_delayed(tp)) {
/* Plain luck! Hole if filled with delayed
- * packet, rather than with a retransmit.
+ * packet, rather than with a retransmit. Check reordering.
*/
- tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+ tcp_check_sack_reordering(sk, prior_snd_una, 1);
/* We are getting evidence that the reordering degree is higher
* than we realized. If there are no retransmits out then we
@@ -2772,7 +2711,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
struct tcp_sock *tp = tcp_sk(sk);
/* Use RACK to detect loss */
- if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
u32 prior_retrans = tp->retrans_out;
tcp_rack_mark_lost(sk);
@@ -2781,6 +2720,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
}
}
+static bool tcp_force_fast_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return after(tcp_highest_sack_seq(tp),
+ tp->snd_una + tp->reordering * tp->mss_cache);
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2793,19 +2740,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
-static void tcp_fastretrans_alert(struct sock *sk, const int acked,
+static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
bool is_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int fast_rexmit = 0, flag = *ack_flag;
bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
- (tcp_fackets_out(tp) > tp->reordering));
+ tcp_force_fast_retransmit(sk));
- if (WARN_ON(!tp->packets_out && tp->sacked_out))
+ if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
- if (WARN_ON(!tp->sacked_out && tp->fackets_out))
- tp->fackets_out = 0;
/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
@@ -2852,11 +2797,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
- if (tcp_try_undo_partial(sk, acked))
+ if (tcp_try_undo_partial(sk, prior_snd_una))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
- tcp_fackets_out(tp) > tp->reordering;
+ tcp_force_fast_retransmit(sk);
}
if (tcp_try_undo_dsack(sk)) {
tcp_try_keep_open(sk);
@@ -2871,6 +2816,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
(*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
+ /* fall through */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2911,8 +2857,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
+ u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
struct tcp_sock *tp = tcp_sk(sk);
- u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
rtt_us ? : jiffies_to_usecs(1));
@@ -3004,21 +2950,24 @@ void tcp_rearm_rto(struct sock *sk)
/* Offset the time elapsed after installing regular RTO */
if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
- struct sk_buff *skb = tcp_write_queue_head(sk);
- u64 rto_time_stamp = skb->skb_mstamp +
- jiffies_to_usecs(rto);
- s64 delta_us = rto_time_stamp - tp->tcp_mstamp;
+ s64 delta_us = tcp_rto_delta_us(sk);
/* delta_us may not be positive if the socket is locked
* when the retrans timer fires and is rescheduled.
*/
- if (delta_us > 0)
- rto = usecs_to_jiffies(delta_us);
+ rto = usecs_to_jiffies(max_t(int, delta_us, 1));
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
TCP_RTO_MAX);
}
}
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
+static void tcp_set_xmit_timer(struct sock *sk)
+{
+ if (!tcp_schedule_loss_probe(sk, true))
+ tcp_rearm_rto(sk);
+}
+
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
@@ -3051,28 +3000,31 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
shinfo = skb_shinfo(skb);
if (!before(shinfo->tskey, prior_snd_una) &&
- before(shinfo->tskey, tcp_sk(sk)->snd_una))
- __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+ before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
+ tcp_skb_tsorted_save(skb) {
+ __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+ } tcp_skb_tsorted_restore(skb);
+ }
}
/* Remove acknowledged frames from the retransmission queue. If our packet
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
-static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una, int *acked,
+static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
+ u32 prior_snd_una,
struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
u64 first_ackt, last_ackt;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
- u32 reord = tp->packets_out;
+ u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
+ struct sk_buff *skb, *next;
bool fully_acked = true;
long sack_rtt_us = -1L;
long seq_rtt_us = -1L;
long ca_rtt_us = -1L;
- struct sk_buff *skb;
u32 pkts_acked = 0;
u32 last_in_flight = 0;
bool rtt_update;
@@ -3080,8 +3032,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
first_ackt = 0;
- while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+ for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ const u32 start_seq = scb->seq;
u8 sacked = scb->sacked;
u32 acked_pcount;
@@ -3098,8 +3051,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
break;
fully_acked = false;
} else {
- /* Speedup tcp_unlink_write_queue() and next loop */
- prefetchw(skb->next);
acked_pcount = tcp_skb_pcount(skb);
}
@@ -3114,7 +3065,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
first_ackt = last_ackt;
last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
- reord = min(pkts_acked, reord);
+ if (before(start_seq, reord))
+ reord = start_seq;
if (!after(scb->end_seq, tp->high_seq))
flag |= FLAG_ORIG_SACK_ACKED;
}
@@ -3151,12 +3103,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!fully_acked)
break;
- tcp_unlink_write_queue(skb, sk);
- sk_wmem_free_skb(sk, skb);
+ next = skb_rb_next(skb);
if (unlikely(skb == tp->retransmit_skb_hint))
tp->retransmit_skb_hint = NULL;
if (unlikely(skb == tp->lost_skb_hint))
tp->lost_skb_hint = NULL;
+ tcp_rtx_queue_unlink_and_free(skb, sk);
}
if (!skb)
@@ -3180,7 +3132,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
ca_rtt_us, sack->rate);
if (flag & FLAG_ACKED) {
- tcp_rearm_rto(sk);
+ flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
@@ -3192,23 +3144,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
int delta;
/* Non-retransmitted hole got filled? That's reordering */
- if (reord < prior_fackets && reord <= tp->fackets_out)
- tcp_update_reordering(sk, tp->fackets_out - reord, 0);
+ if (before(reord, prior_fack))
+ tcp_check_sack_reordering(sk, reord, 0);
- delta = tcp_is_fack(tp) ? pkts_acked :
- prior_sacked - tp->sacked_out;
+ delta = prior_sacked - tp->sacked_out;
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
-
- tp->fackets_out -= min(pkts_acked, tp->fackets_out);
-
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
*/
- tcp_rearm_rto(sk);
+ flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
}
if (icsk->icsk_ca_ops->pkts_acked) {
@@ -3242,18 +3190,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
}
}
#endif
- *acked = pkts_acked;
return flag;
}
static void tcp_ack_probe(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *head = tcp_send_head(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Was it a usable window open? */
-
- if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+ if (!head)
+ return;
+ if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
@@ -3373,7 +3322,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
tp->pred_flags = 0;
tcp_fast_path_check(sk);
- if (tcp_send_head(sk))
+ if (!tcp_write_queue_empty(sk))
tcp_slow_start_after_idle_check(sk);
if (nwin > tp->max_window) {
@@ -3394,7 +3343,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
if (*last_oow_ack_time) {
s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
- if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
NET_INC_STATS(net, mib_idx);
return true; /* rate-limited: don't send yet! */
}
@@ -3430,10 +3379,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
static u32 challenge_timestamp;
static unsigned int challenge_count;
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
u32 count, now;
/* First check our per-socket dupack rate limit. */
- if (__tcp_oow_rate_limited(sock_net(sk),
+ if (__tcp_oow_rate_limited(net,
LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
&tp->last_oow_ack_time))
return;
@@ -3441,16 +3391,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
/* Then check host-wide RFC 5961 rate limit. */
now = jiffies / HZ;
if (now != challenge_timestamp) {
- u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
+ u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
+ u32 half = (ack_limit + 1) >> 1;
challenge_timestamp = now;
- WRITE_ONCE(challenge_count, half +
- prandom_u32_max(sysctl_tcp_challenge_ack_limit));
+ WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
}
count = READ_ONCE(challenge_count);
if (count > 0) {
WRITE_ONCE(challenge_count, count - 1);
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
tcp_send_ack(sk);
}
}
@@ -3548,18 +3498,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
- u32 prior_fackets;
int prior_packets = tp->packets_out;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
- int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 prior_fack;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
- /* We very likely will need to access write queue head. */
- prefetchw(sk->sk_write_queue.next);
+ /* We very likely will need to access rtx queue. */
+ prefetch(sk->tcp_rtx_queue.rb_node);
/* If the ack is older than previous acks
* then we can probably ignore it.
@@ -3580,15 +3529,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
- if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
- tcp_rearm_rto(sk);
-
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
icsk->icsk_retransmits = 0;
}
- prior_fackets = tp->fackets_out;
+ prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
@@ -3644,21 +3590,25 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state);
+ flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
+
+ tcp_rack_update_reo_wnd(sk, &rs);
+
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
+ /* If needed, reset TLP/RTO timer; RACK may later override this. */
+ if (flag & FLAG_SET_XMIT_TIMER)
+ tcp_set_xmit_timer(sk);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ &rexmit);
}
- if (tp->tlp_high_seq)
- tcp_process_tlp_ack(sk, ack, flag);
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- if (icsk->icsk_pending == ICSK_TIME_RETRANS)
- tcp_schedule_loss_probe(sk);
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
tcp_rate_gen(sk, delivered, lost, sack_state.rate);
@@ -3669,13 +3619,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ &rexmit);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
- if (tcp_send_head(sk))
- tcp_ack_probe(sk);
+ tcp_ack_probe(sk);
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
@@ -3692,7 +3642,8 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ &rexmit);
tcp_xmit_recovery(sk, rexmit);
}
@@ -3717,6 +3668,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
foc->exp = exp_opt;
}
+static void smc_parse_options(const struct tcphdr *th,
+ struct tcp_options_received *opt_rx,
+ const unsigned char *ptr,
+ int opsize)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ if (th->syn && !(opsize & 1) &&
+ opsize >= TCPOLEN_EXP_SMC_BASE &&
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+ opt_rx->smc_ok = 1;
+ }
+#endif
+}
+
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
@@ -3824,6 +3790,9 @@ void tcp_parse_options(const struct net *net,
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
+ else
+ smc_parse_options(th, opt_rx, ptr,
+ opsize);
break;
}
@@ -3991,6 +3960,8 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
/* When we get a reset we do this. */
void tcp_reset(struct sock *sk)
{
+ trace_tcp_receive_reset(sk);
+
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
case TCP_SYN_SENT:
@@ -4113,7 +4084,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
int mib_idx;
if (before(seq, tp->rcv_nxt))
@@ -4148,7 +4119,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk);
- if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
@@ -4267,6 +4238,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
/**
* tcp_try_coalesce - try to merge skb to prior one
* @sk: socket
+ * @dest: destination queue
* @to: prior buffer
* @from: buffer to add in queue
* @fragstolen: pointer to boolean
@@ -4299,6 +4271,12 @@ static bool tcp_try_coalesce(struct sock *sk,
TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+
+ if (TCP_SKB_CB(from)->has_rxtstamp) {
+ TCP_SKB_CB(to)->has_rxtstamp = true;
+ to->tstamp = from->tstamp;
+ }
+
return true;
}
@@ -4321,7 +4299,7 @@ static void tcp_ofo_queue(struct sock *sk)
p = rb_first(&tp->out_of_order_queue);
while (p) {
- skb = rb_entry(p, struct sk_buff, rbnode);
+ skb = rb_to_skb(p);
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
@@ -4385,7 +4363,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct rb_node **p, *q, *parent;
+ struct rb_node **p, *parent;
struct sk_buff *skb1;
u32 seq, end_seq;
bool fragstolen;
@@ -4425,7 +4403,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
/* In the typical case, we are adding an skb to the end of the list.
* Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
*/
- if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+ if (tcp_try_coalesce(sk, tp->ooo_last_skb,
+ skb, &fragstolen)) {
coalesce_done:
tcp_grow_window(sk, skb);
kfree_skb_partial(skb, fragstolen);
@@ -4443,7 +4422,7 @@ coalesce_done:
parent = NULL;
while (*p) {
parent = *p;
- skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ skb1 = rb_to_skb(parent);
if (before(seq, TCP_SKB_CB(skb1)->seq)) {
p = &parent->rb_left;
continue;
@@ -4475,7 +4454,8 @@ coalesce_done:
__kfree_skb(skb1);
goto merge_right;
}
- } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+ } else if (tcp_try_coalesce(sk, skb1,
+ skb, &fragstolen)) {
goto coalesce_done;
}
p = &parent->rb_right;
@@ -4487,9 +4467,7 @@ insert:
merge_right:
/* Remove other segments covered by skb. */
- while ((q = rb_next(&skb->rbnode)) != NULL) {
- skb1 = rb_entry(q, struct sk_buff, rbnode);
-
+ while ((skb1 = skb_rb_next(skb)) != NULL) {
if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
break;
if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4504,7 +4482,7 @@ merge_right:
tcp_drop(sk, skb1);
}
/* If there is no skb after us, we are the last_skb ! */
- if (!q)
+ if (!skb1)
tp->ooo_last_skb = skb;
add_sack:
@@ -4526,7 +4504,8 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
__skb_pull(skb, hdrlen);
eaten = (tail &&
- tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
+ tcp_try_coalesce(sk, tail,
+ skb, fragstolen)) ? 1 : 0;
tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
if (!eaten) {
__skb_queue_tail(&sk->sk_receive_queue, skb);
@@ -4588,8 +4567,8 @@ err:
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- bool fragstolen = false;
- int eaten = -1;
+ bool fragstolen;
+ int eaten;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
@@ -4611,32 +4590,13 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
goto out_of_window;
/* Ok. In sequence. In window. */
- if (tp->ucopy.task == current &&
- tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
- sock_owned_by_user(sk) && !tp->urg_data) {
- int chunk = min_t(unsigned int, skb->len,
- tp->ucopy.len);
-
- __set_current_state(TASK_RUNNING);
-
- if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- eaten = (chunk == skb->len);
- tcp_rcv_space_adjust(sk);
- }
- }
-
- if (eaten <= 0) {
queue_and_out:
- if (eaten < 0) {
- if (skb_queue_len(&sk->sk_receive_queue) == 0)
- sk_forced_mem_schedule(sk, skb->truesize);
- else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
- goto drop;
- }
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
- }
+ if (skb_queue_len(&sk->sk_receive_queue) == 0)
+ sk_forced_mem_schedule(sk, skb->truesize);
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
@@ -4708,7 +4668,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
if (list)
return !skb_queue_is_last(list, skb) ? skb->next : NULL;
- return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+ return skb_rb_next(skb);
}
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4729,7 +4689,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
}
/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
-static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
@@ -4737,7 +4697,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
while (*p) {
parent = *p;
- skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ skb1 = rb_to_skb(parent);
if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
p = &parent->rb_left;
else
@@ -4784,7 +4744,7 @@ restart:
* overlaps to the next one.
*/
if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
- (tcp_win_from_space(skb->truesize) > skb->len ||
+ (tcp_win_from_space(sk, skb->truesize) > skb->len ||
before(TCP_SKB_CB(skb)->seq, start))) {
end_of_skbs = false;
break;
@@ -4856,26 +4816,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *head;
- struct rb_node *p;
u32 start, end;
- p = rb_first(&tp->out_of_order_queue);
- skb = rb_entry_safe(p, struct sk_buff, rbnode);
+ skb = skb_rb_first(&tp->out_of_order_queue);
new_range:
if (!skb) {
- p = rb_last(&tp->out_of_order_queue);
- /* Note: This is possible p is NULL here. We do not
- * use rb_entry_safe(), as ooo_last_skb is valid only
- * if rbtree is not empty.
- */
- tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
+ tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
return;
}
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
for (head = skb;;) {
- skb = tcp_skb_next(skb, NULL);
+ skb = skb_rb_next(skb);
/* Range is terminated when we see a gap or when
* we are at the queue end.
@@ -4918,14 +4871,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
do {
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
- tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
+ tcp_drop(sk, rb_to_skb(node));
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!tcp_under_memory_pressure(sk))
break;
node = prev;
} while (node);
- tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+ tp->ooo_last_skb = rb_to_skb(prev);
/* Reset SACK state. A conforming SACK implementation will
* do the same at a timeout based retransmit. When a connection
@@ -5100,7 +5053,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);
- if (ptr && !sysctl_tcp_stdurg)
+ if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
ptr--;
ptr += ntohl(th->seq);
@@ -5186,26 +5139,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
}
}
-static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int chunk = skb->len - hlen;
- int err;
-
- if (skb_csum_unnecessary(skb))
- err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
- else
- err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
-
- if (!err) {
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- tcp_rcv_space_adjust(sk);
- }
-
- return err;
-}
-
/* Accept RST for rcv_nxt - 1 after a FIN.
* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
* FIN is sent followed by a RST packet. The RST is sent with the same
@@ -5358,8 +5291,9 @@ discard:
* tcp_data_queue when everything is OK.
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th)
{
+ unsigned int len = skb->len;
struct tcp_sock *tp = tcp_sk(sk);
tcp_mstamp_refresh(tp);
@@ -5445,56 +5379,28 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
int eaten = 0;
bool fragstolen = false;
- if (tp->ucopy.task == current &&
- tp->copied_seq == tp->rcv_nxt &&
- len - tcp_header_len <= tp->ucopy.len &&
- sock_owned_by_user(sk)) {
- __set_current_state(TASK_RUNNING);
-
- if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) +
- TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
-
- tcp_rcv_rtt_measure_ts(sk, skb);
-
- __skb_pull(skb, tcp_header_len);
- tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPHPHITSTOUSER);
- eaten = 1;
- }
- }
- if (!eaten) {
- if (tcp_checksum_complete(skb))
- goto csum_error;
+ if (tcp_checksum_complete(skb))
+ goto csum_error;
- if ((int)skb->truesize > sk->sk_forward_alloc)
- goto step5;
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(sk, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
- /* Bulk data transfer: receiver */
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
- &fragstolen);
- }
+ /* Bulk data transfer: receiver */
+ eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+ &fragstolen);
tcp_event_data_recv(sk, skb);
@@ -5567,20 +5473,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
security_inet_conn_established(sk, skb);
}
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- tcp_init_metrics(sk);
- tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
- tcp_init_congestion_control(sk);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
*/
tp->lsndtime = tcp_jiffies32;
- tcp_init_buffer_space(sk);
-
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
@@ -5588,14 +5487,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
__tcp_fast_path_on(tp, tp->snd_wnd);
else
tp->pred_flags = 0;
-
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
struct tcp_fastopen_cookie *cookie)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+ struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;
@@ -5630,9 +5528,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
- tcp_for_write_queue_from(data, sk) {
- if (data == tcp_send_head(sk) ||
- __tcp_retransmit_skb(sk, data, 1))
+ skb_rbtree_walk_from(data) {
+ if (__tcp_retransmit_skb(sk, data, 1))
break;
}
tcp_rearm_rto(sk);
@@ -5650,6 +5547,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
return false;
}
+static void smc_check_reset_syn(struct tcp_sock *tp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ if (tp->syn_smc && !tp->rx_opt.smc_ok)
+ tp->syn_smc = 0;
+ }
+#endif
+}
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
{
@@ -5745,10 +5652,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tp->tcp_header_len = sizeof(struct tcphdr);
}
- if (tcp_is_sack(tp) && sysctl_tcp_fack)
- tcp_enable_fack(tp);
-
- tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@@ -5757,6 +5660,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* is initialized. */
tp->copied_seq = tp->rcv_nxt;
+ smc_check_reset_syn(tp);
+
smp_mb();
tcp_finish_connect(sk, skb);
@@ -5974,15 +5879,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (req) {
inet_csk(sk)->icsk_retransmits = 0;
reqsk_fastopen_remove(sk, req, false);
+ /* Re-arm the timer because data may have been sent out.
+ * This is similar to the regular data transmission case
+ * when new data has just been ack'ed.
+ *
+ * (TFO) - we could try to be more aggressive and
+ * retransmitting any data sooner based on when they
+ * are sent out.
+ */
+ tcp_rearm_rto(sk);
} else {
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
- tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
- tcp_init_congestion_control(sk);
-
- tcp_mtup_init(sk);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tp->copied_seq = tp->rcv_nxt;
- tcp_init_buffer_space(sk);
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
@@ -6002,19 +5910,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
- if (req) {
- /* Re-arm the timer because data may have been sent out.
- * This is similar to the regular data transmission case
- * when new data has just been ack'ed.
- *
- * (TFO) - we could try to be more aggressive and
- * retransmitting any data sooner based on when they
- * are sent out.
- */
- tcp_rearm_rto(sk);
- } else
- tcp_init_metrics(sk);
-
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
@@ -6111,6 +6006,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
+ /* fall through */
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
@@ -6219,6 +6115,9 @@ static void tcp_openreq_init(struct request_sock *req,
ireq->ir_rmt_port = tcp_hdr(skb)->source;
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
+#if IS_ENABLED(CONFIG_SMC)
+ ireq->smc_ok = rx_opt->smc_ok;
+#endif
}
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
@@ -6231,8 +6130,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
if (req) {
struct inet_request_sock *ireq = inet_rsk(req);
- kmemcheck_annotate_bitfield(ireq, flags);
- ireq->opt = NULL;
+ ireq->ireq_opt = NULL;
#if IS_ENABLED(CONFIG_IPV6)
ireq->pktopts = NULL;
#endif
@@ -6303,9 +6201,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
- struct dst_entry *dst = NULL;
struct request_sock *req;
bool want_cookie = false;
+ struct dst_entry *dst;
struct flowi fl;
/* TW buckets are converted to open requests without
@@ -6355,6 +6253,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (tmp_opt.tstamp_ok)
tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
+ dst = af_ops->route_req(sk, &fl, req);
+ if (!dst)
+ goto drop_and_free;
+
if (!want_cookie && !isn) {
/* Kill the following clause, if you dislike this way. */
if (!net->ipv4.sysctl_tcp_syncookies &&
@@ -6375,11 +6277,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
isn = af_ops->init_seq(skb);
}
- if (!dst) {
- dst = af_ops->route_req(sk, &fl, req);
- if (!dst)
- goto drop_and_free;
- }
tcp_ecn_create_request(req, skb, sk, dst);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a20e7f03d5f7..c6bc0c4d19c6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -85,7 +85,7 @@
#include <crypto/hash.h>
#include <linux/scatterlist.h>
-int sysctl_tcp_low_latency __read_mostly;
+#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
@@ -385,7 +385,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
th->dest, iph->saddr, ntohs(th->source),
- inet_iif(icmp_skb));
+ inet_iif(icmp_skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return;
@@ -482,7 +482,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
TCP_TIMEOUT_INIT;
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
BUG_ON(!skb);
tcp_mstamp_refresh(tp);
@@ -661,7 +661,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
- ntohs(th->source), inet_iif(skb));
+ ntohs(th->source), inet_iif(skb),
+ tcp_v4_sdif(skb));
/* don't send rst if it can't find key */
if (!sk1)
goto out;
@@ -702,8 +703,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
* routing might fail in this case. No choice here, if we choose to force
* input interface, we will misroute in case of asymmetric route.
*/
- if (sk)
+ if (sk) {
arg.bound_dev_if = sk->sk_bound_dev_if;
+ trace_tcp_send_reset(sk, skb);
+ }
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
@@ -878,7 +881,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- ireq->opt);
+ ireq_opt_deref(ireq));
err = net_xmit_eval(err);
}
@@ -890,7 +893,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
*/
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
- kfree(inet_rsk(req)->opt);
+ kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
@@ -1266,10 +1269,11 @@ static void tcp_v4_init_req(struct request_sock *req,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
+ struct net *net = sock_net(sk_listener);
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
- ireq->opt = tcp_v4_save_options(skb);
+ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
@@ -1356,10 +1360,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
sk_daddr_set(newsk, ireq->ir_rmt_addr);
sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
newsk->sk_bound_dev_if = ireq->ir_iif;
- newinet->inet_saddr = ireq->ir_loc_addr;
- inet_opt = ireq->opt;
- rcu_assign_pointer(newinet->inet_opt, inet_opt);
- ireq->opt = NULL;
+ newinet->inet_saddr = ireq->ir_loc_addr;
+ inet_opt = rcu_dereference(ireq->ireq_opt);
+ RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
newinet->rcv_tos = ip_hdr(skb)->tos;
@@ -1404,9 +1407,12 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
- if (*own_req)
+ if (likely(*own_req)) {
tcp_move_syn(newtp, req);
-
+ ireq->ireq_opt = NULL;
+ } else {
+ newinet->inet_opt = NULL;
+ }
return newsk;
exit_overflow:
@@ -1417,6 +1423,7 @@ exit:
tcp_listendrop(sk);
return NULL;
put_and_exit:
+ newinet->inet_opt = NULL;
inet_csk_prepare_forced_close(newsk);
tcp_done(newsk);
goto exit;
@@ -1458,7 +1465,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sk->sk_rx_dst = NULL;
}
}
- tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
+ tcp_rcv_established(sk, skb, tcp_hdr(skb));
return 0;
}
@@ -1504,28 +1511,28 @@ csum_err:
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
-void tcp_v4_early_demux(struct sk_buff *skb)
+int tcp_v4_early_demux(struct sk_buff *skb)
{
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
if (skb->pkt_type != PACKET_HOST)
- return;
+ return 0;
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
- return;
+ return 0;
iph = ip_hdr(skb);
th = tcp_hdr(skb);
if (th->doff < sizeof(struct tcphdr) / 4)
- return;
+ return 0;
sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
- skb->skb_iif);
+ skb->skb_iif, inet_sdif(skb));
if (sk) {
skb->sk = sk;
skb->destructor = sock_edemux;
@@ -1539,63 +1546,9 @@ void tcp_v4_early_demux(struct sk_buff *skb)
skb_dst_set_noref(skb, dst);
}
}
+ return 0;
}
-/* Packet is added to VJ-style prequeue for processing in process
- * context, if a reader task is waiting. Apparently, this exciting
- * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
- * failed somewhere. Latency? Burstiness? Well, at least now we will
- * see, why it failed. 8)8) --ANK
- *
- */
-bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (sysctl_tcp_low_latency || !tp->ucopy.task)
- return false;
-
- if (skb->len <= tcp_hdrlen(skb) &&
- skb_queue_len(&tp->ucopy.prequeue) == 0)
- return false;
-
- /* Before escaping RCU protected region, we need to take care of skb
- * dst. Prequeue is only enabled for established sockets.
- * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
- * Instead of doing full sk_rx_dst validity here, let's perform
- * an optimistic check.
- */
- if (likely(sk->sk_rx_dst))
- skb_dst_drop(skb);
- else
- skb_dst_force_safe(skb);
-
- __skb_queue_tail(&tp->ucopy.prequeue, skb);
- tp->ucopy.memory += skb->truesize;
- if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
- tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
- struct sk_buff *skb1;
-
- BUG_ON(sock_owned_by_user(sk));
- __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
- skb_queue_len(&tp->ucopy.prequeue));
-
- while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk_backlog_rcv(sk, skb1);
-
- tp->ucopy.memory = 0;
- } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
- wake_up_interruptible_sync_poll(sk_sleep(sk),
- POLLIN | POLLRDNORM | POLLRDBAND);
- if (!inet_csk_ack_scheduled(sk))
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- (3 * tcp_rto_min(sk)) / 4,
- TCP_RTO_MAX);
- }
- return true;
-}
-EXPORT_SYMBOL(tcp_prequeue);
-
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
@@ -1645,6 +1598,7 @@ EXPORT_SYMBOL(tcp_filter);
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ int sdif = inet_sdif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
bool refcounted;
@@ -1692,10 +1646,12 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
+ TCP_SKB_CB(skb)->has_rxtstamp =
+ skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
- th->dest, &refcounted);
+ th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -1722,15 +1678,15 @@ process:
*/
sock_hold(sk);
refcounted = true;
- nsk = tcp_check_req(sk, skb, req, false);
+ nsk = NULL;
+ if (!tcp_filter(sk, skb))
+ nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_and_relse;
}
if (nsk == sk) {
reqsk_put(req);
- } else if (tcp_filter(sk, skb)) {
- goto discard_and_relse;
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v4_send_reset(nsk, skb);
goto discard_and_relse;
@@ -1770,8 +1726,7 @@ process:
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
- if (!tcp_prequeue(sk, skb))
- ret = tcp_v4_do_rcv(sk, skb);
+ ret = tcp_v4_do_rcv(sk, skb);
} else if (tcp_add_backlog(sk, skb)) {
goto discard_and_relse;
}
@@ -1824,15 +1779,17 @@ do_time_wait:
__tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
- inet_iif(skb));
+ inet_iif(skb),
+ sdif);
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
refcounted = false;
goto process;
}
- /* Fall through to ACK */
}
+ /* to ACK */
+ /* fall through */
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
@@ -1912,6 +1869,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ trace_tcp_destroy_sock(sk);
+
tcp_clear_xmit_timers(sk);
tcp_cleanup_congestion_control(sk);
@@ -1936,9 +1895,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
}
#endif
- /* Clean prequeue, it must be empty really */
- __skb_queue_purge(&tp->ucopy.prequeue);
-
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
@@ -1947,6 +1903,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
/* If socket is aborted during connect operation */
tcp_free_fastopen_req(tp);
+ tcp_fastopen_destroy_cipher(sk);
tcp_saved_syn_free(tp);
sk_sockets_allocated_dec(sk);
@@ -2452,8 +2409,8 @@ struct proto tcp_prot = {
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
- .sysctl_wmem = sysctl_tcp_wmem,
- .sysctl_rmem = sysctl_tcp_rmem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -2473,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net)
{
int cpu;
+ module_put(net->ipv4.tcp_congestion_control->owner);
+
for_each_possible_cpu(cpu)
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
free_percpu(net->ipv4.tcp_sk);
@@ -2527,6 +2486,50 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_sack = 1;
net->ipv4.sysctl_tcp_window_scaling = 1;
net->ipv4.sysctl_tcp_timestamps = 1;
+ net->ipv4.sysctl_tcp_early_retrans = 3;
+ net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
+ net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
+ net->ipv4.sysctl_tcp_retrans_collapse = 1;
+ net->ipv4.sysctl_tcp_max_reordering = 300;
+ net->ipv4.sysctl_tcp_dsack = 1;
+ net->ipv4.sysctl_tcp_app_win = 31;
+ net->ipv4.sysctl_tcp_adv_win_scale = 1;
+ net->ipv4.sysctl_tcp_frto = 2;
+ net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
+ /* This limits the percentage of the congestion window which we
+ * will allow a single TSO frame to consume. Building TSO frames
+ * which are too large can cause TCP streams to be bursty.
+ */
+ net->ipv4.sysctl_tcp_tso_win_divisor = 3;
+ /* Default TSQ limit of four TSO segments */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
+ /* rfc5961 challenge ack rate limiting */
+ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
+ net->ipv4.sysctl_tcp_min_tso_segs = 2;
+ net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
+ net->ipv4.sysctl_tcp_autocorking = 1;
+ net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
+ net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
+ net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
+ if (net != &init_net) {
+ memcpy(net->ipv4.sysctl_tcp_rmem,
+ init_net.ipv4.sysctl_tcp_rmem,
+ sizeof(init_net.ipv4.sysctl_tcp_rmem));
+ memcpy(net->ipv4.sysctl_tcp_wmem,
+ init_net.ipv4.sysctl_tcp_wmem,
+ sizeof(init_net.ipv4.sysctl_tcp_wmem));
+ }
+ net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
+ spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
+ net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
+ atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+
+ /* Reno is always built in */
+ if (!net_eq(net, &init_net) &&
+ try_module_get(init_net.ipv4.tcp_congestion_control->owner))
+ net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
+ else
+ net->ipv4.tcp_congestion_control = &tcp_reno;
return 0;
fail:
@@ -2537,7 +2540,12 @@ fail:
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
+ struct net *net;
+
inet_twsk_purge(&tcp_hashinfo, AF_INET);
+
+ list_for_each_entry(net, net_exit_list, exit_list)
+ tcp_fastopen_ctx_destroy(net);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 102b2c90bb80..7097f92d16e5 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
@@ -20,8 +21,6 @@
#include <net/tcp.h>
#include <net/genetlink.h>
-int sysctl_tcp_nometrics_save __read_mostly;
-
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
const struct inetpeer_addr *daddr,
struct net *net, unsigned int hash);
@@ -330,7 +329,7 @@ void tcp_update_metrics(struct sock *sk)
int m;
sk_dst_confirm(sk);
- if (sysctl_tcp_nometrics_save || !dst)
+ if (net->ipv4.sysctl_tcp_nometrics_save || !dst)
return;
rcu_read_lock();
@@ -471,10 +470,8 @@ void tcp_init_metrics(struct sock *sk)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
}
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
- if (val && tp->reordering != val) {
- tcp_disable_fack(tp);
+ if (val && tp->reordering != val)
tp->reordering = val;
- }
crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
rcu_read_unlock();
@@ -892,10 +889,14 @@ static void tcp_metrics_flush_all(struct net *net)
for (row = 0; row < max_rows; row++, hb++) {
struct tcp_metrics_block __rcu **pp;
+ bool match;
+
spin_lock_bh(&tcp_metrics_lock);
pp = &hb->chain;
for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
- if (net_eq(tm_net(tm), net)) {
+ match = net ? net_eq(tm_net(tm), net) :
+ !atomic_read(&tm_net(tm)->count);
+ if (match) {
*pp = tm->tcpm_next;
kfree_rcu(tm, rcu_head);
} else {
@@ -1018,14 +1019,14 @@ static int __net_init tcp_net_metrics_init(struct net *net)
return 0;
}
-static void __net_exit tcp_net_metrics_exit(struct net *net)
+static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list)
{
- tcp_metrics_flush_all(net);
+ tcp_metrics_flush_all(NULL);
}
static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
- .init = tcp_net_metrics_init,
- .exit = tcp_net_metrics_exit,
+ .init = tcp_net_metrics_init,
+ .exit_batch = tcp_net_metrics_exit_batch,
};
void __init tcp_metrics_init(void)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0ff83c1637d8..e36eff0403f4 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -23,13 +23,12 @@
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
+#include <linux/static_key.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
-int sysctl_tcp_abort_on_overflow __read_mostly;
-
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
if (seq == s_win)
@@ -180,7 +179,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
* Oh well... nobody has a sufficient solution to this
* protocol bug yet.
*/
- if (sysctl_tcp_rfc1337 == 0) {
+ if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) {
kill:
inet_twsk_deschedule_put(tw);
return TCP_TW_SUCCESS;
@@ -298,8 +297,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
key = tp->af_specific->md5_lookup(sk, sk);
if (key) {
tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
- if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
- BUG();
+ BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool());
}
} while (0);
#endif
@@ -371,7 +369,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
full_space = rcv_wnd * mss;
/* tcp_full_space because it is guaranteed to be the first packet */
- tcp_select_initial_window(full_space,
+ tcp_select_initial_window(sk_listener, full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
&req->rsk_rcv_wnd,
&req->rsk_window_clamp,
@@ -417,6 +415,21 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
}
EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
+ struct request_sock *req,
+ struct tcp_sock *newtp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ struct inet_request_sock *ireq;
+
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ ireq = inet_rsk(req);
+ if (oldtp->syn_smc && !ireq->smc_ok)
+ newtp->syn_smc = 0;
+ }
+#endif
+}
+
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
@@ -434,6 +447,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct tcp_sock *newtp = tcp_sk(newsk);
+ struct tcp_sock *oldtp = tcp_sk(sk);
+
+ smc_check_reset_syn_req(oldtp, req, newtp);
/* Now setup tcp_sock */
newtp->pred_flags = 0;
@@ -445,8 +461,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_sml = newtp->snd_una =
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
- tcp_prequeue_init(newtp);
INIT_LIST_HEAD(&newtp->tsq_node);
+ INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
tcp_init_wl(newtp, treq->rcv_isn);
@@ -459,7 +475,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->packets_out = 0;
newtp->retrans_out = 0;
newtp->sacked_out = 0;
- newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
newtp->tlp_high_seq = 0;
newtp->lsndtime = tcp_jiffies32;
@@ -493,10 +508,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
keepalive_time_when(newtp));
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
- if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
- if (sysctl_tcp_fack)
- tcp_enable_fack(newtp);
- }
+ newtp->rx_opt.sack_ok = ireq->sack_ok;
newtp->window_clamp = req->rsk_window_clamp;
newtp->rcv_ssthresh = req->rsk_rcv_wnd;
newtp->rcv_wnd = req->rsk_rcv_wnd;
@@ -535,6 +547,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->syn_data_acked = 0;
newtp->rack.mstamp = 0;
newtp->rack.advanced = 0;
+ newtp->rack.reo_wnd_steps = 1;
+ newtp->rack.last_delivered = 0;
+ newtp->rack.reo_wnd_persist = 0;
+ newtp->rack.dsack_seen = 0;
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
@@ -765,7 +781,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
return inet_csk_complete_hashdance(sk, child, req, own_req);
listen_overflow:
- if (!sysctl_tcp_abort_on_overflow) {
+ if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) {
inet_rsk(req)->acked = 1;
return NULL;
}
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 6d650ed3cb59..0b5a05bd82e3 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -39,7 +39,7 @@
* nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected
* nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8
* nv_rtt_factor RTT averaging factor
- * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur
+ * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur
* nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd
* nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd
* nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping
@@ -61,7 +61,7 @@ static int nv_min_cwnd __read_mostly = 2;
static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */
static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */
static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */
-static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */
+static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */
static int nv_cwnd_growth_rate_neg __read_mostly = 8;
static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */
static int nv_dec_eval_min_calls __read_mostly = 60;
@@ -86,7 +86,6 @@ struct tcpnv {
* < 0 => less than 1 packet/RTT */
u8 available8;
u16 available16;
- u32 loss_cwnd; /* cwnd at last loss */
u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */
nv_reset:1, /* whether to reset values */
nv_catchup:1; /* whether we are growing because
@@ -102,6 +101,11 @@ struct tcpnv {
u32 nv_last_rtt; /* last rtt */
u32 nv_min_rtt; /* active min rtt. Used to determine slope */
u32 nv_min_rtt_new; /* min rtt for future use */
+ u32 nv_base_rtt; /* If non-zero it represents the threshold for
+ * congestion */
+ u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is
+ * set to 80% of nv_base_rtt. It helps reduce
+ * unfairness between flows */
u32 nv_rtt_max_rate; /* max rate seen during current RTT */
u32 nv_rtt_start_seq; /* current RTT ends when packet arrives
* acking beyond nv_rtt_start_seq */
@@ -121,7 +125,6 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
ca->nv_reset = 0;
- ca->loss_cwnd = 0;
ca->nv_no_cong_cnt = 0;
ca->nv_rtt_cnt = 0;
ca->nv_last_rtt = 0;
@@ -134,9 +137,24 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
static void tcpnv_init(struct sock *sk)
{
struct tcpnv *ca = inet_csk_ca(sk);
+ int base_rtt;
tcpnv_reset(ca, sk);
+ /* See if base_rtt is available from socket_ops bpf program.
+ * It is meant to be used in environments, such as communication
+ * within a datacenter, where we have reasonable estimates of
+ * RTTs
+ */
+ base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT);
+ if (base_rtt > 0) {
+ ca->nv_base_rtt = base_rtt;
+ ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
+ } else {
+ ca->nv_base_rtt = 0;
+ ca->nv_lower_bound_rtt = 0;
+ }
+
ca->nv_allow_cwnd_growth = 1;
ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ;
ca->nv_min_rtt = NV_INIT_RTT;
@@ -146,6 +164,19 @@ static void tcpnv_init(struct sock *sk)
ca->cwnd_growth_factor = 0;
}
+/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt)
+ * bounds to RTT.
+ */
+inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val)
+{
+ if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt)
+ return ca->nv_lower_bound_rtt;
+ else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt)
+ return ca->nv_base_rtt;
+ else
+ return val;
+}
+
static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -177,19 +208,10 @@ static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 tcpnv_recalc_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- struct tcpnv *ca = inet_csk_ca(sk);
- ca->loss_cwnd = tp->snd_cwnd;
return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
}
-static u32 tcpnv_undo_cwnd(struct sock *sk)
-{
- struct tcpnv *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
static void tcpnv_state(struct sock *sk, u8 new_state)
{
struct tcpnv *ca = inet_csk_ca(sk);
@@ -220,7 +242,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
struct tcp_sock *tp = tcp_sk(sk);
struct tcpnv *ca = inet_csk_ca(sk);
unsigned long now = jiffies;
- s64 rate64 = 0;
+ u64 rate64;
u32 rate, max_win, cwnd_by_slope;
u32 avg_rtt;
u32 bytes_acked = 0;
@@ -262,8 +284,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
}
/* rate in 100's bits per second */
- rate64 = ((u64)sample->in_flight) * 8000000;
- rate = (u32)div64_u64(rate64, (u64)(avg_rtt * 100));
+ rate64 = ((u64)sample->in_flight) * 80000;
+ do_div(rate64, avg_rtt ?: 1);
+ rate = (u32)rate64;
/* Remember the maximum rate seen during this RTT
* Note: It may be more than one RTT. This function should be
@@ -276,6 +299,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
if (ca->nv_eval_call_cnt < 255)
ca->nv_eval_call_cnt++;
+ /* Apply bounds to rtt. Only used to update min_rtt */
+ avg_rtt = nv_get_bounded_rtt(ca, avg_rtt);
+
/* update min rtt if necessary */
if (avg_rtt < ca->nv_min_rtt)
ca->nv_min_rtt = avg_rtt;
@@ -446,7 +472,7 @@ static struct tcp_congestion_ops tcpnv __read_mostly = {
.ssthresh = tcpnv_recalc_ssthresh,
.cong_avoid = tcpnv_cong_avoid,
.set_state = tcpnv_state,
- .undo_cwnd = tcpnv_undo_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.pkts_acked = tcpnv_acked,
.get_info = tcpnv_get_info,
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 11f69bbf9307..b6a2aa1dcf56 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -149,11 +149,19 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
* is freed by GSO engine
*/
if (copy_destructor) {
+ int delta;
+
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
sum_truesize += skb->truesize;
- refcount_add(sum_truesize - gso_skb->truesize,
- &skb->sk->sk_wmem_alloc);
+ delta = sum_truesize - gso_skb->truesize;
+ /* In some pathological cases, delta can be negative.
+ * We need to either use refcount_add() or refcount_sub_and_test()
+ */
+ if (likely(delta >= 0))
+ refcount_add(delta, &skb->sk->sk_wmem_alloc);
+ else
+ WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc));
}
delta = htonl(oldlen + (skb_tail_pointer(skb) -
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4e985dea1dd2..a4d214c7b506 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -41,40 +41,25 @@
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
+#include <linux/static_key.h>
-/* People can turn this off for buggy TCP's found in printers etc. */
-int sysctl_tcp_retrans_collapse __read_mostly = 1;
-
-/* People can turn this on to work with those rare, broken TCPs that
- * interpret the window field as a signed quantity.
- */
-int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
-
-/* Default TSQ limit of four TSO segments */
-int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
-
-/* This limits the percentage of the congestion window which we
- * will allow a single TSO frame to consume. Building TSO frames
- * which are too large can cause TCP streams to be bursty.
- */
-int sysctl_tcp_tso_win_divisor __read_mostly = 3;
-
-/* By default, RFC2861 behavior. */
-int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+#include <trace/events/tcp.h>
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
/* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
- tcp_advance_send_head(sk, skb);
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ __skb_unlink(skb, &sk->sk_write_queue);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
tp->packets_out += tcp_skb_pcount(skb);
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -203,7 +188,7 @@ u32 tcp_default_init_rwnd(u32 mss)
* be a multiple of mss if possible. We assume here that mss >= 1.
* This MUST be enforced by all callers.
*/
-void tcp_select_initial_window(int __space, __u32 mss,
+void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
__u32 *rcv_wnd, __u32 *window_clamp,
int wscale_ok, __u8 *rcv_wscale,
__u32 init_rcv_wnd)
@@ -227,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
* which we interpret as a sign the remote TCP is not
* misinterpreting the window field as a signed quantity.
*/
- if (sysctl_tcp_workaround_signed_windows)
+ if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
(*rcv_wnd) = space;
@@ -235,7 +220,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
(*rcv_wscale) = 0;
if (wscale_ok) {
/* Set window scaling on max possible window */
- space = max_t(u32, space, sysctl_tcp_rmem[2]);
+ space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
@@ -287,7 +272,8 @@ static u16 tcp_select_window(struct sock *sk)
/* Make sure we do not exceed the maximum possible
* scaled window.
*/
- if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
+ if (!tp->rx_opt.rcv_wscale &&
+ sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -395,7 +381,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum = 0;
TCP_SKB_CB(skb)->tcp_flags = flags;
TCP_SKB_CB(skb)->sacked = 0;
@@ -418,6 +403,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
+#define OPTION_SMC (1 << 9)
+
+static void smc_options_write(__be32 *ptr, u16 *options)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ if (unlikely(OPTION_SMC & *options)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_EXP << 8) |
+ (TCPOLEN_EXP_SMC_BASE));
+ *ptr++ = htonl(TCPOPT_SMC_MAGIC);
+ }
+ }
+#endif
+}
struct tcp_out_options {
u16 options; /* bit field of OPTION_* */
@@ -536,6 +537,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
}
ptr += (len + 3) >> 2;
}
+
+ smc_options_write(ptr, &options);
+}
+
+static void smc_set_option(const struct tcp_sock *tp,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ if (tp->syn_smc) {
+ if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+ }
+ }
+ }
+#endif
+}
+
+static void smc_set_option_cond(const struct tcp_sock *tp,
+ const struct inet_request_sock *ireq,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ if (tp->syn_smc && ireq->smc_ok) {
+ if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+ }
+ }
+ }
+#endif
}
/* Compute TCP options for SYN packets. This is not the final
@@ -603,11 +639,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ smc_set_option(tp, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
/* Set up TCP options for SYN-ACKs. */
-static unsigned int tcp_synack_options(struct request_sock *req,
+static unsigned int tcp_synack_options(const struct sock *sk,
+ struct request_sock *req,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
const struct tcp_md5sig_key *md5,
@@ -663,6 +702,8 @@ static unsigned int tcp_synack_options(struct request_sock *req,
}
}
+ smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -739,8 +780,10 @@ static void tcp_tsq_handler(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (tp->lost_out > tp->retrans_out &&
- tp->snd_cwnd > tcp_packets_in_flight(tp))
+ tp->snd_cwnd > tcp_packets_in_flight(tp)) {
+ tcp_mstamp_refresh(tp);
tcp_xmit_retransmit_queue(sk);
+ }
tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
0, GFP_ATOMIC);
@@ -971,6 +1014,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
HRTIMER_MODE_ABS_PINNED);
}
+static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ skb->skb_mstamp = tp->tcp_mstamp;
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+}
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -991,6 +1040,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
+ struct sk_buff *oskb = NULL;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
int err;
@@ -998,19 +1048,22 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
- skb->skb_mstamp = tp->tcp_mstamp;
if (clone_it) {
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
- tcp_rate_skb_sent(sk, skb);
+ oskb = skb;
+
+ tcp_skb_tsorted_save(oskb) {
+ if (unlikely(skb_cloned(oskb)))
+ skb = pskb_copy(oskb, gfp_mask);
+ else
+ skb = skb_clone(oskb, gfp_mask);
+ } tcp_skb_tsorted_restore(oskb);
- if (unlikely(skb_cloned(skb)))
- skb = pskb_copy(skb, gfp_mask);
- else
- skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
}
+ skb->skb_mstamp = tp->tcp_mstamp;
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
@@ -1122,12 +1175,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
- if (likely(err <= 0))
- return err;
-
- tcp_enter_cwr(sk);
-
- return net_xmit_eval(err);
+ if (unlikely(err > 0)) {
+ tcp_enter_cwr(sk);
+ err = net_xmit_eval(err);
+ }
+ if (!err && oskb) {
+ tcp_update_skb_after_send(tp, oskb);
+ tcp_rate_skb_sent(sk, oskb);
+ }
+ return err;
}
/* This routine just queues the buffer for sending.
@@ -1162,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
}
}
-/* When a modification to fackets out becomes necessary, we need to check
- * skb is counted to fackets_out or not.
- */
-static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
- int decr)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (!tp->sacked_out || tcp_is_reno(tp))
- return;
-
- if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
- tp->fackets_out -= decr;
-}
-
/* Pcount in the middle of the write queue got changed, we need to do various
* tweaks to fix counters
*/
@@ -1197,11 +1238,9 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
if (tcp_is_reno(tp) && decr > 0)
tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
- tcp_adjust_fackets_out(sk, skb, decr);
-
if (tp->lost_skb_hint &&
before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
- (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
tp->lost_cnt_hint -= decr;
tcp_verify_left_out(tp);
@@ -1236,12 +1275,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
TCP_SKB_CB(skb)->eor = 0;
}
+/* Insert buff after skb on the write or rtx queue of sk. */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+ struct sk_buff *buff,
+ struct sock *sk,
+ enum tcp_queue tcp_queue)
+{
+ if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+ __skb_queue_after(&sk->sk_write_queue, skb, buff);
+ else
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point.
*/
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+ struct sk_buff *skb, u32 len,
unsigned int mss_now, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1324,7 +1376,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
/* Link BUFF into the send queue. */
__skb_header_release(buff);
- tcp_insert_write_queue_after(skb, buff, sk);
+ tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
+ if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
+ list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
return 0;
}
@@ -1602,7 +1656,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
- if (sysctl_tcp_slow_start_after_idle &&
+ if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
(s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
!ca_ops->cong_control)
tcp_cwnd_application_limited(sk);
@@ -1611,10 +1665,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
* is caused by insufficient sender buffer:
* 1) just sent some data (see tcp_write_xmit)
* 2) not cwnd limited (this else condition)
- * 3) no more data to send (null tcp_send_head )
+ * 3) no more data to send (tcp_write_queue_empty())
* 4) application is hitting buffer limit (SOCK_NOSPACE)
*/
- if (!tcp_send_head(sk) && sk->sk_socket &&
+ if (tcp_write_queue_empty(sk) && sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
(1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1666,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
{
u32 bytes, segs;
- bytes = min(sk->sk_pacing_rate >> 10,
+ bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
/* Goal is to send at least one packet per ms,
@@ -1689,7 +1743,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
return tso_segs ? :
- tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
+ tcp_tso_autosize(sk, mss_now,
+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
}
/* Returns the portion of skb which can be sent right away */
@@ -1803,40 +1858,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
return !after(end_seq, tcp_wnd_end(tp));
}
-/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
- * should be put on the wire right now. If so, it returns the number of
- * packets allowed by the congestion window.
- */
-static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
- unsigned int cur_mss, int nonagle)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- unsigned int cwnd_quota;
-
- tcp_init_tso_segs(skb, cur_mss);
-
- if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
- return 0;
-
- cwnd_quota = tcp_cwnd_test(tp, skb);
- if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
- cwnd_quota = 0;
-
- return cwnd_quota;
-}
-
-/* Test if sending is allowed right now. */
-bool tcp_may_send_now(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = tcp_send_head(sk);
-
- return skb &&
- tcp_snd_test(sk, skb, tcp_current_mss(sk),
- (tcp_skb_is_last(sk, skb) ?
- tp->nonagle : TCP_NAGLE_PUSH));
-}
-
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
* which is put after SKB on the list. It is very much like
* tcp_fragment() except that it may make several kinds of assumptions
@@ -1844,7 +1865,8 @@ bool tcp_may_send_now(struct sock *sk)
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+ struct sk_buff *skb, unsigned int len,
unsigned int mss_now, gfp_t gfp)
{
struct sk_buff *buff;
@@ -1853,7 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
- return tcp_fragment(sk, skb, len, mss_now, gfp);
+ return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
buff = sk_stream_alloc_skb(sk, 0, gfp, true);
if (unlikely(!buff))
@@ -1889,7 +1911,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* Link BUFF into the send queue. */
__skb_header_release(buff);
- tcp_insert_write_queue_after(skb, buff, sk);
+ tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
return 0;
}
@@ -1939,7 +1961,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
goto send_now;
- win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
+ win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
if (win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -1959,8 +1981,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
goto send_now;
}
- head = tcp_write_queue_head(sk);
-
+ /* TODO : use tsorted_sent_queue ? */
+ head = tcp_rtx_queue_head(sk);
+ if (!head)
+ goto send_now;
age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
/* If next ACK is likely to come too late (half srtt), do not defer */
if (age < (tp->srtt_us >> 4))
@@ -2091,6 +2115,7 @@ static int tcp_mtu_probe(struct sock *sk)
nskb->ip_summed = skb->ip_summed;
tcp_insert_write_queue_before(nskb, skb, sk);
+ tcp_highest_sack_replace(sk, skb, nskb);
len = 0;
tcp_for_write_queue_from_safe(skb, next, sk) {
@@ -2173,18 +2198,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
{
unsigned int limit;
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
- limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
+ limit = min_t(u32, limit,
+ sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
- /* Always send the 1st or 2nd skb in write queue.
+ /* Always send skb if rtx queue is empty.
* No need to wait for TX completion to call us back,
* after softirq/tasklet schedule.
* This helps when TX completions are delayed too much.
*/
- if (skb == sk->sk_write_queue.next ||
- skb->prev == sk->sk_write_queue.next)
+ if (tcp_rtx_queue_empty(sk))
return false;
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2202,9 +2227,10 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
{
const u32 now = tcp_jiffies32;
+ enum tcp_chrono old = tp->chrono_type;
- if (tp->chrono_type > TCP_CHRONO_UNSPEC)
- tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
+ if (old > TCP_CHRONO_UNSPEC)
+ tp->chrono_stat[old - 1] += now - tp->chrono_start;
tp->chrono_start = now;
tp->chrono_type = new;
}
@@ -2234,7 +2260,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
* it's the "most interesting" or current chrono we are
* tracking and starts busy chrono if we have pending data.
*/
- if (tcp_write_queue_empty(sk))
+ if (tcp_rtx_and_write_queues_empty(sk))
tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
else if (type == tp->chrono_type)
tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2267,6 +2293,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;
+ tcp_mstamp_refresh(tp);
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
@@ -2278,7 +2305,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
max_segs = tcp_tso_segs(sk, mss_now);
- tcp_mstamp_refresh(tp);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
@@ -2290,7 +2316,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp" is used as a start point for the retransmit timer */
- skb->skb_mstamp = tp->tcp_mstamp;
+ tcp_update_skb_after_send(tp, skb);
goto repair; /* Skip network transmission */
}
@@ -2329,7 +2355,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
nonagle);
if (skb->len > limit &&
- unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+ unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+ skb, limit, mss_now, gfp)))
break;
if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2364,67 +2391,61 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
- tcp_schedule_loss_probe(sk);
+ tcp_schedule_loss_probe(sk, false);
is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
- return !tp->packets_out && tcp_send_head(sk);
+ return !tp->packets_out && !tcp_write_queue_empty(sk);
}
-bool tcp_schedule_loss_probe(struct sock *sk)
+bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- u32 timeout, tlp_time_stamp, rto_time_stamp;
- u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
+ u32 timeout, rto_delta_us;
+ int early_retrans;
- /* No consecutive loss probes. */
- if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
- tcp_rearm_rto(sk);
- return false;
- }
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
if (tp->fastopen_rsk)
return false;
- /* TLP is only scheduled when next timer event is RTO. */
- if (icsk->icsk_pending != ICSK_TIME_RETRANS)
- return false;
-
+ early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+ if ((early_retrans != 3 && early_retrans != 4) ||
!tp->packets_out || !tcp_is_sack(tp) ||
icsk->icsk_ca_state != TCP_CA_Open)
return false;
if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
- tcp_send_head(sk))
+ !tcp_write_queue_empty(sk))
return false;
- /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+ /* Probe timeout is 2*rtt. Add minimum RTO to account
* for delayed ack when there's one outstanding packet. If no RTT
* sample is available then probe after TCP_TIMEOUT_INIT.
*/
- timeout = rtt << 1 ? : TCP_TIMEOUT_INIT;
- if (tp->packets_out == 1)
- timeout = max_t(u32, timeout,
- (rtt + (rtt >> 1) + TCP_DELACK_MAX));
- timeout = max_t(u32, timeout, msecs_to_jiffies(10));
-
- /* If RTO is shorter, just schedule TLP in its place. */
- tlp_time_stamp = tcp_jiffies32 + timeout;
- rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
- if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
- s32 delta = rto_time_stamp - tcp_jiffies32;
- if (delta > 0)
- timeout = delta;
+ if (tp->srtt_us) {
+ timeout = usecs_to_jiffies(tp->srtt_us >> 2);
+ if (tp->packets_out == 1)
+ timeout += TCP_RTO_MIN;
+ else
+ timeout += TCP_TIMEOUT_MIN;
+ } else {
+ timeout = TCP_TIMEOUT_INIT;
}
+ /* If the RTO formula yields an earlier time, then use that time. */
+ rto_delta_us = advancing_rto ?
+ jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
+ tcp_rto_delta_us(sk); /* How far in future is RTO? */
+ if (rto_delta_us > 0)
+ timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
+
inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
TCP_RTO_MAX);
return true;
@@ -2456,18 +2477,14 @@ void tcp_send_loss_probe(struct sock *sk)
int mss = tcp_current_mss(sk);
skb = tcp_send_head(sk);
- if (skb) {
- if (tcp_snd_wnd_test(tp, skb, mss)) {
- pcount = tp->packets_out;
- tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
- if (tp->packets_out > pcount)
- goto probe_sent;
- goto rearm_timer;
- }
- skb = tcp_write_queue_prev(sk, skb);
- } else {
- skb = tcp_write_queue_tail(sk);
+ if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+ pcount = tp->packets_out;
+ tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ if (tp->packets_out > pcount)
+ goto probe_sent;
+ goto rearm_timer;
}
+ skb = skb_rb_last(&sk->tcp_rtx_queue);
/* At most one outstanding TLP retransmission. */
if (tp->tlp_high_seq)
@@ -2485,10 +2502,11 @@ void tcp_send_loss_probe(struct sock *sk)
goto rearm_timer;
if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
- if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+ if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ (pcount - 1) * mss, mss,
GFP_ATOMIC)))
goto rearm_timer;
- skb = tcp_write_queue_next(sk, skb);
+ skb = skb_rb_next(skb);
}
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2688,7 +2706,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+ struct sk_buff *next_skb = skb_rb_next(skb);
int skb_size, next_skb_size;
skb_size = skb->len;
@@ -2703,9 +2721,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
else if (!skb_shift(skb, next_skb, next_skb_size))
return false;
}
- tcp_highest_sack_combine(sk, next_skb, skb);
-
- tcp_unlink_write_queue(next_skb, sk);
+ tcp_highest_sack_replace(sk, next_skb, skb);
if (next_skb->ip_summed == CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -2734,7 +2750,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
tcp_skb_collapse_tstamp(skb, next_skb);
- sk_wmem_free_skb(sk, next_skb);
+ tcp_rtx_queue_unlink_and_free(next_skb, sk);
return true;
}
@@ -2745,8 +2761,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
return false;
if (skb_cloned(skb))
return false;
- if (skb == tcp_send_head(sk))
- return false;
/* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
@@ -2764,12 +2778,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
struct sk_buff *skb = to, *tmp;
bool first = true;
- if (!sysctl_tcp_retrans_collapse)
+ if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
return;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
return;
- tcp_for_write_queue_from_safe(skb, tmp, sk) {
+ skb_rbtree_walk_from_safe(skb, tmp) {
if (!tcp_can_collapse(sk, skb))
break;
@@ -2844,7 +2858,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
len = cur_mss * segs;
if (skb->len > len) {
- if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+ if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+ cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
if (skb_unclone(skb, GFP_ATOMIC))
@@ -2878,16 +2893,23 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb;
- skb->skb_mstamp = tp->tcp_mstamp;
- nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
- -ENOBUFS;
+ tcp_skb_tsorted_save(skb) {
+ nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+ err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+ -ENOBUFS;
+ } tcp_skb_tsorted_restore(skb);
+
+ if (!err) {
+ tcp_update_skb_after_send(tp, skb);
+ tcp_rate_skb_sent(sk, skb);
+ }
} else {
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
if (likely(!err)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+ trace_tcp_retransmit_skb(sk, skb);
} else if (err != -EBUSY) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
@@ -2924,36 +2946,25 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
* retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either
* we've sent it all or the congestion window limit is reached.
- * If doing SACK, the first ACK which comes back for a timeout
- * based retransmit packet might feed us FACK information again.
- * If so, we use it to avoid unnecessarily retransmissions.
*/
void tcp_xmit_retransmit_queue(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb, *rtx_head, *hole = NULL;
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- struct sk_buff *hole = NULL;
u32 max_segs;
int mib_idx;
if (!tp->packets_out)
return;
- if (tp->retransmit_skb_hint) {
- skb = tp->retransmit_skb_hint;
- } else {
- skb = tcp_write_queue_head(sk);
- }
-
+ rtx_head = tcp_rtx_queue_head(sk);
+ skb = tp->retransmit_skb_hint ?: rtx_head;
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
- tcp_for_write_queue_from(skb, sk) {
+ skb_rbtree_walk_from(skb) {
__u8 sacked;
int segs;
- if (skb == tcp_send_head(sk))
- break;
-
if (tcp_pacing_check(sk))
break;
@@ -2998,7 +3009,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
- if (skb == tcp_write_queue_head(sk) &&
+ if (skb == rtx_head &&
icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
@@ -3040,12 +3051,15 @@ void tcp_send_fin(struct sock *sk)
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
- if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+ if (!tskb && tcp_under_memory_pressure(sk))
+ tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+ if (tskb) {
coalesce:
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
- if (!tcp_send_head(sk)) {
+ if (tcp_write_queue_empty(sk)) {
/* This means tskb was already sent.
* Pretend we included the FIN on previous transmit.
* We need to set tp->snd_nxt to the value it would have
@@ -3062,6 +3076,7 @@ coalesce:
goto coalesce;
return;
}
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3098,6 +3113,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* Send it off. */
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+
+ /* skb of trace_tcp_send_reset() keeps the skb that caused RST,
+ * skb here is different to the troublesome skb, so use NULL
+ */
+ trace_tcp_send_reset(sk, NULL);
}
/* Send a crossed SYN-ACK during socket establishment.
@@ -3110,20 +3130,24 @@ int tcp_send_synack(struct sock *sk)
{
struct sk_buff *skb;
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
- pr_debug("%s: wrong queue state\n", __func__);
+ pr_err("%s: wrong queue state\n", __func__);
return -EFAULT;
}
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
- struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+ struct sk_buff *nskb;
+
+ tcp_skb_tsorted_save(skb) {
+ nskb = skb_copy(skb, GFP_ATOMIC);
+ } tcp_skb_tsorted_restore(skb);
if (!nskb)
return -ENOMEM;
- tcp_unlink_write_queue(skb, sk);
+ INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
+ tcp_rtx_queue_unlink_and_free(skb, sk);
__skb_header_release(nskb);
- __tcp_add_write_queue_head(sk, nskb);
- sk_wmem_free_skb(sk, skb);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
sk->sk_wmem_queued += nskb->truesize;
sk_mem_charge(sk, nskb->truesize);
skb = nskb;
@@ -3200,8 +3224,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
- tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
- sizeof(*th);
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
+ foc) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -3214,13 +3238,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
skb->mark = ireq->ir_mark;
- /* Setting of flags are superfluous here for callers (and ECE is
- * not even correctly set)
- */
- tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
- TCPHDR_SYN | TCPHDR_ACK);
-
- th->seq = htonl(TCP_SKB_CB(skb)->seq);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ th->seq = htonl(tcp_rsk(req)->snt_isn);
/* XXX data is queued and acked as is. No buffer/window check */
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
@@ -3307,7 +3326,7 @@ static void tcp_connect_init(struct sock *sk)
if (rcv_wnd == 0)
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
- tcp_select_initial_window(tcp_full_space(sk),
+ tcp_select_initial_window(sk, tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
@@ -3346,7 +3365,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
tcb->end_seq += skb->len;
__skb_header_release(skb);
- __tcp_add_write_queue_tail(sk, skb);
sk->sk_wmem_queued += skb->truesize;
sk_mem_charge(sk, skb->truesize);
tp->write_seq = tcb->end_seq;
@@ -3394,6 +3412,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
int copied = copy_from_iter(skb_put(syn_data, space), space,
&fo->data->msg_iter);
if (unlikely(!copied)) {
+ tcp_skb_tsorted_anchor_cleanup(syn_data);
kfree_skb(syn_data);
goto fallback;
}
@@ -3424,10 +3443,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
if (!err) {
tp->syn_data = (fo->copied > 0);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
goto done;
}
+ /* data was not sent, put it in write_queue */
+ __skb_queue_tail(&sk->sk_write_queue, syn_data);
+ tp->packets_out -= tcp_skb_pcount(syn_data);
+
fallback:
/* Send a regular SYN with Fast Open cookie request option */
if (fo->cookie.len > 0)
@@ -3448,6 +3472,10 @@ int tcp_connect(struct sock *sk)
int err;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
+
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+ return -EHOSTUNREACH; /* Routing failure or similar. */
+
tcp_connect_init(sk);
if (unlikely(tp->repair)) {
@@ -3464,6 +3492,7 @@ int tcp_connect(struct sock *sk)
tp->retrans_stamp = tcp_time_stamp(tp);
tcp_connect_queue_skb(sk, buff);
tcp_ecn_send_syn(sk, buff);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
/* Send off SYN; include data in Fast Open. */
err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3476,6 +3505,11 @@ int tcp_connect(struct sock *sk)
*/
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
+ buff = tcp_send_head(sk);
+ if (unlikely(buff)) {
+ tp->snd_nxt = TCP_SKB_CB(buff)->seq;
+ tp->pushed_seq = TCP_SKB_CB(buff)->seq;
+ }
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
@@ -3653,7 +3687,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
skb->len > mss) {
seg_size = min(seg_size, mss);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
- if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+ if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+ skb, seg_size, mss, GFP_ATOMIC))
return -1;
} else if (!tcp_skb_pcount(skb))
tcp_set_skb_tso_segs(skb, mss);
@@ -3683,7 +3718,7 @@ void tcp_send_probe0(struct sock *sk)
err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
- if (tp->packets_out || !tcp_send_head(sk)) {
+ if (tp->packets_out || tcp_write_queue_empty(sk)) {
/* Cancel probe timer, if it is not required. */
icsk->icsk_probes_out = 0;
icsk->icsk_backoff = 0;
@@ -3724,6 +3759,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
if (unlikely(tcp_passive_fastopen(sk)))
tcp_sk(sk)->total_retrans++;
+ trace_tcp_retransmit_synack(sk, req);
}
return res;
}
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f6c50af24a64..697f4c67b2e3 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -105,8 +105,9 @@ static inline int tcp_probe_avail(void)
* Note: arguments must match tcp_rcv_established()!
*/
static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th)
{
+ unsigned int len = skb->len;
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_sock *inet = inet_sk(sk);
@@ -145,7 +146,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
BUG();
}
- p->length = skb->len;
+ p->length = len;
p->snd_nxt = tp->snd_nxt;
p->snd_una = tp->snd_una;
p->snd_cwnd = tp->snd_cwnd;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index fe9a493d0208..d3ea89020c69 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,8 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/tcp.h>
#include <net/tcp.h>
-int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
-
static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -45,7 +44,8 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
+ u32 min_rtt = tcp_min_rtt(tp);
+ struct sk_buff *skb, *n;
u32 reo_wnd;
*reo_timeout = 0;
@@ -55,48 +55,36 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
* to queuing or delayed ACKs.
*/
reo_wnd = 1000;
- if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
- reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+ if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) {
+ reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
+ reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
+ }
- tcp_for_write_queue(skb, sk) {
+ list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
+ tcp_tsorted_anchor) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ s32 remaining;
- if (skb == tcp_send_head(sk))
- break;
-
- /* Skip ones already (s)acked */
- if (!after(scb->end_seq, tp->snd_una) ||
- scb->sacked & TCPCB_SACKED_ACKED)
+ /* Skip ones marked lost but not yet retransmitted */
+ if ((scb->sacked & TCPCB_LOST) &&
+ !(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
- tp->rack.end_seq, scb->end_seq)) {
- /* Step 3 in draft-cheng-tcpm-rack-00.txt:
- * A packet is lost if its elapsed time is beyond
- * the recent RTT plus the reordering window.
- */
- u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
- skb->skb_mstamp);
- s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
-
- if (remaining < 0) {
- tcp_rack_mark_skb_lost(sk, skb);
- continue;
- }
-
- /* Skip ones marked lost but not yet retransmitted */
- if ((scb->sacked & TCPCB_LOST) &&
- !(scb->sacked & TCPCB_SACKED_RETRANS))
- continue;
+ if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+ tp->rack.end_seq, scb->end_seq))
+ break;
+ /* A packet is lost if it has not been s/acked beyond
+ * the recent RTT plus the reordering window.
+ */
+ remaining = tp->rack.rtt_us + reo_wnd -
+ tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+ if (remaining < 0) {
+ tcp_rack_mark_skb_lost(sk, skb);
+ list_del_init(&skb->tcp_tsorted_anchor);
+ } else {
/* Record maximum wait time (+1 to avoid 0) */
*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
-
- } else if (!(scb->sacked & TCPCB_RETRANS)) {
- /* Original data are sent sequentially so stop early
- * b/c the rest are all sent after rack_sent
- */
- break;
}
}
}
@@ -113,7 +101,7 @@ void tcp_rack_mark_lost(struct sock *sk)
tp->rack.advanced = 0;
tcp_rack_detect_loss(sk, &timeout);
if (timeout) {
- timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
+ timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}
@@ -175,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk)
if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
tcp_rearm_rto(sk);
}
+
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is possibility that spurious retransmission was
+ * due to reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * no. of successful recoveries (accounts for full DSACK-based loss
+ * recovery undo). After that, reset it to default (min_rtt/4).
+ *
+ * At max, reo_wnd is incremented only once per rtt. So that the new
+ * DSACK on which we are reacting, is due to the spurious retx (approx)
+ * after the reo_wnd has been updated last time.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND ||
+ !rs->prior_delivered)
+ return;
+
+ /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+ if (before(rs->prior_delivered, tp->rack.last_delivered))
+ tp->rack.dsack_seen = 0;
+
+ /* Adjust the reo_wnd if update is pending */
+ if (tp->rack.dsack_seen) {
+ tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+ tp->rack.reo_wnd_steps + 1);
+ tp->rack.dsack_seen = 0;
+ tp->rack.last_delivered = tp->delivered;
+ tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+ } else if (!tp->rack.reo_wnd_persist) {
+ tp->rack.reo_wnd_steps = 1;
+ }
+}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index f2123075ce6e..addc122f8818 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,10 +15,6 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-struct scalable {
- u32 loss_cwnd;
-};
-
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -36,23 +32,13 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- struct scalable *ca = inet_csk_ca(sk);
-
- ca->loss_cwnd = tp->snd_cwnd;
return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
}
-static u32 tcp_scalable_cwnd_undo(struct sock *sk)
-{
- const struct scalable *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
-}
-
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
- .undo_cwnd = tcp_scalable_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_scalable_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c0feeeef962a..16df6dd44b98 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,8 +22,6 @@
#include <linux/gfp.h>
#include <net/tcp.h>
-int sysctl_tcp_thin_linear_timeouts __read_mostly;
-
/**
* tcp_write_err() - close socket and save error info
* @sk: The socket the error has appeared on.
@@ -109,26 +107,23 @@ static int tcp_orphan_retries(struct sock *sk, bool alive)
static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
{
- struct net *net = sock_net(sk);
+ const struct net *net = sock_net(sk);
+ int mss;
/* Black hole detection */
- if (net->ipv4.sysctl_tcp_mtu_probing) {
- if (!icsk->icsk_mtup.enabled) {
- icsk->icsk_mtup.enabled = 1;
- icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
- tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- } else {
- struct net *net = sock_net(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- int mss;
-
- mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
- mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
- mss = max(mss, 68 - tp->tcp_header_len);
- icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
- tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- }
+ if (!net->ipv4.sysctl_tcp_mtu_probing)
+ return;
+
+ if (!icsk->icsk_mtup.enabled) {
+ icsk->icsk_mtup.enabled = 1;
+ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
+ } else {
+ mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+ mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
+ mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
}
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
@@ -156,8 +151,13 @@ static bool retransmits_timed_out(struct sock *sk,
return false;
start_ts = tcp_sk(sk)->retrans_stamp;
- if (unlikely(!start_ts))
- start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+ if (unlikely(!start_ts)) {
+ struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+ if (!head)
+ return false;
+ start_ts = tcp_skb_timestamp(head);
+ }
if (likely(timeout == 0)) {
linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk)
/* Called with BH disabled */
void tcp_delack_timer_handler(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
sk_mem_reclaim_partial(sk);
@@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk)
}
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
- if (!skb_queue_empty(&tp->ucopy.prequeue)) {
- struct sk_buff *skb;
-
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
-
- while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk_backlog_rcv(sk, skb);
-
- tp->ucopy.memory = 0;
- }
-
if (inet_csk_ack_scheduled(sk)) {
if (!icsk->icsk_ack.pingpong) {
/* Delayed ACK missed: inflate ATO. */
@@ -295,15 +283,17 @@ out:
*
* Returns: Nothing (void)
*/
-static void tcp_delack_timer(unsigned long data)
+static void tcp_delack_timer(struct timer_list *t)
{
- struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk =
+ from_timer(icsk, t, icsk_delack_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
tcp_delack_timer_handler(sk);
} else {
- inet_csk(sk)->icsk_ack.blocked = 1;
+ icsk->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
@@ -316,11 +306,12 @@ static void tcp_delack_timer(unsigned long data)
static void tcp_probe_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb = tcp_send_head(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
u32 start_ts;
- if (tp->packets_out || !tcp_send_head(sk)) {
+ if (tp->packets_out || !skb) {
icsk->icsk_probes_out = 0;
return;
}
@@ -333,9 +324,9 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
- start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+ start_ts = tcp_skb_timestamp(skb);
if (!start_ts)
- tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp;
+ skb->skb_mstamp = tp->tcp_mstamp;
else if (icsk->icsk_user_timeout &&
(s32)(tcp_time_stamp(tp) - start_ts) >
jiffies_to_msecs(icsk->icsk_user_timeout))
@@ -420,7 +411,7 @@ void tcp_retransmit_timer(struct sock *sk)
if (!tp->packets_out)
goto out;
- WARN_ON(tcp_write_queue_empty(sk));
+ WARN_ON(tcp_rtx_queue_empty(sk));
tp->tlp_high_seq = 0;
@@ -453,7 +444,7 @@ void tcp_retransmit_timer(struct sock *sk)
goto out;
}
tcp_enter_loss(sk);
- tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
+ tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
@@ -485,7 +476,7 @@ void tcp_retransmit_timer(struct sock *sk)
tcp_enter_loss(sk);
- if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
+ if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
* do not backoff.
*/
@@ -526,7 +517,7 @@ out_reset_timer:
* linear-timeout retransmissions into a black hole
*/
if (sk->sk_state == TCP_ESTABLISHED &&
- (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
+ (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) &&
tcp_stream_is_thin(tp) &&
icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
icsk->icsk_backoff = 0;
@@ -582,9 +573,11 @@ out:
sk_mem_reclaim(sk);
}
-static void tcp_write_timer(unsigned long data)
+static void tcp_write_timer(struct timer_list *t)
{
- struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk =
+ from_timer(icsk, t, icsk_retransmit_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
@@ -619,9 +612,9 @@ void tcp_set_keepalive(struct sock *sk, int val)
EXPORT_SYMBOL_GPL(tcp_set_keepalive);
-static void tcp_keepalive_timer (unsigned long data)
+static void tcp_keepalive_timer (struct timer_list *t)
{
- struct sock *sk = (struct sock *) data;
+ struct sock *sk = from_timer(sk, t, sk_timer);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 elapsed;
@@ -652,13 +645,14 @@ static void tcp_keepalive_timer (unsigned long data)
goto death;
}
- if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
+ if (!sock_flag(sk, SOCK_KEEPOPEN) ||
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
goto out;
elapsed = keepalive_time_when(tp);
/* It is alive without keepalive 8) */
- if (tp->packets_out || tcp_send_head(sk))
+ if (tp->packets_out || !tcp_write_queue_empty(sk))
goto resched;
elapsed = keepalive_time_elapsed(tp);
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 2417f55374c5..6bb9e14c710a 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -122,14 +122,14 @@ int tcp_set_ulp(struct sock *sk, const char *name)
ulp_ops = __tcp_ulp_find_autoload(name);
if (!ulp_ops)
- err = -ENOENT;
- else
- err = ulp_ops->init(sk);
+ return -ENOENT;
- if (err)
- goto out;
+ err = ulp_ops->init(sk);
+ if (err) {
+ module_put(ulp_ops->owner);
+ return err;
+ }
icsk->icsk_ulp_ops = ulp_ops;
- out:
- return err;
+ return 0;
}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 218cfcc77650..ee113ff15fd0 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
{
- return min(tp->snd_ssthresh, tp->snd_cwnd-1);
+ return min(tp->snd_ssthresh, tp->snd_cwnd);
}
static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index 248cfc0ff9ae..4f24d0e37d9c 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* TCP Vegas congestion control interface
*/
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 76005d4b8dfc..6fcf482d611b 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -30,7 +30,6 @@ struct veno {
u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
u32 inc; /* decide whether to increase cwnd */
u32 diff; /* calculate the diff rate */
- u32 loss_cwnd; /* cwnd when loss occured */
};
/* There are several situations when we must "re-start" Veno:
@@ -194,7 +193,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
- veno->loss_cwnd = tp->snd_cwnd;
if (veno->diff < beta)
/* in "non-congestive state", cut cwnd by 1/5 */
return max(tp->snd_cwnd * 4 / 5, 2U);
@@ -203,17 +201,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
return max(tp->snd_cwnd >> 1U, 2U);
}
-static u32 tcp_veno_cwnd_undo(struct sock *sk)
-{
- const struct veno *veno = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd);
-}
-
static struct tcp_congestion_ops tcp_veno __read_mostly = {
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
- .undo_cwnd = tcp_veno_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_veno_cong_avoid,
.pkts_acked = tcp_veno_pkts_acked,
.set_state = tcp_veno_state,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index e6ff99c4bd3b..96e829b2e2fc 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -37,7 +37,6 @@ struct yeah {
u32 fast_count;
u32 pkts_acked;
- u32 loss_cwnd;
};
static void tcp_yeah_init(struct sock *sk)
@@ -220,22 +219,14 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
- yeah->loss_cwnd = tp->snd_cwnd;
return max_t(int, tp->snd_cwnd - reduction, 2);
}
-static u32 tcp_yeah_cwnd_undo(struct sock *sk)
-{
- const struct yeah *yeah = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd);
-}
-
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh,
- .undo_cwnd = tcp_yeah_cwnd_undo,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_yeah_cong_avoid,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b057653ceca9..e4ff25c947c5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -231,10 +231,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
}
}
- /* Initial allocation may have already happened via setsockopt */
- if (!rcu_access_pointer(sk->sk_reuseport_cb))
- return reuseport_alloc(sk);
- return 0;
+ return reuseport_alloc(sk);
}
/**
@@ -380,8 +377,8 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned short hnum, int dif,
- bool exact_dif)
+ __be32 daddr, unsigned short hnum,
+ int dif, int sdif, bool exact_dif)
{
int score;
struct inet_sock *inet;
@@ -413,10 +410,15 @@ static int compute_score(struct sock *sk, struct net *net,
}
if (sk->sk_bound_dev_if || exact_dif) {
- if (sk->sk_bound_dev_if != dif)
+ bool dev_match = (sk->sk_bound_dev_if == dif ||
+ sk->sk_bound_dev_if == sdif);
+
+ if (exact_dif && !dev_match)
return -1;
- score += 4;
+ if (sk->sk_bound_dev_if && dev_match)
+ score += 4;
}
+
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
return score;
@@ -436,10 +438,11 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
- struct udp_hslot *hslot2,
- struct sk_buff *skb)
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif, bool exact_dif,
+ struct udp_hslot *hslot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness, matches = 0, reuseport = 0;
@@ -449,7 +452,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -477,8 +480,8 @@ static struct sock *udp4_lib_lookup2(struct net *net,
* harder than this. -DaveM
*/
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
- __be16 sport, __be32 daddr, __be16 dport,
- int dif, struct udp_table *udptable, struct sk_buff *skb)
+ __be16 sport, __be32 daddr, __be16 dport, int dif,
+ int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
@@ -496,7 +499,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
@@ -511,7 +514,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
}
return result;
@@ -521,7 +524,7 @@ begin:
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -554,7 +557,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- udptable, skb);
+ inet_sdif(skb), udptable, skb);
}
struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
@@ -576,7 +579,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
struct sock *sk;
sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
- dif, &udp_table, NULL);
+ dif, 0, &udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
@@ -587,7 +590,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup);
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif, unsigned short hnum)
+ int dif, int sdif, unsigned short hnum)
{
struct inet_sock *inet = inet_sk(sk);
@@ -597,9 +600,10 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
(inet->inet_dport != rmt_port && inet->inet_dport) ||
(inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
ipv6_only_sock(sk) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+ sk->sk_bound_dev_if != sdif))
return false;
- if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
+ if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
return false;
return true;
}
@@ -628,8 +632,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex, udptable,
- NULL);
+ iph->saddr, uh->source, skb->dev->ifindex, 0,
+ udptable, NULL);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
@@ -802,7 +806,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
- else if (sk->sk_no_check_tx) { /* UDP csum disabled */
+ else if (sk->sk_no_check_tx) { /* UDP csum off */
skb->ip_summed = CHECKSUM_NONE;
goto send;
@@ -1054,7 +1058,7 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- net_dbg_ratelimited("cork app bug 2\n");
+ net_dbg_ratelimited("socket already corked\n");
err = -EINVAL;
goto out;
}
@@ -1137,7 +1141,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
if (unlikely(!up->pending)) {
release_sock(sk);
- net_dbg_ratelimited("udp cork app bug 3\n");
+ net_dbg_ratelimited("cork failed\n");
return -EINVAL;
}
@@ -1163,34 +1167,36 @@ out:
return ret;
}
-#if BITS_PER_LONG == 64
+#define UDP_SKB_IS_STATELESS 0x80000000
+
static void udp_set_dev_scratch(struct sk_buff *skb)
{
- struct udp_dev_scratch *scratch;
+ struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));
- scratch = (struct udp_dev_scratch *)&skb->dev_scratch;
- scratch->truesize = skb->truesize;
+ scratch->_tsize_state = skb->truesize;
+#if BITS_PER_LONG == 64
scratch->len = skb->len;
scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
scratch->is_linear = !skb_is_nonlinear(skb);
+#endif
+ /* all head states execept sp (dst, sk, nf) are always cleared by
+ * udp_rcv() and we need to preserve secpath, if present, to eventually
+ * process IP_CMSG_PASSSEC at recvmsg() time
+ */
+ if (likely(!skb_sec_path(skb)))
+ scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}
static int udp_skb_truesize(struct sk_buff *skb)
{
- return ((struct udp_dev_scratch *)&skb->dev_scratch)->truesize;
-}
-#else
-static void udp_set_dev_scratch(struct sk_buff *skb)
-{
- skb->dev_scratch = skb->truesize;
+ return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
}
-static int udp_skb_truesize(struct sk_buff *skb)
+static bool udp_skb_has_head_state(struct sk_buff *skb)
{
- return skb->dev_scratch;
+ return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS);
}
-#endif
/* fully reclaim rmem/fwd memory allocated for skb */
static void udp_rmem_release(struct sock *sk, int size, int partial,
@@ -1203,8 +1209,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
if (likely(partial)) {
up->forward_deficit += size;
size = up->forward_deficit;
- if (size < (sk->sk_rcvbuf >> 2) &&
- !skb_queue_empty(&up->reader_queue))
+ if (size < (sk->sk_rcvbuf >> 2))
return;
} else {
size += up->forward_deficit;
@@ -1388,12 +1393,15 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
unlock_sock_fast(sk, slow);
}
- /* we cleared the head states previously only if the skb lacks any IP
- * options, see __udp_queue_rcv_skb().
+ if (!skb_unref(skb))
+ return;
+
+ /* In the more common cases we cleared the head states previously,
+ * see __udp_queue_rcv_skb().
*/
- if (unlikely(IPCB(skb)->opt.optlen > 0))
+ if (unlikely(udp_skb_has_head_state(skb)))
skb_release_head_state(skb);
- consume_stateless_skb(skb);
+ __consume_stateless_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_consume_udp);
@@ -1576,7 +1584,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
return ip_recv_error(sk, msg, len, addr_len);
try_again:
- peeking = off = sk_peek_offset(sk, flags);
+ peeking = flags & MSG_PEEK;
+ off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
@@ -1784,13 +1793,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id_once(sk, skb);
}
- /* At recvmsg() time we need skb->dst to process IP options-related
- * cmsg, elsewhere can we clear all pending head states while they are
- * hot in the cache
- */
- if (likely(IPCB(skb)->opt.optlen == 0))
- skb_release_head_state(skb);
-
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -1811,8 +1813,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
static struct static_key udp_encap_needed __read_mostly;
void udp_encap_enable(void)
{
- if (!static_key_enabled(&udp_encap_needed))
- static_key_slow_inc(&udp_encap_needed);
+ static_key_enable(&udp_encap_needed);
}
EXPORT_SYMBOL(udp_encap_enable);
@@ -1851,7 +1852,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
*/
/* if we're overly short, let UDP handle it */
- encap_rcv = ACCESS_ONCE(up->encap_rcv);
+ encap_rcv = READ_ONCE(up->encap_rcv);
if (encap_rcv) {
int ret;
@@ -1930,15 +1931,18 @@ drop:
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
-static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{
struct dst_entry *old;
if (dst_hold_safe(dst)) {
old = xchg(&sk->sk_rx_dst, dst);
dst_release(old);
+ return old != dst;
}
+ return false;
}
+EXPORT_SYMBOL(udp_sk_rx_dst_set);
/*
* Multicasts and broadcasts go to each listener.
@@ -1957,6 +1961,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
unsigned int offset = offsetof(typeof(*sk), sk_node);
int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
@@ -1971,7 +1976,7 @@ start_lookup:
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
- uh->source, saddr, dif, hnum))
+ uh->source, saddr, dif, sdif, hnum))
continue;
if (!first) {
@@ -2161,7 +2166,7 @@ drop:
static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif)
+ int dif, int sdif)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(loc_port);
@@ -2175,7 +2180,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
result = NULL;
sk_for_each_rcu(sk, &hslot->head) {
if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
- rmt_port, rmt_addr, dif, hnum)) {
+ rmt_port, rmt_addr, dif, sdif, hnum)) {
if (result)
return NULL;
result = sk;
@@ -2192,7 +2197,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif)
+ int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
@@ -2204,7 +2209,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (INET_MATCH(sk, net, acookie, rmt_addr,
- loc_addr, ports, dif))
+ loc_addr, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
@@ -2212,47 +2217,46 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
return NULL;
}
-void udp_v4_early_demux(struct sk_buff *skb)
+int udp_v4_early_demux(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ struct in_device *in_dev = NULL;
const struct iphdr *iph;
const struct udphdr *uh;
struct sock *sk = NULL;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
int ours;
/* validate the packet */
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
- return;
+ return 0;
iph = ip_hdr(skb);
uh = udp_hdr(skb);
- if (skb->pkt_type == PACKET_BROADCAST ||
- skb->pkt_type == PACKET_MULTICAST) {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ if (skb->pkt_type == PACKET_MULTICAST) {
+ in_dev = __in_dev_get_rcu(skb->dev);
if (!in_dev)
- return;
+ return 0;
- /* we are supposed to accept bcast packets */
- if (skb->pkt_type == PACKET_MULTICAST) {
- ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
- iph->protocol);
- if (!ours)
- return;
- }
+ ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+ iph->protocol);
+ if (!ours)
+ return 0;
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
+ uh->source, iph->saddr,
+ dif, sdif);
} else if (skb->pkt_type == PACKET_HOST) {
sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
+ uh->source, iph->saddr, dif, sdif);
}
if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
- return;
+ return 0;
skb->sk = sk;
skb->destructor = sock_efree;
@@ -2261,12 +2265,23 @@ void udp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst) {
+ u32 itag = 0;
+
/* set noref for now.
* any place which wants to hold dst has to call
* dst_hold_safe()
*/
skb_dst_set_noref(skb, dst);
+
+ /* for unconnected multicast sockets we need to validate
+ * the source on each packet
+ */
+ if (!inet_sk(sk)->inet_daddr && in_dev)
+ return ip_mc_validate_source(skb, iph->daddr,
+ iph->saddr, iph->tos,
+ skb->dev, in_dev, &itag);
}
+ return 0;
}
int udp_rcv(struct sk_buff *skb)
@@ -2282,7 +2297,7 @@ void udp_destroy_sock(struct sock *sk)
unlock_sock_fast(sk, slow);
if (static_key_false(&udp_encap_needed) && up->encap_type) {
void (*encap_destroy)(struct sock *sk);
- encap_destroy = ACCESS_ONCE(up->encap_destroy);
+ encap_destroy = READ_ONCE(up->encap_destroy);
if (encap_destroy)
encap_destroy(sk);
}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 4515836d2a3a..d0390d844ac8 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -45,7 +45,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
req->id.idiag_dst[0], req->id.idiag_dport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6)
sk = __udp6_lib_lookup(net,
@@ -53,7 +53,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
req->id.idiag_sport,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
#endif
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
@@ -182,7 +182,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
sk = __udp4_lib_lookup(net,
req->id.idiag_dst[0], req->id.idiag_dport,
req->id.idiag_src[0], req->id.idiag_sport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6) {
if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
@@ -190,7 +190,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
sk = __udp4_lib_lookup(net,
req->id.idiag_dst[3], req->id.idiag_dport,
req->id.idiag_src[3], req->id.idiag_sport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
else
sk = __udp6_lib_lookup(net,
@@ -198,7 +198,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
req->id.idiag_dport,
(struct in6_addr *)req->id.idiag_src,
req->id.idiag_sport,
- req->id.idiag_if, tbl, NULL);
+ req->id.idiag_if, 0, tbl, NULL);
}
#endif
else {
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index a8cf8c6fb60c..e7d18b140287 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _UDP4_IMPL_H
#define _UDP4_IMPL_H
#include <net/udp.h>
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 781250151d40..01801b77bd0d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
__be16 new_protocol, bool is_ipv6)
{
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
- bool remcsum, need_csum, offload_csum, ufo, gso_partial;
+ bool remcsum, need_csum, offload_csum, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct udphdr *uh = udp_hdr(skb);
u16 mac_offset = skb->mac_header;
@@ -61,8 +61,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
skb->remcsum_offload = remcsum;
- ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
-
need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
@@ -77,7 +75,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* outer one so strip the existing checksum feature flags and
* instead set the flag based on our outer checksum offload value.
*/
- if (remcsum || ufo) {
+ if (remcsum) {
features &= ~NETIF_F_CSUM_MASK;
if (!need_csum || offload_csum)
features |= NETIF_F_HW_CSUM;
@@ -122,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* will be using a length value equal to only one MSS sized
* segment instead of the entire frame.
*/
- if (gso_partial) {
+ if (gso_partial && skb_is_gso(skb)) {
uh->len = htons(skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)uh);
@@ -212,15 +210,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
if (unlikely(skb->len <= mss))
goto out;
- if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
- /* Packet is from an untrusted source, reset gso_segs. */
-
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
-
- segs = NULL;
- goto out;
- }
-
/* Do software UFO. Complete and fill in the UDP checksum as
* HW cannot do checksum of UDP packets sent as multiple
* IP fragments.
@@ -235,7 +224,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
/* If there is no outer header we can fake a checksum offload
* due to the fact that we have already done the checksum in
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 58bd39fb14b4..6539ff15e9a3 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -82,7 +82,8 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
struct sock *sk = sock->sk;
struct udp_tunnel_info ti;
- if (!dev->netdev_ops->ndo_udp_tunnel_add)
+ if (!dev->netdev_ops->ndo_udp_tunnel_add ||
+ !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
return;
ti.type = type;
@@ -93,6 +94,24 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
}
EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
+void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
+ unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct udp_tunnel_info ti;
+
+ if (!dev->netdev_ops->ndo_udp_tunnel_del ||
+ !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ return;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port);
+
/* Notify netdevs that UDP port started listening */
void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
{
@@ -109,6 +128,8 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
for_each_netdev_rcu(net, dev) {
if (!dev->netdev_ops->ndo_udp_tunnel_add)
continue;
+ if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ continue;
dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
}
rcu_read_unlock();
@@ -131,6 +152,8 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
for_each_netdev_rcu(net, dev) {
if (!dev->netdev_ops->ndo_udp_tunnel_del)
continue;
+ if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ continue;
dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
}
rcu_read_unlock();
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 1fc684111ce6..e50b7fea57ee 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* xfrm4_input.c
*
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 71b4ecc195c7..05017e2c849c 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* xfrm4_policy.c
*
@@ -20,7 +21,8 @@
static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
int tos, int oif,
const xfrm_address_t *saddr,
- const xfrm_address_t *daddr)
+ const xfrm_address_t *daddr,
+ u32 mark)
{
struct rtable *rt;
@@ -28,6 +30,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
fl4->daddr = daddr->a4;
fl4->flowi4_tos = tos;
fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
+ fl4->flowi4_mark = mark;
if (saddr)
fl4->saddr = saddr->a4;
@@ -42,20 +45,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
- const xfrm_address_t *daddr)
+ const xfrm_address_t *daddr,
+ u32 mark)
{
struct flowi4 fl4;
- return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr);
+ return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark);
}
static int xfrm4_get_saddr(struct net *net, int oif,
- xfrm_address_t *saddr, xfrm_address_t *daddr)
+ xfrm_address_t *saddr, xfrm_address_t *daddr,
+ u32 mark)
{
struct dst_entry *dst;
struct flowi4 fl4;
- dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr);
+ dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark);
if (IS_ERR(dst))
return -EHOSTUNREACH;
@@ -213,14 +218,6 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
fl4->flowi4_tos = iph->tos;
}
-static inline int xfrm4_garbage_collect(struct dst_ops *ops)
-{
- struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
-
- xfrm_garbage_collect_deferred(net);
- return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
-}
-
static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu)
{
@@ -259,14 +256,13 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
- .gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
.redirect = xfrm4_redirect,
.cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = INT_MAX,
+ .gc_thresh = 32768,
};
static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index d6660a8c0ea5..80c40b4981bb 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* xfrm4_state.c
*
OpenPOWER on IntegriCloud