Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/ah4.c                                   |   2
-rw-r--r--  net/ipv4/esp4.c                                  |   2
-rw-r--r--  net/ipv4/fib_semantics.c                         |   4
-rw-r--r--  net/ipv4/inet_fragment.c                         |   2
-rw-r--r--  net/ipv4/inet_timewait_sock.c                    |   3
-rw-r--r--  net/ipv4/ip_gre.c                                |  58
-rw-r--r--  net/ipv4/netfilter/arp_tables.c                  |  22
-rw-r--r--  net/ipv4/netfilter/ip_tables.c                   |  23
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c   |   2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c     |  28
-rw-r--r--  net/ipv4/proc.c                                  |   1
-rw-r--r--  net/ipv4/route.c                                 |  16
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                       |  51
-rw-r--r--  net/ipv4/tcp.c                                   |  24
-rw-r--r--  net/ipv4/tcp_cong.c                              |  76
-rw-r--r--  net/ipv4/tcp_input.c                             | 236
-rw-r--r--  net/ipv4/tcp_ipv4.c                              |  22
-rw-r--r--  net/ipv4/tcp_metrics.c                           |   4
-rw-r--r--  net/ipv4/tcp_minisocks.c                         |   6
-rw-r--r--  net/ipv4/tcp_offload.c                           |  12
-rw-r--r--  net/ipv4/tcp_output.c                            |  38
-rw-r--r--  net/ipv4/udp.c                                   |   4
22 files changed, 329 insertions(+), 307 deletions(-)
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 37db44f60718..4dd95cdd8070 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -240,7 +240,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
if (err == -EINPROGRESS)
goto out;
- if (err == -EBUSY)
+ if (err == -ENOSPC)
err = NET_XMIT_DROP;
goto out_free;
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b00e4a43b4dc..d57aa64fa7c7 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -432,7 +432,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
case -EINPROGRESS:
goto error;
- case -EBUSY:
+ case -ENOSPC:
err = NET_XMIT_DROP;
break;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 589caaa90613..f04d944f8abe 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
bool ecn_ca = false;
nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
} else {
val = nla_get_u32(nla);
}
@@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC)
return -EINVAL;
} else {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 7f3ef5c287a1..26a3d0315728 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -164,7 +164,7 @@ static void inet_frag_worker(struct work_struct *work)
local_bh_disable();
- for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+ for (i = READ_ONCE(f->next_bucket); budget; --budget) {
evicted += inet_evict_bucket(f, &f->hash[i]);
i = (i + 1) & (INETFRAGS_HASHSZ - 1);
if (evicted > INETFRAGS_EVICT_MAX)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a4bab81f1462..c690cd0d9b3f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -9,7 +9,6 @@
*/
#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/inet_hashtables.h>
@@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
if (tw) {
const struct inet_sock *inet = inet_sk(sk);
- kmemcheck_annotate_bitfield(tw, flags);
-
tw->tw_dr = dr;
/* Give us an identity. */
tw->tw_daddr = inet->inet_daddr;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index c105a315b1a3..bb6239169b1a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -773,20 +773,46 @@ free_skb:
return NETDEV_TX_OK;
}
+static void ipgre_link_update(struct net_device *dev, bool set_mtu)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int len;
+
+ len = tunnel->tun_hlen;
+ tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+ len = tunnel->tun_hlen - len;
+ tunnel->hlen = tunnel->hlen + len;
+
+ dev->needed_headroom = dev->needed_headroom + len;
+ if (set_mtu)
+ dev->mtu = max_t(int, dev->mtu - len, 68);
+
+ if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
+ if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
+ tunnel->encap.type == TUNNEL_ENCAP_NONE) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
+ dev->features |= NETIF_F_LLTX;
+ }
+}
+
static int ipgre_tunnel_ioctl(struct net_device *dev,
struct ifreq *ifr, int cmd)
{
- int err;
struct ip_tunnel_parm p;
+ int err;
if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
return -EFAULT;
+
if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
- ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+ p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
+ ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
}
+
p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
@@ -794,11 +820,22 @@ static int ipgre_tunnel_ioctl(struct net_device *dev,
if (err)
return err;
+ if (cmd == SIOCCHGTUNNEL) {
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ t->parms.i_flags = p.i_flags;
+ t->parms.o_flags = p.o_flags;
+
+ if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
+ ipgre_link_update(dev, true);
+ }
+
p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);
if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
return -EFAULT;
+
return 0;
}
@@ -1307,9 +1344,9 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
__u32 fwmark = t->fwmark;
+ struct ip_tunnel_parm p;
int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
@@ -1322,7 +1359,18 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
return err;
- return ip_tunnel_changelink(dev, tb, &p, fwmark);
+
+ err = ip_tunnel_changelink(dev, tb, &p, fwmark);
+ if (err < 0)
+ return err;
+
+ t->parms.i_flags = p.i_flags;
+ t->parms.o_flags = p.o_flags;
+
+ if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
+ ipgre_link_update(dev, !tb[IFLA_MTU]);
+
+ return 0;
}
static size_t ipgre_get_size(const struct net_device *dev)
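
The new ipgre_link_update() above recomputes the GRE header length when the tunnel's output flags change via SIOCCHGTUNNEL or changelink, grows needed_headroom, and optionally shrinks the MTU (floored at 68). A small numeric sketch of that adjustment; the flag-to-length mapping below is an assumption written out explicitly (4-byte base header, plus 4 bytes for each of checksum, key, and sequence):

#include <stdio.h>

/* Assumed GRE header sizing: base 4 bytes, 4 more per enabled option. */
static int gre_hlen(int csum, int key, int seq)
{
	return 4 + 4 * (csum + key + seq);
}

int main(void)
{
	int old_hlen = gre_hlen(0, 1, 0);        /* key only             */
	int new_hlen = gre_hlen(1, 1, 0);        /* key + checksum added */
	int len      = new_hlen - old_hlen;
	int mtu      = 1468;

	mtu = (mtu - len > 68) ? mtu - len : 68; /* max_t(int, mtu - len, 68) */
	printf("header grew by %d bytes, new mtu %d\n", len, mtu);
	return 0;
}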
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 9e2770fd00be..f88221aebc9d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -634,6 +634,25 @@ static void get_counters(const struct xt_table_info *t,
}
}
+static void get_old_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct arpt_entry *iter;
+ unsigned int cpu, i;
+
+ for_each_possible_cpu(cpu) {
+ i = 0;
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt);
+ ++i;
+ }
+ cond_resched();
+ }
+}
+
static struct xt_counters *alloc_counters(const struct xt_table *table)
{
unsigned int countersize;
@@ -910,8 +929,7 @@ static int __do_replace(struct net *net, const char *name,
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
- /* Get the old counters, and synchronize with replace */
- get_counters(oldinfo, counters);
+ get_old_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries;
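
get_old_counters() above (and its twin in ip_tables.c below) walks every possible CPU's private counter copy of the table being replaced and folds it into a single summary array. A toy userspace model of that fold, with invented sizes and plain additions standing in for ADD_COUNTER():

#include <stdio.h>

#define NCPUS    4
#define NENTRIES 3

struct counters { unsigned long bcnt, pcnt; };

int main(void)
{
	/* one private counter set per CPU, as the xt per-cpu counters are laid out */
	struct counters percpu[NCPUS][NENTRIES] = {
		{ {100, 1}, {200, 2}, {300, 3} },
		{ {110, 1}, {210, 2}, {310, 3} },
	};
	struct counters sum[NENTRIES] = { 0 };

	for (int cpu = 0; cpu < NCPUS; cpu++)
		for (int i = 0; i < NENTRIES; i++) {
			sum[i].bcnt += percpu[cpu][i].bcnt;  /* ADD_COUNTER() equivalent */
			sum[i].pcnt += percpu[cpu][i].pcnt;
		}

	for (int i = 0; i < NENTRIES; i++)
		printf("entry %d: %lu bytes, %lu packets\n", i, sum[i].bcnt, sum[i].pcnt);
	return 0;
}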
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 39286e543ee6..4cbe5e80f3bf 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -781,6 +781,26 @@ get_counters(const struct xt_table_info *t,
}
}
+static void get_old_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct ipt_entry *iter;
+ unsigned int cpu, i;
+
+ for_each_possible_cpu(cpu) {
+ i = 0;
+ xt_entry_foreach(iter, t->entries, t->size) {
+ const struct xt_counters *tmp;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt);
+ ++i; /* macro does multi eval of i */
+ }
+
+ cond_resched();
+ }
+}
+
static struct xt_counters *alloc_counters(const struct xt_table *table)
{
unsigned int countersize;
@@ -1070,8 +1090,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
- /* Get the old counters, and synchronize with replace */
- get_counters(oldinfo, counters);
+ get_old_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index fe374da4bc13..89af9d88ca21 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -344,7 +344,7 @@ static void ipv4_hooks_unregister(struct net *net)
mutex_unlock(&register_ipv4_hooks);
}
-struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
+const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
.l3proto = PF_INET,
.pkt_to_tuple = ipv4_pkt_to_tuple,
.invert_tuple = ipv4_invert_tuple,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index a046c298413a..1849fedd9b81 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -81,7 +81,6 @@ static int icmp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- u_int8_t pf,
unsigned int *timeout)
{
/* Do not immediately delete the connection after the first
@@ -165,6 +164,12 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
return NF_ACCEPT;
}
+static void icmp_error_log(const struct sk_buff *skb, struct net *net,
+ u8 pf, const char *msg)
+{
+ nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg);
+}
+
/* Small and modified version of icmp_rcv */
static int
icmp_error(struct net *net, struct nf_conn *tmpl,
@@ -177,18 +182,14 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
/* Not enough header? */
icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
if (icmph == NULL) {
- if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(net, PF_INET, 0, skb, NULL, NULL,
- NULL, "nf_ct_icmp: short packet ");
+ icmp_error_log(skb, net, pf, "short packet");
return -NF_ACCEPT;
}
/* See ip_conntrack_proto_tcp.c */
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_ip_checksum(skb, hooknum, dataoff, 0)) {
- if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: bad HW ICMP checksum ");
+ icmp_error_log(skb, net, pf, "bad hw icmp checksum");
return -NF_ACCEPT;
}
@@ -199,9 +200,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
* discarded.
*/
if (icmph->type > NR_ICMP_TYPES) {
- if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: invalid ICMP type ");
+ icmp_error_log(skb, net, pf, "invalid icmp type");
return -NF_ACCEPT;
}
@@ -259,9 +258,14 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[],
return 0;
}
-static int icmp_nlattr_tuple_size(void)
+static unsigned int icmp_nlattr_tuple_size(void)
{
- return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1);
+ static unsigned int size __read_mostly;
+
+ if (!size)
+ size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1);
+
+ return size;
}
#endif
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 127153f1ed8a..9f37c4727861 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -212,7 +212,6 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
- SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bc40bd411196..43b69af242e1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -495,7 +495,7 @@ u32 ip_idents_reserve(u32 hash, int segs)
{
u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
- u32 old = ACCESS_ONCE(*p_tstamp);
+ u32 old = READ_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
u32 new, delta = 0;
@@ -651,9 +651,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe;
struct rtable *rt;
+ u32 genid, hval;
unsigned int i;
int depth;
- u32 hval = fnhe_hashfun(daddr);
+
+ genid = fnhe_genid(dev_net(nh->nh_dev));
+ hval = fnhe_hashfun(daddr);
spin_lock_bh(&fnhe_lock);
@@ -676,12 +679,13 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
}
if (fnhe) {
+ if (fnhe->fnhe_genid != genid)
+ fnhe->fnhe_genid = genid;
if (gw)
fnhe->fnhe_gw = gw;
- if (pmtu) {
+ if (pmtu)
fnhe->fnhe_pmtu = pmtu;
- fnhe->fnhe_expires = max(1UL, expires);
- }
+ fnhe->fnhe_expires = max(1UL, expires);
/* Update all cached dsts too */
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt)
@@ -700,7 +704,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_next = hash->chain;
rcu_assign_pointer(hash->chain, fnhe);
}
- fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
+ fnhe->fnhe_genid = genid;
fnhe->fnhe_daddr = daddr;
fnhe->fnhe_gw = gw;
fnhe->fnhe_pmtu = pmtu;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a82b44038308..93e172118a94 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ struct net *net = container_of(ctl->data, struct net,
+ ipv4.tcp_congestion_control);
char val[TCP_CA_NAME_MAX];
struct ctl_table tbl = {
.data = val,
@@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
};
int ret;
- tcp_get_default_congestion_control(val);
+ tcp_get_default_congestion_control(net, val);
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (write && ret == 0)
- ret = tcp_set_default_congestion_control(val);
+ ret = tcp_set_default_congestion_control(net, val);
return ret;
}
@@ -441,34 +443,12 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_doulongvec_minmax,
},
{
- .procname = "tcp_wmem",
- .data = &sysctl_tcp_wmem,
- .maxlen = sizeof(sysctl_tcp_wmem),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- },
- {
- .procname = "tcp_rmem",
- .data = &sysctl_tcp_rmem,
- .maxlen = sizeof(sysctl_tcp_rmem),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- },
- {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
- {
- .procname = "tcp_congestion_control",
- .mode = 0644,
- .maxlen = TCP_CA_NAME_MAX,
- .proc_handler = proc_tcp_congestion_control,
- },
#ifdef CONFIG_NETLABEL
{
.procname = "cipso_cache_enable",
@@ -780,6 +760,13 @@ static struct ctl_table ipv4_net_table[] = {
},
#endif
{
+ .procname = "tcp_congestion_control",
+ .data = &init_net.ipv4.tcp_congestion_control,
+ .mode = 0644,
+ .maxlen = TCP_CA_NAME_MAX,
+ .proc_handler = proc_tcp_congestion_control,
+ },
+ {
.procname = "tcp_keepalive_time",
.data = &init_net.ipv4.sysctl_tcp_keepalive_time,
.maxlen = sizeof(int),
@@ -1164,6 +1151,22 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = &zero,
.extra2 = &thousand,
},
+ {
+ .procname = "tcp_wmem",
+ .data = &init_net.ipv4.sysctl_tcp_wmem,
+ .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+ {
+ .procname = "tcp_rmem",
+ .data = &init_net.ipv4.sysctl_tcp_rmem,
+ .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
{ }
};
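
proc_tcp_congestion_control() above recovers the owning struct net from ctl->data with container_of(), now that the sysctl lives in the per-namespace table and points at net->ipv4.tcp_congestion_control. A self-contained sketch of that pointer arithmetic, using heavily simplified stand-in types:

#include <stdio.h>
#include <stddef.h>

/* Simplified container_of(): step back from a member to its enclosing struct. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct netns_ipv4 { char tcp_congestion_control[16]; };
struct net        { int id; struct netns_ipv4 ipv4; };

int main(void)
{
	struct net init_net = { .id = 1 };
	void *data = &init_net.ipv4.tcp_congestion_control;  /* what ctl->data points at */

	struct net *net = container_of(data, struct net, ipv4.tcp_congestion_control);

	printf("recovered net id = %d\n", net->id);
	return 0;
}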
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c4cb19ed4628..bf97317e6c97 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -289,12 +289,7 @@ struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
-int sysctl_tcp_wmem[3] __read_mostly;
-int sysctl_tcp_rmem[3] __read_mostly;
-
EXPORT_SYMBOL(sysctl_tcp_mem);
-EXPORT_SYMBOL(sysctl_tcp_rmem);
-EXPORT_SYMBOL(sysctl_tcp_wmem);
atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
@@ -456,8 +451,8 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_sync_mss = tcp_sync_mss;
- sk->sk_sndbuf = sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
sk_sockets_allocated_inc(sk);
}
@@ -2514,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk,
return -EINVAL;
tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
- if (sock_net(sk)->ipv4.sysctl_tcp_fack)
- tcp_enable_fack(tp);
break;
case TCPOPT_TIMESTAMP:
if (opt.opt_val != 0)
@@ -2984,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out;
- info->tcpi_fackets = tp->fackets_out;
now = tcp_jiffies32;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
@@ -3636,13 +3628,13 @@ void __init tcp_init(void)
max_wshare = min(4UL*1024*1024, limit);
max_rshare = min(6UL*1024*1024, limit);
- sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
- sysctl_tcp_wmem[1] = 16*1024;
- sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+ init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+ init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
- sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
- sysctl_tcp_rmem[1] = 87380;
- sysctl_tcp_rmem[2] = max(87380, max_rshare);
+ init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
+ init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2f26124fd160..bc6c02f16243 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
}
/* Must be called with rcu lock held */
-static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
+ const char *name)
{
- const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+ struct tcp_congestion_ops *ca = tcp_ca_find(name);
+
#ifdef CONFIG_MODULES
if (!ca && capable(CAP_NET_ADMIN)) {
rcu_read_unlock();
@@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
-u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
+u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
u32 key = TCP_CA_UNSPEC;
@@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
might_sleep();
rcu_read_lock();
- ca = __tcp_ca_find_autoload(name);
+ ca = tcp_ca_find_autoload(net, name);
if (ca) {
key = ca->key;
*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
@@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
+ struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_congestion_ops *ca;
+ const struct tcp_congestion_ops *ca;
rcu_read_lock();
- list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
- if (likely(try_module_get(ca->owner))) {
- icsk->icsk_ca_ops = ca;
- goto out;
- }
- /* Fallback to next available. The last really
- * guaranteed fallback is Reno from this list.
- */
- }
-out:
+ ca = rcu_dereference(net->ipv4.tcp_congestion_control);
+ if (unlikely(!try_module_get(ca->owner)))
+ ca = &tcp_reno;
+ icsk->icsk_ca_ops = ca;
rcu_read_unlock();
- memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
INET_ECN_xmit(sk);
else
@@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk)
}
/* Used by sysctl to change default congestion control */
-int tcp_set_default_congestion_control(const char *name)
+int tcp_set_default_congestion_control(struct net *net, const char *name)
{
struct tcp_congestion_ops *ca;
- int ret = -ENOENT;
-
- spin_lock(&tcp_cong_list_lock);
- ca = tcp_ca_find(name);
-#ifdef CONFIG_MODULES
- if (!ca && capable(CAP_NET_ADMIN)) {
- spin_unlock(&tcp_cong_list_lock);
+ const struct tcp_congestion_ops *prev;
+ int ret;
- request_module("tcp_%s", name);
- spin_lock(&tcp_cong_list_lock);
- ca = tcp_ca_find(name);
- }
-#endif
+ rcu_read_lock();
+ ca = tcp_ca_find_autoload(net, name);
+ if (!ca) {
+ ret = -ENOENT;
+ } else if (!try_module_get(ca->owner)) {
+ ret = -EBUSY;
+ } else {
+ prev = xchg(&net->ipv4.tcp_congestion_control, ca);
+ if (prev)
+ module_put(prev->owner);
- if (ca) {
- ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */
- list_move(&ca->list, &tcp_cong_list);
+ ca->flags |= TCP_CONG_NON_RESTRICTED;
ret = 0;
}
- spin_unlock(&tcp_cong_list_lock);
+ rcu_read_unlock();
return ret;
}
@@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name)
/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
- return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
+ return tcp_set_default_congestion_control(&init_net,
+ CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);
@@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
}
/* Get current default congestion control */
-void tcp_get_default_congestion_control(char *name)
+void tcp_get_default_congestion_control(struct net *net, char *name)
{
- struct tcp_congestion_ops *ca;
- /* We will always have reno... */
- BUG_ON(list_empty(&tcp_cong_list));
+ const struct tcp_congestion_ops *ca;
rcu_read_lock();
- ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
+ ca = rcu_dereference(net->ipv4.tcp_congestion_control);
strncpy(name, ca->name, TCP_CA_NAME_MAX);
rcu_read_unlock();
}
@@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo
if (!load)
ca = tcp_ca_find(name);
else
- ca = __tcp_ca_find_autoload(name);
+ ca = tcp_ca_find_autoload(sock_net(sk), name);
+
/* No change asking for existing value */
if (ca == icsk->icsk_ca_ops) {
icsk->icsk_ca_setsockopt = 1;
goto out;
}
+
if (!ca) {
err = -ENOENT;
} else if (!load) {
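
tcp_set_default_congestion_control() above now swaps the per-namespace default with xchg() and drops the module reference held on the previous ops, instead of reordering a global list under tcp_cong_list_lock. A minimal userspace stand-in for that swap-and-release pattern (C11 atomics instead of xchg(), a plain counter instead of module refcounting; names are illustrative only):

#include <stdatomic.h>
#include <stdio.h>

struct ca_ops { const char *name; int refcnt; };

static struct ca_ops reno  = { "reno",  1 };
static struct ca_ops cubic = { "cubic", 0 };

static _Atomic(struct ca_ops *) default_ca = &reno;

static void set_default(struct ca_ops *ca)
{
	struct ca_ops *prev;

	ca->refcnt++;                            /* try_module_get() stand-in */
	prev = atomic_exchange(&default_ca, ca); /* xchg() stand-in           */
	if (prev)
		prev->refcnt--;                  /* module_put() stand-in     */
}

int main(void)
{
	set_default(&cubic);
	printf("default is now %s (reno refcnt %d)\n",
	       atomic_load(&default_ca)->name, reno.refcnt);
	return 0;
}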
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0ada8bfc2ebd..734cfc8ff76e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -100,7 +100,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
@@ -320,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
- sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+ sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -354,7 +354,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
/* Optimize this! */
int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
- int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1;
+ int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
@@ -409,7 +409,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
rcvmem <<= 2;
if (sk->sk_rcvbuf < rcvmem)
- sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
+ sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
}
/* 4. Try to fixup all. It is made immediately after connection enters
@@ -457,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct net *net = sock_net(sk);
icsk->icsk_ack.quick = 0;
- if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- sysctl_tcp_rmem[2]);
+ net->ipv4.sysctl_tcp_rmem[2]);
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -623,7 +624,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
rcvmem += 128;
- rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+ rcvbuf = min(rcvwin / tp->advmss * rcvmem,
+ sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
sk->sk_rcvbuf = rcvbuf;
@@ -793,12 +795,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
if (likely(tp->srtt_us))
do_div(rate, tp->srtt_us);
- /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+ /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
* without any lock. We want to make sure compiler wont store
* intermediate values in this location.
*/
- ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
- sk->sk_max_pacing_rate);
+ WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
+ sk->sk_max_pacing_rate));
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -840,18 +842,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
-/*
- * Packet counting of FACK is based on in-order assumptions, therefore TCP
- * disables it when reordering is detected
- */
-void tcp_disable_fack(struct tcp_sock *tp)
-{
- /* RFC3517 uses different metric in lost marker => reset on change */
- if (tcp_is_fack(tp))
- tp->lost_skb_hint = NULL;
- tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
-}
-
/* Take a notice that peer is sending D-SACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
@@ -859,42 +849,39 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
tp->rack.dsack_seen = 1;
}
-static void tcp_update_reordering(struct sock *sk, const int metric,
- const int ts)
+/* It's reordering when higher sequence was delivered (i.e. sacked) before
+ * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
+ * distance is approximated in full-mss packet distance ("reordering").
+ */
+static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
+ const int ts)
{
struct tcp_sock *tp = tcp_sk(sk);
- int mib_idx;
+ const u32 mss = tp->mss_cache;
+ u32 fack, metric;
- if (WARN_ON_ONCE(metric < 0))
+ fack = tcp_highest_sack_seq(tp);
+ if (!before(low_seq, fack))
return;
- if (metric > tp->reordering) {
- tp->reordering = min(sock_net(sk)->ipv4.sysctl_tcp_max_reordering, metric);
-
+ metric = fack - low_seq;
+ if ((metric > tp->reordering * mss) && mss) {
#if FASTRETRANS_DEBUG > 1
pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
tp->reordering,
- tp->fackets_out,
+ 0,
tp->sacked_out,
tp->undo_marker ? tp->undo_retrans : 0);
#endif
- tcp_disable_fack(tp);
+ tp->reordering = min_t(u32, (metric + mss - 1) / mss,
+ sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
}
tp->rack.reord = 1;
-
/* This exciting event is worth to be remembered. 8) */
- if (ts)
- mib_idx = LINUX_MIB_TCPTSREORDER;
- else if (tcp_is_reno(tp))
- mib_idx = LINUX_MIB_TCPRENOREORDER;
- else if (tcp_is_fack(tp))
- mib_idx = LINUX_MIB_TCPFACKREORDER;
- else
- mib_idx = LINUX_MIB_TCPSACKREORDER;
-
- NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk),
+ ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
/* This must be called before lost_out is incremented */
@@ -968,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
* 3. Loss detection event of two flavors:
* A. Scoreboard estimator decided the packet is lost.
* A'. Reno "three dupacks" marks head of queue lost.
- * A''. Its FACK modification, head until snd.fack is lost.
* B. SACK arrives sacking SND.NXT at the moment, when the
* segment was retransmitted.
* 4. D-SACK added new rule: D-SACK changes any tag to S.
@@ -1111,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
}
struct tcp_sacktag_state {
- int reord;
- int fack_count;
+ u32 reord;
/* Timestamps for earliest and latest never-retransmitted segment
* that was SACKed. RTO needs the earliest RTT to stay conservative,
* but congestion control should still get an accurate delay signal.
@@ -1188,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk,
u64 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
- int fack_count = state->fack_count;
/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
if (tp->undo_marker && tp->undo_retrans > 0 &&
after(end_seq, tp->undo_marker))
tp->undo_retrans--;
- if (sacked & TCPCB_SACKED_ACKED)
- state->reord = min(fack_count, state->reord);
+ if ((sacked & TCPCB_SACKED_ACKED) &&
+ before(start_seq, state->reord))
+ state->reord = start_seq;
}
/* Nothing to do; acked frame is about to be dropped (was ACKed). */
@@ -1222,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
* which was in hole. It is reordering.
*/
if (before(start_seq,
- tcp_highest_sack_seq(tp)))
- state->reord = min(fack_count,
- state->reord);
+ tcp_highest_sack_seq(tp)) &&
+ before(start_seq, state->reord))
+ state->reord = start_seq;
+
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
if (state->first_sackt == 0)
@@ -1243,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
tp->sacked_out += pcount;
tp->delivered += pcount; /* Out-of-order packets delivered */
- fack_count += pcount;
-
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
- if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
+ if (tp->lost_skb_hint &&
before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += pcount;
-
- if (fack_count > tp->fackets_out)
- tp->fackets_out = fack_count;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1498,7 +1479,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
}
out:
- state->fack_count += pcount;
return prev;
noop:
@@ -1577,8 +1557,6 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
tcp_highest_sack_seq(tp)))
tcp_advance_highest_sack(sk, skb);
}
-
- state->fack_count += tcp_skb_pcount(skb);
}
return skb;
}
@@ -1589,7 +1567,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
{
struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
struct sk_buff *skb;
- int unack_bytes;
while (*p) {
parent = *p;
@@ -1602,12 +1579,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
p = &parent->rb_right;
continue;
}
-
- state->fack_count = 0;
- unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una;
- if (state->mss_now && unack_bytes > 0)
- state->fack_count = unack_bytes / state->mss_now;
-
return skb;
}
return NULL;
@@ -1665,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
int first_sack_index;
state->flag = 0;
- state->reord = tp->packets_out;
+ state->reord = tp->snd_nxt;
- if (!tp->sacked_out) {
- if (WARN_ON(tp->fackets_out))
- tp->fackets_out = 0;
+ if (!tp->sacked_out)
tcp_highest_sack_reset(sk);
- }
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
@@ -1743,7 +1711,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
state->mss_now = tcp_current_mss(sk);
- state->fack_count = 0;
skb = NULL;
i = 0;
@@ -1801,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
skb = tcp_highest_sack(sk);
if (!skb)
break;
- state->fack_count = tp->fackets_out;
cache++;
goto walk;
}
@@ -1816,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
skb = tcp_highest_sack(sk);
if (!skb)
break;
- state->fack_count = tp->fackets_out;
}
skb = tcp_sacktag_skip(skb, sk, state, start_seq);
@@ -1836,9 +1801,8 @@ advance_sp:
for (j = 0; j < used_sacks; j++)
tp->recv_sack_cache[i++] = sp[j];
- if ((state->reord < tp->fackets_out) &&
- ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
- tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
+ tcp_check_sack_reordering(sk, state->reord, 0);
tcp_verify_left_out(tp);
out:
@@ -1876,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_limit_reno_sacked(tp))
- tcp_update_reordering(sk, tp->packets_out + addend, 0);
+
+ if (!tcp_limit_reno_sacked(tp))
+ return;
+
+ tp->reordering = min_t(u32, tp->packets_out + addend,
+ sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
@@ -1923,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp)
tp->lost_out = 0;
tp->undo_marker = 0;
tp->undo_retrans = -1;
- tp->fackets_out = 0;
tp->sacked_out = 0;
}
@@ -1973,7 +1941,6 @@ void tcp_enter_loss(struct sock *sk)
if (is_reneg) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
- tp->fackets_out = 0;
}
tcp_clear_all_retrans_hints(tp);
@@ -2040,19 +2007,10 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag)
return false;
}
-static inline int tcp_fackets_out(const struct tcp_sock *tp)
-{
- return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
-}
-
/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
* counter when SACK is enabled (without SACK, sacked_out is used for
* that purpose).
*
- * Instead, with FACK TCP uses fackets_out that includes both SACKed
- * segments up to the highest received SACK block so far and holes in
- * between them.
- *
* With reordering, holes may still be in flight, so RFC3517 recovery
* uses pure sacked_out (total number of SACKed segments) even though
* it violates the RFC that uses duplicate ACKs, often these are equal
@@ -2062,10 +2020,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp)
*/
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
- return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
+ return tp->sacked_out + 1;
}
-/* Linux NewReno/SACK/FACK/ECN state machine.
+/* Linux NewReno/SACK/ECN state machine.
* --------------------------------------
*
* "Open" Normal state, no dubious events, fast path.
@@ -2130,16 +2088,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
* dynamically measured and adjusted. This is implemented in
* tcp_rack_mark_lost.
*
- * FACK (Disabled by default. Subsumbed by RACK):
- * It is the simplest heuristics. As soon as we decided
- * that something is lost, we decide that _all_ not SACKed
- * packets until the most forward SACK are lost. I.e.
- * lost_out = fackets_out - sacked_out and left_out = fackets_out.
- * It is absolutely correct estimate, if network does not reorder
- * packets. And it loses any connection to reality when reordering
- * takes place. We use FACK by default until reordering
- * is suspected on the path to this destination.
- *
* If the receiver does not support SACK:
*
* NewReno (RFC6582): in Recovery we assume that one segment
@@ -2188,7 +2136,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
}
/* Detect loss in event "A" above by marking head of queue up as lost.
- * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * For non-SACK(Reno) senders, the first "packets" number of segments
* are considered lost. For RFC3517 SACK, a segment is considered lost if it
* has at least tp->reordering SACKed seqments above it; "packets" refers to
* the maximum SACKed segments to pass before reaching this limit.
@@ -2224,12 +2172,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
break;
oldcnt = cnt;
- if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
+ if (tcp_is_reno(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
cnt += tcp_skb_pcount(skb);
if (cnt > packets) {
- if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ if (tcp_is_sack(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
(oldcnt >= packets))
break;
@@ -2260,11 +2208,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
if (tcp_is_reno(tp)) {
tcp_mark_head_lost(sk, 1, 1);
- } else if (tcp_is_fack(tp)) {
- int lost = tp->fackets_out - tp->reordering;
- if (lost <= 0)
- lost = 1;
- tcp_mark_head_lost(sk, lost, 0);
} else {
int sacked_upto = tp->sacked_out - tp->reordering;
if (sacked_upto >= 0)
@@ -2611,7 +2554,6 @@ void tcp_simple_retransmit(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int mss = tcp_current_mss(sk);
- u32 prior_lost = tp->lost_out;
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
if (tcp_skb_seglen(skb) > mss &&
@@ -2626,7 +2568,7 @@ void tcp_simple_retransmit(struct sock *sk)
tcp_clear_retrans_hints_partial(tp);
- if (prior_lost == tp->lost_out)
+ if (!tp->lost_out)
return;
if (tcp_is_reno(tp))
@@ -2734,15 +2676,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
}
/* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, const int acked)
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->undo_marker && tcp_packet_delayed(tp)) {
/* Plain luck! Hole if filled with delayed
- * packet, rather than with a retransmit.
+ * packet, rather than with a retransmit. Check reordering.
*/
- tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+ tcp_check_sack_reordering(sk, prior_snd_una, 1);
/* We are getting evidence that the reordering degree is higher
* than we realized. If there are no retransmits out then we
@@ -2778,6 +2720,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
}
}
+static bool tcp_force_fast_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return after(tcp_highest_sack_seq(tp),
+ tp->snd_una + tp->reordering * tp->mss_cache);
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2790,19 +2740,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
-static void tcp_fastretrans_alert(struct sock *sk, const int acked,
+static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
bool is_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int fast_rexmit = 0, flag = *ack_flag;
bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
- (tcp_fackets_out(tp) > tp->reordering));
+ tcp_force_fast_retransmit(sk));
if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
- if (!tp->sacked_out && tp->fackets_out)
- tp->fackets_out = 0;
/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
@@ -2849,11 +2797,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
- if (tcp_try_undo_partial(sk, acked))
+ if (tcp_try_undo_partial(sk, prior_snd_una))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
- tcp_fackets_out(tp) > tp->reordering;
+ tcp_force_fast_retransmit(sk);
}
if (tcp_try_undo_dsack(sk)) {
tcp_try_keep_open(sk);
@@ -3016,7 +2964,7 @@ void tcp_rearm_rto(struct sock *sk)
/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
static void tcp_set_xmit_timer(struct sock *sk)
{
- if (!tcp_schedule_loss_probe(sk))
+ if (!tcp_schedule_loss_probe(sk, true))
tcp_rearm_rto(sk);
}
@@ -3063,15 +3011,15 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
-static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una, int *acked,
+static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
+ u32 prior_snd_una,
struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
u64 first_ackt, last_ackt;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
- u32 reord = tp->packets_out;
+ u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
struct sk_buff *skb, *next;
bool fully_acked = true;
long sack_rtt_us = -1L;
@@ -3086,6 +3034,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ const u32 start_seq = scb->seq;
u8 sacked = scb->sacked;
u32 acked_pcount;
@@ -3116,7 +3065,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
first_ackt = last_ackt;
last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
- reord = min(pkts_acked, reord);
+ if (before(start_seq, reord))
+ reord = start_seq;
if (!after(scb->end_seq, tp->high_seq))
flag |= FLAG_ORIG_SACK_ACKED;
}
@@ -3194,16 +3144,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
int delta;
/* Non-retransmitted hole got filled? That's reordering */
- if (reord < prior_fackets && reord <= tp->fackets_out)
- tcp_update_reordering(sk, tp->fackets_out - reord, 0);
+ if (before(reord, prior_fack))
+ tcp_check_sack_reordering(sk, reord, 0);
- delta = tcp_is_fack(tp) ? pkts_acked :
- prior_sacked - tp->sacked_out;
+ delta = prior_sacked - tp->sacked_out;
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
-
- tp->fackets_out -= min(pkts_acked, tp->fackets_out);
-
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
@@ -3244,7 +3190,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
}
}
#endif
- *acked = pkts_acked;
return flag;
}
@@ -3553,12 +3498,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
- u32 prior_fackets;
int prior_packets = tp->packets_out;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
- int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 prior_fack;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
@@ -3590,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
icsk->icsk_retransmits = 0;
}
- prior_fackets = tp->fackets_out;
+ prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
@@ -3646,8 +3590,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state);
+ flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
tcp_rack_update_reo_wnd(sk, &rs);
@@ -3659,7 +3602,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ &rexmit);
}
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
@@ -3675,7 +3619,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ &rexmit);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3697,7 +3642,8 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ &rexmit);
tcp_xmit_recovery(sk, rexmit);
}
@@ -5706,9 +5652,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tp->tcp_header_len = sizeof(struct tcphdr);
}
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack)
- tcp_enable_fack(tp);
-
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@@ -6187,7 +6130,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
if (req) {
struct inet_request_sock *ireq = inet_rsk(req);
- kmemcheck_annotate_bitfield(ireq, flags);
ireq->ireq_opt = NULL;
#if IS_ENABLED(CONFIG_IPV6)
ireq->pktopts = NULL;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0162c577bb9c..c6bc0c4d19c6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2409,8 +2409,8 @@ struct proto tcp_prot = {
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
- .sysctl_wmem = sysctl_tcp_wmem,
- .sysctl_rmem = sysctl_tcp_rmem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net)
{
int cpu;
+ module_put(net->ipv4.tcp_congestion_control->owner);
+
for_each_possible_cpu(cpu)
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
free_percpu(net->ipv4.tcp_sk);
@@ -2509,12 +2511,26 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
-
+ if (net != &init_net) {
+ memcpy(net->ipv4.sysctl_tcp_rmem,
+ init_net.ipv4.sysctl_tcp_rmem,
+ sizeof(init_net.ipv4.sysctl_tcp_rmem));
+ memcpy(net->ipv4.sysctl_tcp_wmem,
+ init_net.ipv4.sysctl_tcp_wmem,
+ sizeof(init_net.ipv4.sysctl_tcp_wmem));
+ }
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+ /* Reno is always built in */
+ if (!net_eq(net, &init_net) &&
+ try_module_get(init_net.ipv4.tcp_congestion_control->owner))
+ net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
+ else
+ net->ipv4.tcp_congestion_control = &tcp_reno;
+
return 0;
fail:
tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 9d5ddebfd831..7097f92d16e5 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -470,10 +470,8 @@ void tcp_init_metrics(struct sock *sk)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
}
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
- if (val && tp->reordering != val) {
- tcp_disable_fack(tp);
+ if (val && tp->reordering != val)
tp->reordering = val;
- }
crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
rcu_read_unlock();
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4bb86580decd..e36eff0403f4 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -475,7 +475,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->packets_out = 0;
newtp->retrans_out = 0;
newtp->sacked_out = 0;
- newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
newtp->tlp_high_seq = 0;
newtp->lsndtime = tcp_jiffies32;
@@ -509,10 +508,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
keepalive_time_when(newtp));
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
- if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
- if (sock_net(sk)->ipv4.sysctl_tcp_fack)
- tcp_enable_fack(newtp);
- }
+ newtp->rx_opt.sack_ok = ireq->sack_ok;
newtp->window_clamp = req->rsk_window_clamp;
newtp->rcv_ssthresh = req->rsk_rcv_wnd;
newtp->rcv_wnd = req->rsk_rcv_wnd;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 11f69bbf9307..b6a2aa1dcf56 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -149,11 +149,19 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
* is freed by GSO engine
*/
if (copy_destructor) {
+ int delta;
+
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
sum_truesize += skb->truesize;
- refcount_add(sum_truesize - gso_skb->truesize,
- &skb->sk->sk_wmem_alloc);
+ delta = sum_truesize - gso_skb->truesize;
+ /* In some pathological cases, delta can be negative.
+ * We need to either use refcount_add() or refcount_sub_and_test()
+ */
+ if (likely(delta >= 0))
+ refcount_add(delta, &skb->sk->sk_wmem_alloc);
+ else
+ WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc));
}
delta = htonl(oldlen + (skb_tail_pointer(skb) -
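
The tcp_offload.c hunk above splits the truesize adjustment by sign because, in pathological cases, the summed truesize of the GSO segments can be smaller than the original skb's, and refcount_add() cannot take a negative value. A toy userspace stand-in for that branch (a plain integer instead of a refcount_t):

#include <assert.h>
#include <stdio.h>

static unsigned int wmem_alloc = 1000;  /* stands in for sk->sk_wmem_alloc */

static void adjust_wmem(int delta)
{
	if (delta >= 0) {
		wmem_alloc += delta;                        /* refcount_add() path          */
	} else {
		assert(wmem_alloc > (unsigned int)-delta);  /* must not drop to zero here   */
		wmem_alloc -= (unsigned int)-delta;         /* refcount_sub_and_test() path */
	}
}

int main(void)
{
	adjust_wmem(300);   /* common case: segments grew total truesize  */
	adjust_wmem(-150);  /* pathological case: total truesize shrank   */
	printf("wmem_alloc = %u\n", wmem_alloc);
	return 0;
}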
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a9d917e4dad5..a4d214c7b506 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -220,7 +220,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
(*rcv_wscale) = 0;
if (wscale_ok) {
/* Set window scaling on max possible window */
- space = max_t(u32, space, sysctl_tcp_rmem[2]);
+ space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
@@ -1218,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
}
}
-/* When a modification to fackets out becomes necessary, we need to check
- * skb is counted to fackets_out or not.
- */
-static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
- int decr)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (!tp->sacked_out || tcp_is_reno(tp))
- return;
-
- if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
- tp->fackets_out -= decr;
-}
-
/* Pcount in the middle of the write queue got changed, we need to do various
* tweaks to fix counters
*/
@@ -1253,11 +1238,9 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
if (tcp_is_reno(tp) && decr > 0)
tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
- tcp_adjust_fackets_out(sk, skb, decr);
-
if (tp->lost_skb_hint &&
before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
- (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
tp->lost_cnt_hint -= decr;
tcp_verify_left_out(tp);
@@ -1737,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
{
u32 bytes, segs;
- bytes = min(sk->sk_pacing_rate >> 10,
+ bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
/* Goal is to send at least one packet per ms,
@@ -1978,7 +1961,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
goto send_now;
- win_divisor = ACCESS_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
+ win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
if (win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -2215,7 +2198,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
{
unsigned int limit;
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
limit = min_t(u32, limit,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
@@ -2408,7 +2391,7 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
- tcp_schedule_loss_probe(sk);
+ tcp_schedule_loss_probe(sk, false);
is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
@@ -2416,7 +2399,7 @@ repair:
return !tp->packets_out && !tcp_write_queue_empty(sk);
}
-bool tcp_schedule_loss_probe(struct sock *sk)
+bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2457,7 +2440,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
}
/* If the RTO formula yields an earlier time, then use that time. */
- rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */
+ rto_delta_us = advancing_rto ?
+ jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
+ tcp_rto_delta_us(sk); /* How far in future is RTO? */
if (rto_delta_us > 0)
timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
@@ -2961,9 +2946,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
* retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either
* we've sent it all or the congestion window limit is reached.
- * If doing SACK, the first ACK which comes back for a timeout
- * based retransmit packet might feed us FACK information again.
- * If so, we use it to avoid unnecessarily retransmissions.
*/
void tcp_xmit_retransmit_queue(struct sock *sk)
{
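
tcp_tso_autosize() and tcp_small_queue_check() above replace the hard-coded '>> 10' with sk->sk_pacing_shift, making the per-burst budget tunable. Quick arithmetic with a made-up pacing rate shows what the default shift of 10 means — roughly one millisecond's worth of data per burst:

#include <stdio.h>

int main(void)
{
	unsigned long long pacing_rate  = 1250000000ULL;  /* ~10 Gbit/s in bytes/sec */
	unsigned int       pacing_shift = 10;             /* default sk_pacing_shift */
	unsigned long long burst        = pacing_rate >> pacing_shift;

	printf("per-burst budget: %llu bytes (~%llu usec of data)\n",
	       burst, burst * 1000000ULL / pacing_rate);
	return 0;
}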
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a6699af05539..e4ff25c947c5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1852,7 +1852,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
*/
/* if we're overly short, let UDP handle it */
- encap_rcv = ACCESS_ONCE(up->encap_rcv);
+ encap_rcv = READ_ONCE(up->encap_rcv);
if (encap_rcv) {
int ret;
@@ -2297,7 +2297,7 @@ void udp_destroy_sock(struct sock *sk)
unlock_sock_fast(sk, slow);
if (static_key_false(&udp_encap_needed) && up->encap_type) {
void (*encap_destroy)(struct sock *sk);
- encap_destroy = ACCESS_ONCE(up->encap_destroy);
+ encap_destroy = READ_ONCE(up->encap_destroy);
if (encap_destroy)
encap_destroy(sk);
}