Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 233
-rw-r--r--  net/ipv4/Makefile | 4
-rw-r--r--  net/ipv4/af_inet.c | 9
-rw-r--r--  net/ipv4/bpf_tcp_ca.c | 252
-rw-r--r--  net/ipv4/datagram.c | 4
-rw-r--r--  net/ipv4/devinet.c | 5
-rw-r--r--  net/ipv4/esp4.c | 264
-rw-r--r--  net/ipv4/esp4_offload.c | 2
-rw-r--r--  net/ipv4/fib_frontend.c | 15
-rw-r--r--  net/ipv4/fib_lookup.h | 8
-rw-r--r--  net/ipv4/fib_notifier.c | 13
-rw-r--r--  net/ipv4/fib_rules.c | 5
-rw-r--r--  net/ipv4/fib_semantics.c | 50
-rw-r--r--  net/ipv4/fib_trie.c | 242
-rw-r--r--  net/ipv4/fou.c | 4
-rw-r--r--  net/ipv4/gre_demux.c | 2
-rw-r--r--  net/ipv4/gre_offload.c | 2
-rw-r--r--  net/ipv4/icmp.c | 57
-rw-r--r--  net/ipv4/igmp.c | 6
-rw-r--r--  net/ipv4/inet_connection_sock.c | 34
-rw-r--r--  net/ipv4/inet_diag.c | 20
-rw-r--r--  net/ipv4/inet_hashtables.c | 18
-rw-r--r--  net/ipv4/inetpeer.c | 12
-rw-r--r--  net/ipv4/ip_forward.c | 2
-rw-r--r--  net/ipv4/ip_gre.c | 15
-rw-r--r--  net/ipv4/ip_input.c | 40
-rw-r--r--  net/ipv4/ip_output.c | 45
-rw-r--r--  net/ipv4/ip_tunnel.c | 6
-rw-r--r--  net/ipv4/ip_tunnel_core.c | 440
-rw-r--r--  net/ipv4/ip_vti.c | 19
-rw-r--r--  net/ipv4/ipconfig.c | 23
-rw-r--r--  net/ipv4/ipmr.c | 24
-rw-r--r--  net/ipv4/ipmr_base.c | 30
-rw-r--r--  net/ipv4/netfilter/Kconfig | 8
-rw-r--r--  net/ipv4/netfilter/Makefile | 2
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 46
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 16
-rw-r--r--  net/ipv4/netfilter/ipt_SYNPROXY.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_dup_ipv4.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_flow_table_ipv4.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_socket_ipv4.c | 10
-rw-r--r--  net/ipv4/nexthop.c | 7
-rw-r--r--  net/ipv4/ping.c | 2
-rw-r--r--  net/ipv4/proc.c | 2
-rw-r--r--  net/ipv4/raw.c | 6
-rw-r--r--  net/ipv4/route.c | 181
-rw-r--r--  net/ipv4/syncookies.c | 8
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 24
-rw-r--r--  net/ipv4/tcp.c | 229
-rw-r--r--  net/ipv4/tcp_bbr.c | 20
-rw-r--r--  net/ipv4/tcp_bpf.c | 19
-rw-r--r--  net/ipv4/tcp_cong.c | 22
-rw-r--r--  net/ipv4/tcp_cubic.c | 83
-rw-r--r--  net/ipv4/tcp_diag.c | 61
-rw-r--r--  net/ipv4/tcp_fastopen.c | 7
-rw-r--r--  net/ipv4/tcp_input.c | 185
-rw-r--r--  net/ipv4/tcp_ipv4.c | 208
-rw-r--r--  net/ipv4/tcp_metrics.c | 13
-rw-r--r--  net/ipv4/tcp_minisocks.c | 22
-rw-r--r--  net/ipv4/tcp_output.c | 143
-rw-r--r--  net/ipv4/tcp_timer.c | 43
-rw-r--r--  net/ipv4/tcp_ulp.c | 9
-rw-r--r--  net/ipv4/udp.c | 91
-rw-r--r--  net/ipv4/udp_offload.c | 106
-rw-r--r--  net/ipv4/xfrm4_output.c | 2
-rw-r--r--  net/ipv4/xfrm4_policy.c | 6
-rw-r--r--  net/ipv4/xfrm4_protocol.c | 9
67 files changed, 2649 insertions, 854 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 974de4d20f25..f96bd489b362 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -180,8 +180,8 @@ config NET_IPIP
config NET_IPGRE_DEMUX
tristate "IP: GRE demultiplexer"
help
- This is helper module to demultiplex GRE packets on GRE version field criteria.
- Required by ip_gre and pptp modules.
+ This is helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
config NET_IP_TUNNEL
tristate
@@ -378,6 +378,17 @@ config INET_ESP_OFFLOAD
If unsure, say N.
+config INET_ESPINTCP
+ bool "IP: ESP in TCP encapsulation (RFC 8229)"
+ depends on XFRM && INET_ESP
+ select STREAM_PARSER
+ select NET_SOCK_MSG
+ help
+ Support for RFC 8229 encapsulation of ESP and IKE over
+ TCP/IPv4 sockets.
+
+ If unsure, say N.
+
config INET_IPCOMP
tristate "IP: IPComp transformation"
select INET_XFRM_TUNNEL
@@ -459,200 +470,200 @@ config TCP_CONG_BIC
tristate "Binary Increase Congestion (BIC) control"
default m
---help---
- BIC-TCP is a sender-side only change that ensures a linear RTT
- fairness under large windows while offering both scalability and
- bounded TCP-friendliness. The protocol combines two schemes
- called additive increase and binary search increase. When the
- congestion window is large, additive increase with a large
- increment ensures linear RTT fairness as well as good
- scalability. Under small congestion windows, binary search
- increase provides TCP friendliness.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
config TCP_CONG_CUBIC
tristate "CUBIC TCP"
default y
---help---
- This is version 2.0 of BIC-TCP which uses a cubic growth function
- among other techniques.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
config TCP_CONG_WESTWOOD
tristate "TCP Westwood+"
default m
---help---
- TCP Westwood+ is a sender-side only modification of the TCP Reno
- protocol stack that optimizes the performance of TCP congestion
- control. It is based on end-to-end bandwidth estimation to set
- congestion window and slow start threshold after a congestion
- episode. Using this estimation, TCP Westwood+ adaptively sets a
- slow start threshold and a congestion window which takes into
- account the bandwidth used at the time congestion is experienced.
- TCP Westwood+ significantly increases fairness wrt TCP Reno in
- wired networks and throughput over wireless links.
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
config TCP_CONG_HTCP
- tristate "H-TCP"
- default m
+ tristate "H-TCP"
+ default m
---help---
- H-TCP is a send-side only modifications of the TCP Reno
- protocol stack that optimizes the performance of TCP
- congestion control for high speed network links. It uses a
- modeswitch to change the alpha and beta parameters of TCP Reno
- based on network conditions and in a way so as to be fair with
- other Reno and H-TCP flows.
+ H-TCP is a send-side only modifications of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
config TCP_CONG_HSTCP
tristate "High Speed TCP"
default n
---help---
- Sally Floyd's High Speed TCP (RFC 3649) congestion control.
- A modification to TCP's congestion control mechanism for use
- with large congestion windows. A table indicates how much to
- increase the congestion window by when an ACK is received.
- For more detail see http://www.icir.org/floyd/hstcp.html
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see http://www.icir.org/floyd/hstcp.html
config TCP_CONG_HYBLA
tristate "TCP-Hybla congestion control algorithm"
default n
---help---
- TCP-Hybla is a sender-side only change that eliminates penalization of
- long-RTT, large-bandwidth connections, like when satellite legs are
- involved, especially when sharing a common bottleneck with normal
- terrestrial connections.
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
config TCP_CONG_VEGAS
tristate "TCP Vegas"
default n
---help---
- TCP Vegas is a sender-side only change to TCP that anticipates
- the onset of congestion by estimating the bandwidth. TCP Vegas
- adjusts the sending rate by modifying the congestion
- window. TCP Vegas should provide less packet loss, but it is
- not as aggressive as TCP Reno.
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
config TCP_CONG_NV
- tristate "TCP NV"
- default n
- ---help---
- TCP NV is a follow up to TCP Vegas. It has been modified to deal with
- 10G networks, measurement noise introduced by LRO, GRO and interrupt
- coalescence. In addition, it will decrease its cwnd multiplicatively
- instead of linearly.
+ tristate "TCP NV"
+ default n
+ ---help---
+ TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+ 10G networks, measurement noise introduced by LRO, GRO and interrupt
+ coalescence. In addition, it will decrease its cwnd multiplicatively
+ instead of linearly.
- Note that in general congestion avoidance (cwnd decreased when # packets
- queued grows) cannot coexist with congestion control (cwnd decreased only
- when there is packet loss) due to fairness issues. One scenario when they
- can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+ Note that in general congestion avoidance (cwnd decreased when # packets
+ queued grows) cannot coexist with congestion control (cwnd decreased only
+ when there is packet loss) due to fairness issues. One scenario when they
+ can coexist safely is when the CA flows have RTTs << CC flows RTTs.
- For further details see http://www.brakmo.org/networking/tcp-nv/
+ For further details see http://www.brakmo.org/networking/tcp-nv/
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
default n
---help---
- Scalable TCP is a sender-side only change to TCP which uses a
- MIMD congestion control algorithm which has some nice scaling
- properties, though is known to have fairness issues.
- See http://www.deneholme.net/tom/scalable/
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www.deneholme.net/tom/scalable/
config TCP_CONG_LP
tristate "TCP Low Priority"
default n
---help---
- TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
- to utilize only the excess network bandwidth as compared to the
- ``fair share`` of bandwidth as targeted by TCP.
- See http://www-ece.rice.edu/networks/TCP-LP/
+ TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+ to utilize only the excess network bandwidth as compared to the
+ ``fair share`` of bandwidth as targeted by TCP.
+ See http://www-ece.rice.edu/networks/TCP-LP/
config TCP_CONG_VENO
tristate "TCP Veno"
default n
---help---
- TCP Veno is a sender-side only enhancement of TCP to obtain better
- throughput over wireless networks. TCP Veno makes use of state
- distinguishing to circumvent the difficult judgment of the packet loss
- type. TCP Veno cuts down less congestion window in response to random
- loss packets.
- See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+ TCP Veno is a sender-side only enhancement of TCP to obtain better
+ throughput over wireless networks. TCP Veno makes use of state
+ distinguishing to circumvent the difficult judgment of the packet loss
+ type. TCP Veno cuts down less congestion window in response to random
+ loss packets.
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
config TCP_CONG_YEAH
tristate "YeAH TCP"
select TCP_CONG_VEGAS
default n
---help---
- YeAH-TCP is a sender-side high-speed enabled TCP congestion control
- algorithm, which uses a mixed loss/delay approach to compute the
- congestion window. It's design goals target high efficiency,
- internal, RTT and Reno fairness, resilience to link loss while
- keeping network elements load as low as possible.
+ YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+ algorithm, which uses a mixed loss/delay approach to compute the
+ congestion window. It's design goals target high efficiency,
+ internal, RTT and Reno fairness, resilience to link loss while
+ keeping network elements load as low as possible.
- For further details look here:
- http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ For further details look here:
+ http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
config TCP_CONG_ILLINOIS
tristate "TCP Illinois"
default n
---help---
- TCP-Illinois is a sender-side modification of TCP Reno for
- high speed long delay links. It uses round-trip-time to
- adjust the alpha and beta parameters to achieve a higher average
- throughput and maintain fairness.
+ TCP-Illinois is a sender-side modification of TCP Reno for
+ high speed long delay links. It uses round-trip-time to
+ adjust the alpha and beta parameters to achieve a higher average
+ throughput and maintain fairness.
- For further details see:
- http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+ For further details see:
+ http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
config TCP_CONG_DCTCP
tristate "DataCenter TCP (DCTCP)"
default n
---help---
- DCTCP leverages Explicit Congestion Notification (ECN) in the network to
- provide multi-bit feedback to the end hosts. It is designed to provide:
+ DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+ provide multi-bit feedback to the end hosts. It is designed to provide:
- - High burst tolerance (incast due to partition/aggregate),
- - Low latency (short flows, queries),
- - High throughput (continuous data updates, large file transfers) with
- commodity, shallow-buffered switches.
+ - High burst tolerance (incast due to partition/aggregate),
+ - Low latency (short flows, queries),
+ - High throughput (continuous data updates, large file transfers) with
+ commodity, shallow-buffered switches.
- All switches in the data center network running DCTCP must support
- ECN marking and be configured for marking when reaching defined switch
- buffer thresholds. The default ECN marking threshold heuristic for
- DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
- (~100KB) at 10Gbps, but might need further careful tweaking.
+ All switches in the data center network running DCTCP must support
+ ECN marking and be configured for marking when reaching defined switch
+ buffer thresholds. The default ECN marking threshold heuristic for
+ DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+ (~100KB) at 10Gbps, but might need further careful tweaking.
- For further details see:
- http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ For further details see:
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
config TCP_CONG_CDG
tristate "CAIA Delay-Gradient (CDG)"
default n
---help---
- CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
- the TCP sender in order to:
+ CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+ the TCP sender in order to:
o Use the delay gradient as a congestion signal.
o Back off with an average probability that is independent of the RTT.
o Coexist with flows that use loss-based congestion control.
o Tolerate packet loss unrelated to congestion.
- For further details see:
- D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
- delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+ For further details see:
+ D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
config TCP_CONG_BBR
tristate "BBR TCP"
default n
---help---
- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
- maximize network utilization and minimize queues. It builds an explicit
- model of the the bottleneck delivery rate and path round-trip
- propagation delay. It tolerates packet loss and delay unrelated to
- congestion. It can operate over LAN, WAN, cellular, wifi, or cable
- modem links. It can coexist with flows that use loss-based congestion
- control, and can operate with shallow buffers, deep buffers,
- bufferbloat, policers, or AQM schemes that do not provide a delay
- signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the the bottleneck delivery rate and path round-trip
+ propagation delay. It tolerates packet loss and delay unrelated to
+ congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+ modem links. It can coexist with flows that use loss-based congestion
+ control, and can operate with shallow buffers, deep buffers,
+ bufferbloat, policers, or AQM schemes that do not provide a delay
+ signal. It requires the fq ("Fair Queue") pacing packet scheduler.
choice
prompt "Default TCP congestion control"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index d57ecfaf89d4..9d97bace13c8 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o
+
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
+endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ed2301ef872e..2fe295432c24 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -208,7 +208,7 @@ int inet_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
- sk->sk_max_ack_backlog = backlog;
+ WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -495,7 +495,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < inet_prot_sock(net) &&
+ if (snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
@@ -1845,13 +1845,8 @@ static __net_init int inet_init_net(struct net *net)
return 0;
}
-static __net_exit void inet_exit_net(struct net *net)
-{
-}
-
static __net_initdata struct pernet_operations af_inet_ops = {
.init = inet_init_net,
- .exit = inet_exit_net,
};
static int __init init_inet_pernet_ops(void)
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
new file mode 100644
index 000000000000..574972bc7299
--- /dev/null
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <net/tcp.h>
+
+static u32 optional_ops[] = {
+ offsetof(struct tcp_congestion_ops, init),
+ offsetof(struct tcp_congestion_ops, release),
+ offsetof(struct tcp_congestion_ops, set_state),
+ offsetof(struct tcp_congestion_ops, cwnd_event),
+ offsetof(struct tcp_congestion_ops, in_ack_event),
+ offsetof(struct tcp_congestion_ops, pkts_acked),
+ offsetof(struct tcp_congestion_ops, min_tso_segs),
+ offsetof(struct tcp_congestion_ops, sndbuf_expand),
+ offsetof(struct tcp_congestion_ops, cong_control),
+};
+
+static u32 unsupported_ops[] = {
+ offsetof(struct tcp_congestion_ops, get_info),
+};
+
+static const struct btf_type *tcp_sock_type;
+static u32 tcp_sock_id, sock_id;
+
+static int bpf_tcp_ca_init(struct btf *btf)
+{
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ sock_id = type_id;
+
+ type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ tcp_sock_id = type_id;
+ tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
+
+ return 0;
+}
+
+static bool is_optional(u32 member_offset)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(optional_ops); i++) {
+ if (member_offset == optional_ops[i])
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_unsupported(u32 member_offset)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) {
+ if (member_offset == unsupported_ops[i])
+ return true;
+ }
+
+ return false;
+}
+
+extern struct btf *btf_vmlinux;
+
+static bool bpf_tcp_ca_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
+ return false;
+ if (type != BPF_READ)
+ return false;
+ if (off % size != 0)
+ return false;
+
+ if (!btf_ctx_access(off, size, type, prog, info))
+ return false;
+
+ if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id)
+ /* promote it to tcp_sock */
+ info->btf_id = tcp_sock_id;
+
+ return true;
+}
+
+static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf_type *t, int off,
+ int size, enum bpf_access_type atype,
+ u32 *next_btf_id)
+{
+ size_t end;
+
+ if (atype == BPF_READ)
+ return btf_struct_access(log, t, off, size, atype, next_btf_id);
+
+ if (t != tcp_sock_type) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv):
+ end = offsetofend(struct inet_connection_sock, icsk_ca_priv);
+ break;
+ case offsetof(struct inet_connection_sock, icsk_ack.pending):
+ end = offsetofend(struct inet_connection_sock,
+ icsk_ack.pending);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd):
+ end = offsetofend(struct tcp_sock, snd_cwnd);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd_cnt):
+ end = offsetofend(struct tcp_sock, snd_cwnd_cnt);
+ break;
+ case offsetof(struct tcp_sock, snd_ssthresh):
+ end = offsetofend(struct tcp_sock, snd_ssthresh);
+ break;
+ case offsetof(struct tcp_sock, ecn_flags):
+ end = offsetofend(struct tcp_sock, ecn_flags);
+ break;
+ default:
+ bpf_log(log, "no write support to tcp_sock at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return NOT_INIT;
+}
+
+BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
+{
+ /* bpf_tcp_ca prog cannot have NULL tp */
+ __tcp_send_ack((struct sock *)tp, rcv_nxt);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
+ .func = bpf_tcp_send_ack,
+ .gpl_only = false,
+ /* In case we want to report error later */
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_ANYTHING,
+ .btf_id = &tcp_sock_id,
+};
+
+static const struct bpf_func_proto *
+bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_tcp_send_ack:
+ return &bpf_tcp_send_ack_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
+ .get_func_proto = bpf_tcp_ca_get_func_proto,
+ .is_valid_access = bpf_tcp_ca_is_valid_access,
+ .btf_struct_access = bpf_tcp_ca_btf_struct_access,
+};
+
+static int bpf_tcp_ca_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct tcp_congestion_ops *utcp_ca;
+ struct tcp_congestion_ops *tcp_ca;
+ size_t tcp_ca_name_len;
+ int prog_fd;
+ u32 moff;
+
+ utcp_ca = (const struct tcp_congestion_ops *)udata;
+ tcp_ca = (struct tcp_congestion_ops *)kdata;
+
+ moff = btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct tcp_congestion_ops, flags):
+ if (utcp_ca->flags & ~TCP_CONG_MASK)
+ return -EINVAL;
+ tcp_ca->flags = utcp_ca->flags;
+ return 1;
+ case offsetof(struct tcp_congestion_ops, name):
+ tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name));
+ if (!tcp_ca_name_len ||
+ tcp_ca_name_len == sizeof(utcp_ca->name))
+ return -EINVAL;
+ if (tcp_ca_find(utcp_ca->name))
+ return -EEXIST;
+ memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name));
+ return 1;
+ }
+
+ if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL))
+ return 0;
+
+ /* Ensure bpf_prog is provided for compulsory func ptr */
+ prog_fd = (int)(*(unsigned long *)(udata + moff));
+ if (!prog_fd && !is_optional(moff) && !is_unsupported(moff))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int bpf_tcp_ca_check_member(const struct btf_type *t,
+ const struct btf_member *member)
+{
+ if (is_unsupported(btf_member_bit_offset(t, member) / 8))
+ return -ENOTSUPP;
+ return 0;
+}
+
+static int bpf_tcp_ca_reg(void *kdata)
+{
+ return tcp_register_congestion_control(kdata);
+}
+
+static void bpf_tcp_ca_unreg(void *kdata)
+{
+ tcp_unregister_congestion_control(kdata);
+}
+
+/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */
+extern struct bpf_struct_ops bpf_tcp_congestion_ops;
+
+struct bpf_struct_ops bpf_tcp_congestion_ops = {
+ .verifier_ops = &bpf_tcp_ca_verifier_ops,
+ .reg = bpf_tcp_ca_reg,
+ .unreg = bpf_tcp_ca_unreg,
+ .check_member = bpf_tcp_ca_check_member,
+ .init_member = bpf_tcp_ca_init_member,
+ .init = bpf_tcp_ca_init,
+ .name = "tcp_congestion_ops",
+};
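[Annotation, not part of the patch] The new bpf_tcp_ca.c wires struct tcp_congestion_ops into BPF struct_ops: the verifier callbacks above promote the sock argument to tcp_sock, allow writes only to a small allowlist of fields (icsk_ca_priv, icsk_ack.pending, snd_cwnd, snd_cwnd_cnt, snd_ssthresh, ecn_flags), and init_member()/reg() feed the result into tcp_register_congestion_control(). Below is a hypothetical, minimal sketch of the BPF side, assuming a libbpf toolchain with vmlinux.h and the SEC("struct_ops")/SEC(".struct_ops") conventions; every sample_*/bpf_sample name is invented for illustration.
	// SPDX-License-Identifier: GPL-2.0
	/* Hypothetical BPF congestion control built on the struct_ops
	 * support added above -- a sketch only.
	 */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	/* bpf_tcp_ca_is_valid_access() promotes the sock argument to
	 * tcp_sock, so this cast is only for the C type system. */
	#define tcp_sk(sk) ((struct tcp_sock *)(sk))

	SEC("struct_ops/sample_ssthresh")
	__u32 BPF_PROG(sample_ssthresh, struct sock *sk)
	{
		__u32 cwnd = tcp_sk(sk)->snd_cwnd;

		return cwnd > 4 ? cwnd >> 1 : 2;	/* halve cwnd on loss */
	}

	SEC("struct_ops/sample_cong_avoid")
	void BPF_PROG(sample_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
	{
		struct tcp_sock *tp = tcp_sk(sk);

		/* naive additive increase; snd_cwnd is writable per the
		 * btf_struct_access() allowlist in the hunk above */
		tp->snd_cwnd += acked;
	}

	SEC("struct_ops/sample_undo_cwnd")
	__u32 BPF_PROG(sample_undo_cwnd, struct sock *sk)
	{
		return tcp_sk(sk)->snd_cwnd;
	}

	SEC(".struct_ops")
	struct tcp_congestion_ops sample_ca = {
		.ssthresh	= (void *)sample_ssthresh,
		.cong_avoid	= (void *)sample_cong_avoid,
		.undo_cwnd	= (void *)sample_undo_cwnd,
		.name		= "bpf_sample",
	};
The three ops implemented here are exactly the non-optional function pointers in the optional_ops/unsupported_ops tables above, so init_member() would accept the map. Once attached (for example via libbpf's bpf_map__attach_struct_ops()), the algorithm would register like any other TCP congestion control and become selectable by name -- an assumed usage flow, not something this patch spells out.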
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 7bd29e694603..4a8550c49202 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -15,6 +15,7 @@
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
+#include <net/sock_reuseport.h>
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -69,9 +70,10 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
}
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
+ reuseport_has_conns(sk, true);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
- inet->inet_id = jiffies;
+ inet->inet_id = prandom_u32();
sk_dst_set(sk, &rt->dst);
err = 0;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a4b5bd4d2c89..e4632bd2026d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1496,11 +1496,6 @@ skip:
}
}
-static bool inetdev_valid_mtu(unsigned int mtu)
-{
- return mtu >= IPV4_MIN_MTU;
-}
-
static void inetdev_send_gratuitous_arp(struct net_device *dev,
struct in_device *in_dev)
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 5c967764041f..103c7d599a3c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/espintcp.h>
#include <linux/highmem.h>
@@ -117,6 +119,132 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
put_page(sg_page(sg));
}
+#ifdef CONFIG_INET_ESPINTCP
+struct esp_tcp_sk {
+ struct sock *sk;
+ struct rcu_head rcu;
+};
+
+static void esp_free_tcp_sk(struct rcu_head *head)
+{
+ struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu);
+
+ sock_put(esk->sk);
+ kfree(esk);
+}
+
+static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct esp_tcp_sk *esk;
+ __be16 sport, dport;
+ struct sock *nsk;
+ struct sock *sk;
+
+ sk = rcu_dereference(x->encap_sk);
+ if (sk && sk->sk_state == TCP_ESTABLISHED)
+ return sk;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ nsk = rcu_dereference_protected(x->encap_sk,
+ lockdep_is_held(&x->lock));
+ if (sk && sk == nsk) {
+ esk = kmalloc(sizeof(*esk), GFP_ATOMIC);
+ if (!esk) {
+ spin_unlock_bh(&x->lock);
+ return ERR_PTR(-ENOMEM);
+ }
+ RCU_INIT_POINTER(x->encap_sk, NULL);
+ esk->sk = sk;
+ call_rcu(&esk->rcu, esp_free_tcp_sk);
+ }
+ spin_unlock_bh(&x->lock);
+
+ sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4,
+ dport, x->props.saddr.a4, sport, 0);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (!tcp_is_ulp_esp(sk)) {
+ sock_put(sk);
+ return ERR_PTR(-EINVAL);
+ }
+
+ spin_lock_bh(&x->lock);
+ nsk = rcu_dereference_protected(x->encap_sk,
+ lockdep_is_held(&x->lock));
+ if (encap->encap_sport != sport ||
+ encap->encap_dport != dport) {
+ sock_put(sk);
+ sk = nsk ?: ERR_PTR(-EREMCHG);
+ } else if (sk == nsk) {
+ sock_put(sk);
+ } else {
+ rcu_assign_pointer(x->encap_sk, sk);
+ }
+ spin_unlock_bh(&x->lock);
+
+ return sk;
+}
+
+static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct sock *sk;
+ int err;
+
+ rcu_read_lock();
+
+ sk = esp_find_tcp_sk(x);
+ err = PTR_ERR_OR_ZERO(sk);
+ if (err)
+ goto out;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ err = espintcp_queue_out(sk, skb);
+ else
+ err = espintcp_push_skb(sk, skb);
+ bh_unlock_sock(sk);
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
+
+ return esp_output_tcp_finish(x, skb);
+}
+
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ local_bh_disable();
+ err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+ local_bh_enable();
+
+ /* EINPROGRESS just happens to do the right thing. It
+ * actually means that the skb has been consumed and
+ * isn't coming back.
+ */
+ return err ?: -EINPROGRESS;
+}
+#else
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+
+ return -EOPNOTSUPP;
+}
+#endif
+
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -147,7 +275,11 @@ static void esp_output_done(struct crypto_async_request *base, int err)
secpath_reset(skb);
xfrm_dev_resume(skb);
} else {
- xfrm_output_resume(skb, err);
+ if (!err &&
+ x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ esp_output_tail_tcp(x, skb);
+ else
+ xfrm_output_resume(skb, err);
}
}
@@ -225,45 +357,100 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
tail[plen - 1] = proto;
}
-static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
+ int encap_type,
+ struct esp_info *esp,
+ __be16 sport,
+ __be16 dport)
{
- int encap_type;
struct udphdr *uh;
__be32 *udpdata32;
- __be16 sport, dport;
- struct xfrm_encap_tmpl *encap = x->encap;
- struct ip_esp_hdr *esph = esp->esph;
unsigned int len;
- spin_lock_bh(&x->lock);
- sport = encap->encap_sport;
- dport = encap->encap_dport;
- encap_type = encap->encap_type;
- spin_unlock_bh(&x->lock);
-
len = skb->len + esp->tailen - skb_transport_offset(skb);
- if (len + sizeof(struct iphdr) >= IP_MAX_MTU)
- return -EMSGSIZE;
+ if (len + sizeof(struct iphdr) > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
- uh = (struct udphdr *)esph;
+ uh = (struct udphdr *)esp->esph;
uh->source = sport;
uh->dest = dport;
uh->len = htons(len);
uh->check = 0;
+ *skb_mac_header(skb) = IPPROTO_UDP;
+
+ if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) {
+ udpdata32 = (__be32 *)(uh + 1);
+ udpdata32[0] = udpdata32[1] = 0;
+ return (struct ip_esp_hdr *)(udpdata32 + 2);
+ }
+
+ return (struct ip_esp_hdr *)(uh + 1);
+}
+
+#ifdef CONFIG_INET_ESPINTCP
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ __be16 *lenp = (void *)esp->esph;
+ struct ip_esp_hdr *esph;
+ unsigned int len;
+ struct sock *sk;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
+
+ rcu_read_lock();
+ sk = esp_find_tcp_sk(x);
+ rcu_read_unlock();
+
+ if (IS_ERR(sk))
+ return ERR_CAST(sk);
+
+ *lenp = htons(len);
+ esph = (struct ip_esp_hdr *)(lenp + 1);
+
+ return esph;
+}
+#else
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif
+
+static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct ip_esp_hdr *esph;
+ __be16 sport, dport;
+ int encap_type;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
+
switch (encap_type) {
default:
case UDP_ENCAP_ESPINUDP:
- esph = (struct ip_esp_hdr *)(uh + 1);
- break;
case UDP_ENCAP_ESPINUDP_NON_IKE:
- udpdata32 = (__be32 *)(uh + 1);
- udpdata32[0] = udpdata32[1] = 0;
- esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+ esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);
+ break;
+ case TCP_ENCAP_ESPINTCP:
+ esph = esp_output_tcp_encap(x, skb, esp);
break;
}
- *skb_mac_header(skb) = IPPROTO_UDP;
+ if (IS_ERR(esph))
+ return PTR_ERR(esph);
+
esp->esph = esph;
return 0;
@@ -279,9 +466,9 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
struct sk_buff *trailer;
int tailen = esp->tailen;
- /* this is non-NULL only with UDP Encapsulation */
+ /* this is non-NULL only with TCP/UDP Encapsulation */
if (x->encap) {
- int err = esp_output_udp_encap(x, skb, esp);
+ int err = esp_output_encap(x, skb, esp);
if (err < 0)
return err;
@@ -474,6 +661,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
if (sg != dsg)
esp_ssg_unref(x, tmp);
+ if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ err = esp_output_tail_tcp(x, skb);
+
error_free:
kfree(tmp);
error:
@@ -600,7 +790,23 @@ int esp_input_done2(struct sk_buff *skb, int err)
if (x->encap) {
struct xfrm_encap_tmpl *encap = x->encap;
+ struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);
struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+ __be16 source;
+
+ switch (x->encap->encap_type) {
+ case TCP_ENCAP_ESPINTCP:
+ source = th->source;
+ break;
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ source = uh->source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ goto out;
+ }
/*
* 1) if the NAT-T peer's IP or port changed then
@@ -609,11 +815,11 @@ int esp_input_done2(struct sk_buff *skb, int err)
* SRC ports.
*/
if (iph->saddr != x->props.saddr.a4 ||
- uh->source != encap->encap_sport) {
+ source != encap->encap_sport) {
xfrm_address_t ipaddr;
ipaddr.a4 = iph->saddr;
- km_new_mapping(x, &ipaddr, uh->source);
+ km_new_mapping(x, &ipaddr, source);
/* XXX: perhaps add an extra
* policy check here, to see
@@ -988,6 +1194,14 @@ static int esp_init_state(struct xfrm_state *x)
case UDP_ENCAP_ESPINUDP_NON_IKE:
x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
break;
+#ifdef CONFIG_INET_ESPINTCP
+ case TCP_ENCAP_ESPINTCP:
+ /* only the length field, TCP encap is done by
+ * the socket
+ */
+ x->props.header_len += 2;
+ break;
+#endif
}
}
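[Annotation, not part of the patch] The TCP_ENCAP_ESPINTCP path above only writes the 16-bit length prefix (esp_init_state() reserves the two bytes, esp_output_tcp_encap() fills them in) and then hands the skb to the encapsulating TCP socket; the stream framing itself is handled by the espintcp ULP. As a rough userspace illustration of the resulting wire format, the sketch below deframes one message from such a stream. It assumes, per my reading of the hunks above and RFC 8229, that the big-endian length counts the two-byte prefix itself plus the ESP or IKE data that follows.
	/* Sketch only: read one RFC 8229 framed message from a TCP stream. */
	#include <arpa/inet.h>
	#include <stdint.h>
	#include <sys/types.h>
	#include <unistd.h>

	static int read_full(int fd, void *buf, size_t len)
	{
		size_t done = 0;

		while (done < len) {
			ssize_t n = read(fd, (char *)buf + done, len - done);

			if (n <= 0)
				return -1;	/* EOF or error */
			done += n;
		}
		return 0;
	}

	/* Returns the payload length placed in buf, or -1 on error. */
	static ssize_t read_espintcp_msg(int fd, void *buf, size_t buflen)
	{
		uint16_t blen;
		size_t payload;

		if (read_full(fd, &blen, sizeof(blen)))
			return -1;
		blen = ntohs(blen);
		if (blen < sizeof(blen))
			return -1;	/* length must cover the prefix */
		payload = blen - sizeof(blen);
		if (payload > buflen)
			return -1;
		if (read_full(fd, buf, payload))
			return -1;
		return payload;
	}
A payload beginning with a four-byte zero "non-ESP marker" would be an IKE message rather than an ESP packet; on the kernel side that demultiplexing is done by the espintcp ULP, which is why esp4.c itself only needs the length field.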
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 0e4a7cf6bc87..e2e219c7854a 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -57,6 +57,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
if (!x)
goto out_reset;
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
sp->xvec[sp->len++] = x;
sp->olen++;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e8bc939b56dd..577db1d50a24 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -70,11 +70,6 @@ fail:
fib_free_table(main_table);
return -ENOMEM;
}
-
-static bool fib4_has_custom_rules(struct net *net)
-{
- return false;
-}
#else
struct fib_table *fib_new_table(struct net *net, u32 id)
@@ -124,17 +119,13 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
h = id & (FIB_TABLE_HASHSZ - 1);
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist,
+ lockdep_rtnl_is_held()) {
if (tb->tb_id == id)
return tb;
}
return NULL;
}
-
-static bool fib4_has_custom_rules(struct net *net)
-{
- return net->ipv4.fib_has_custom_rules;
-}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
static void fib_replace_table(struct net *net, struct fib_table *old,
@@ -1147,7 +1138,7 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
if (!(dev->flags & IFF_UP) ||
ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
ipv4_is_zeronet(prefix) ||
- prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
+ (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
return;
/* add the new */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index a68b5e21ec51..c092e9a55790 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -16,6 +16,9 @@ struct fib_alias {
u8 fa_slen;
u32 tb_id;
s16 fa_default;
+ u8 offload:1,
+ trap:1,
+ unused:6;
struct rcu_head rcu;
};
@@ -35,9 +38,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
- u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
- unsigned int);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ struct fib_rt_info *fri, unsigned int flags);
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
index b804ccbdb241..0c28bd469a68 100644
--- a/net/ipv4/fib_notifier.c
+++ b/net/ipv4/fib_notifier.c
@@ -9,12 +9,12 @@
#include <net/netns/ipv4.h>
#include <net/ip_fib.h>
-int call_fib4_notifier(struct notifier_block *nb, struct net *net,
+int call_fib4_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
info->family = AF_INET;
- return call_fib_notifier(nb, net, event_type, info);
+ return call_fib_notifier(nb, event_type, info);
}
int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
@@ -34,17 +34,16 @@ static unsigned int fib4_seq_read(struct net *net)
return net->ipv4.fib_seq + fib4_rules_seq_read(net);
}
-static int fib4_dump(struct net *net, struct notifier_block *nb)
+static int fib4_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
int err;
- err = fib4_rules_dump(net, nb);
+ err = fib4_rules_dump(net, nb, extack);
if (err)
return err;
- fib_notify(net, nb);
-
- return 0;
+ return fib_notify(net, nb, extack);
}
static const struct fib_notifier_ops fib4_notifier_ops_template = {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index b43a7ba5c6a4..f99e3bac5cab 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -65,9 +65,10 @@ bool fib4_rule_default(const struct fib_rule *rule)
}
EXPORT_SYMBOL_GPL(fib4_rule_default);
-int fib4_rules_dump(struct net *net, struct notifier_block *nb)
+int fib4_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, AF_INET);
+ return fib_rules_dump(net, nb, AF_INET, extack);
}
unsigned int fib4_rules_seq_read(struct net *net)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2db089e10ba0..a803cdd9400a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -504,6 +504,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
int dst_len, u32 tb_id, const struct nl_info *info,
unsigned int nlm_flags)
{
+ struct fib_rt_info fri;
struct sk_buff *skb;
u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
int err = -ENOBUFS;
@@ -512,9 +513,15 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
if (!skb)
goto errout;
- err = fib_dump_info(skb, info->portid, seq, event, tb_id,
- fa->fa_type, key, dst_len,
- fa->fa_tos, fa->fa_info, nlm_flags);
+ fri.fi = fa->fa_info;
+ fri.tb_id = tb_id;
+ fri.dst = key;
+ fri.dst_len = dst_len;
+ fri.tos = fa->fa_tos;
+ fri.type = fa->fa_type;
+ fri.offload = fa->offload;
+ fri.trap = fa->trap;
+ err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -1582,7 +1589,7 @@ failure:
}
int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
- unsigned char *flags, bool skip_oif)
+ u8 rt_family, unsigned char *flags, bool skip_oif)
{
if (nhc->nhc_flags & RTNH_F_DEAD)
*flags |= RTNH_F_DEAD;
@@ -1613,7 +1620,7 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
/* if gateway family does not match nexthop family
* gateway is encoded as RTA_VIA
*/
- if (nhc->nhc_gw_family != nhc->nhc_family) {
+ if (rt_family != nhc->nhc_gw_family) {
int alen = sizeof(struct in6_addr);
struct nlattr *nla;
struct rtvia *via;
@@ -1654,7 +1661,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info);
#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
- int nh_weight)
+ int nh_weight, u8 rt_family)
{
const struct net_device *dev = nhc->nhc_dev;
struct rtnexthop *rtnh;
@@ -1667,7 +1674,7 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
rtnh->rtnh_hops = nh_weight - 1;
rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
- if (fib_nexthop_info(skb, nhc, &flags, true) < 0)
+ if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0)
goto nla_put_failure;
rtnh->rtnh_flags = flags;
@@ -1693,13 +1700,14 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
goto nla_put_failure;
if (unlikely(fi->nh)) {
- if (nexthop_mpath_fill_node(skb, fi->nh) < 0)
+ if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0)
goto nla_put_failure;
goto mp_end;
}
for_nexthops(fi) {
- if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
+ if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
+ AF_INET) < 0)
goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (nh->nh_tclassid &&
@@ -1724,10 +1732,11 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
#endif
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
- u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
- struct fib_info *fi, unsigned int flags)
+ struct fib_rt_info *fri, unsigned int flags)
{
- unsigned int nhs = fib_info_num_path(fi);
+ unsigned int nhs = fib_info_num_path(fri->fi);
+ struct fib_info *fi = fri->fi;
+ u32 tb_id = fri->tb_id;
struct nlmsghdr *nlh;
struct rtmsg *rtm;
@@ -1737,22 +1746,22 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET;
- rtm->rtm_dst_len = dst_len;
+ rtm->rtm_dst_len = fri->dst_len;
rtm->rtm_src_len = 0;
- rtm->rtm_tos = tos;
+ rtm->rtm_tos = fri->tos;
if (tb_id < 256)
rtm->rtm_table = tb_id;
else
rtm->rtm_table = RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, tb_id))
goto nla_put_failure;
- rtm->rtm_type = type;
+ rtm->rtm_type = fri->type;
rtm->rtm_flags = fi->fib_flags;
rtm->rtm_scope = fi->fib_scope;
rtm->rtm_protocol = fi->fib_protocol;
if (rtm->rtm_dst_len &&
- nla_put_in_addr(skb, RTA_DST, dst))
+ nla_put_in_addr(skb, RTA_DST, fri->dst))
goto nla_put_failure;
if (fi->fib_priority &&
nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
@@ -1775,7 +1784,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
unsigned char flags = 0;
- if (fib_nexthop_info(skb, nhc, &flags, false) < 0)
+ if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0)
goto nla_put_failure;
rtm->rtm_flags = flags;
@@ -1794,6 +1803,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
}
+ if (fri->offload)
+ rtm->rtm_flags |= RTM_F_OFFLOAD;
+ if (fri->trap)
+ rtm->rtm_flags |= RTM_F_TRAP;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1813,8 +1827,8 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
int ret = 0;
unsigned int hash = fib_laddr_hashfn(local);
struct hlist_head *head = &fib_info_laddrhash[hash];
+ int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
struct net *net = dev_net(dev);
- int tb_id = l3mdev_fib_table(dev);
struct fib_info *fi;
if (!fib_info_laddrhash || local == 0)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2b2b3d291ab0..ff0c24371e33 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -74,11 +74,13 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
-static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib_entry_notifier(struct notifier_block *nb,
enum fib_event_type event_type, u32 dst,
- int dst_len, struct fib_alias *fa)
+ int dst_len, struct fib_alias *fa,
+ struct netlink_ext_ack *extack)
{
struct fib_entry_notifier_info info = {
+ .info.extack = extack,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
@@ -86,7 +88,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
- return call_fib4_notifier(nb, net, event_type, &info.info);
+ return call_fib4_notifier(nb, event_type, &info.info);
}
static int call_fib_entry_notifiers(struct net *net,
@@ -978,9 +980,12 @@ static struct key_vector *fib_find_node(struct trie *t,
/* Return the first fib alias matching TOS with
* priority less than or equal to PRIO.
+ * If 'find_first' is set, return the first matching
+ * fib alias, regardless of TOS and priority.
*/
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
- u8 tos, u32 prio, u32 tb_id)
+ u8 tos, u32 prio, u32 tb_id,
+ bool find_first)
{
struct fib_alias *fa;
@@ -996,6 +1001,8 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
continue;
if (fa->tb_id != tb_id)
break;
+ if (find_first)
+ return fa;
if (fa->fa_tos > tos)
continue;
if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
@@ -1005,6 +1012,52 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
return NULL;
}
+static struct fib_alias *
+fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
+{
+ u8 slen = KEYLENGTH - fri->dst_len;
+ struct key_vector *l, *tp;
+ struct fib_table *tb;
+ struct fib_alias *fa;
+ struct trie *t;
+
+ tb = fib_get_table(net, fri->tb_id);
+ if (!tb)
+ return NULL;
+
+ t = (struct trie *)tb->tb_data;
+ l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
+ if (!l)
+ return NULL;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
+ fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
+ fa->fa_type == fri->type)
+ return fa;
+ }
+
+ return NULL;
+}
+
+void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
+{
+ struct fib_alias *fa_match;
+
+ rcu_read_lock();
+
+ fa_match = fib_find_matching_alias(net, fri);
+ if (!fa_match)
+ goto out;
+
+ fa_match->offload = fri->offload;
+ fa_match->trap = fri->trap;
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
+
static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
while (!IS_TRIE(tn))
@@ -1061,9 +1114,6 @@ noleaf:
return -ENOMEM;
}
-/* fib notifier for ADD is sent before calling fib_insert_alias with
- * the expectation that the only possible failure ENOMEM
- */
static int fib_insert_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *new,
struct fib_alias *fa, t_key key)
@@ -1116,11 +1166,13 @@ static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
return true;
}
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old);
+
/* Caller must hold RTNL. */
int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
{
- enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
@@ -1147,7 +1199,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
l = fib_find_node(t, &tp, key);
fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
- tb->tb_id) : NULL;
+ tb->tb_id, false) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
* with the same keys [prefix,tos,priority], if such key already
@@ -1214,19 +1266,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_slen = fa->fa_slen;
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
+ new_fa->offload = 0;
+ new_fa->trap = 0;
- err = call_fib_entry_notifiers(net,
- FIB_EVENT_ENTRY_REPLACE,
- key, plen, new_fa,
- extack);
- if (err)
- goto out_free_new_fa;
+ hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+
+ if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
+ tb->tb_id, true) == new_fa) {
+ enum fib_event_type fib_event;
+
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib_entry_notifiers(net, fib_event,
+ key, plen,
+ new_fa, extack);
+ if (err) {
+ hlist_replace_rcu(&new_fa->fa_list,
+ &fa->fa_list);
+ goto out_free_new_fa;
+ }
+ }
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, nlflags);
- hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
@@ -1242,12 +1304,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (fa_match)
goto out;
- if (cfg->fc_nlflags & NLM_F_APPEND) {
- event = FIB_EVENT_ENTRY_APPEND;
+ if (cfg->fc_nlflags & NLM_F_APPEND)
nlflags |= NLM_F_APPEND;
- } else {
+ else
fa = fa_first;
- }
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1266,15 +1326,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_slen = slen;
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
-
- err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
- if (err)
- goto out_free_new_fa;
+ new_fa->offload = 0;
+ new_fa->trap = 0;
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
- goto out_fib_notif;
+ goto out_free_new_fa;
+
+ /* The alias was already inserted, so the node must exist. */
+ l = l ? l : fib_find_node(t, &tp, key);
+ if (WARN_ON_ONCE(!l))
+ goto out_free_new_fa;
+
+ if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
+ new_fa) {
+ enum fib_event_type fib_event;
+
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib_entry_notifiers(net, fib_event, key, plen,
+ new_fa, extack);
+ if (err)
+ goto out_remove_new_fa;
+ }
if (!plen)
tb->tb_num_default++;
@@ -1285,14 +1359,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
succeeded:
return 0;
-out_fib_notif:
- /* notifier was sent that entry would be added to trie, but
- * the add failed and need to recover. Only failure for
- * fib_insert_alias is ENOMEM.
- */
- NL_SET_ERR_MSG(extack, "Failed to insert route into trie");
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key,
- plen, new_fa, NULL);
+out_remove_new_fa:
+ fib_remove_alias(t, tp, l, new_fa);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1543,6 +1611,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
node_pull_suffix(tp, fa->fa_slen);
}
+static void fib_notify_alias_delete(struct net *net, u32 key,
+ struct hlist_head *fah,
+ struct fib_alias *fa_to_delete,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_alias *fa_next, *fa_to_notify;
+ u32 tb_id = fa_to_delete->tb_id;
+ u8 slen = fa_to_delete->fa_slen;
+ enum fib_event_type fib_event;
+
+ /* Do not notify if we do not care about the route. */
+ if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
+ return;
+
+ /* Determine if the route should be replaced by the next route in the
+ * list.
+ */
+ fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
+ struct fib_alias, fa_list);
+ if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ fa_to_notify = fa_next;
+ } else {
+ fib_event = FIB_EVENT_ENTRY_DEL;
+ fa_to_notify = fa_to_delete;
+ }
+ call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
+ fa_to_notify, extack);
+}
+
/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
@@ -1564,7 +1662,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!l)
return -ESRCH;
- fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
+ fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
if (!fa)
return -ESRCH;
@@ -1596,8 +1694,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!fa_to_delete)
return -ESRCH;
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
- fa_to_delete, extack);
+ fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1921,10 +2018,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
continue;
}
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
- n->key,
- KEYLENGTH - fa->fa_slen, fa,
- NULL);
+ fib_notify_alias_delete(net, n->key, &n->leaf, fa,
+ NULL);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -2015,10 +2110,13 @@ void fib_info_notify_update(struct net *net, struct nl_info *info)
}
}
-static void fib_leaf_notify(struct net *net, struct key_vector *l,
- struct fib_table *tb, struct notifier_block *nb)
+static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
struct fib_alias *fa;
+ int last_slen = -1;
+ int err;
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
@@ -2032,39 +2130,57 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
if (tb->tb_id != fa->tb_id)
continue;
- call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key,
- KEYLENGTH - fa->fa_slen, fa);
+ if (fa->fa_slen == last_slen)
+ continue;
+
+ last_slen = fa->fa_slen;
+ err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
+ l->key, KEYLENGTH - fa->fa_slen,
+ fa, extack);
+ if (err)
+ return err;
}
+ return 0;
}
-static void fib_table_notify(struct net *net, struct fib_table *tb,
- struct notifier_block *nb)
+static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
t_key key = 0;
+ int err;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
- fib_leaf_notify(net, l, tb, nb);
+ err = fib_leaf_notify(l, tb, nb, extack);
+ if (err)
+ return err;
key = l->key + 1;
/* stop in case of wrap around */
if (key < l->key)
break;
}
+ return 0;
}
-void fib_notify(struct net *net, struct notifier_block *nb)
+int fib_notify(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
unsigned int h;
+ int err;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
- hlist_for_each_entry_rcu(tb, head, tb_hlist)
- fib_table_notify(net, tb, nb);
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ err = fib_table_notify(tb, nb, extack);
+ if (err)
+ return err;
+ }
}
+ return 0;
}
static void __trie_free_rcu(struct rcu_head *head)
@@ -2128,14 +2244,20 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
if (filter->dump_routes) {
if (!s_fa) {
+ struct fib_rt_info fri;
+
+ fri.fi = fi;
+ fri.tb_id = tb->tb_id;
+ fri.dst = xkey;
+ fri.dst_len = KEYLENGTH - fa->fa_slen;
+ fri.tos = fa->fa_tos;
+ fri.type = fa->fa_type;
+ fri.offload = fa->offload;
+ fri.trap = fa->trap;
err = fib_dump_info(skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id, fa->fa_type,
- xkey,
- KEYLENGTH - fa->fa_slen,
- fa->fa_tos, fi, flags);
+ RTM_NEWROUTE, &fri, flags);
if (err < 0)
goto stop;
}
@@ -2145,7 +2267,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
if (filter->dump_exceptions) {
err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
- &i_fa, s_fa);
+ &i_fa, s_fa, flags);
if (err < 0)
goto stop;
}
@@ -2175,6 +2297,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
int count = cb->args[2];
t_key key = cb->args[3];
+ /* First time here, count and key are both always 0. Count > 0
+ * and key == 0 means the dump has wrapped around and we are done.
+ */
+ if (count && !key)
+ return skb->len;
+
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
int err;
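[Annotation, not part of the patch] Taken together, the fib_trie.c changes make the IPv4 notifier chain announce only the route actually selected for a prefix: FIB_EVENT_ENTRY_REPLACE is sent when a new first alias is inserted (or when the next alias is promoted after a delete), FIB_EVENT_ENTRY_DEL when the last one goes away, and listeners can report hardware status back through the new fib_alias_hw_flags_set(). A rough sketch of the consumer side follows; the demo_* names, the saved-net bookkeeping, and the hardware-programming step are hypothetical, and only the struct fields and fib_alias_hw_flags_set() come from this patch. Registration (per-netns register_fib_notifier()) is omitted.
	#include <linux/notifier.h>
	#include <net/ip_fib.h>
	#include <net/fib_notifier.h>

	struct demo_router {
		struct notifier_block fib_nb;
		struct net *net;	/* saved when the notifier is registered */
	};

	static void demo_report_hw_flags(struct net *net,
					 const struct fib_entry_notifier_info *fen,
					 bool offload, bool trap)
	{
		struct fib_rt_info fri = {
			.fi	 = fen->fi,
			.tb_id	 = fen->tb_id,
			.dst	 = cpu_to_be32(fen->dst),
			.dst_len = fen->dst_len,
			.tos	 = fen->tos,
			.type	 = fen->type,
			.offload = offload,
			.trap	 = trap,
		};

		fib_alias_hw_flags_set(net, &fri);
	}

	static int demo_fib_event(struct notifier_block *nb, unsigned long event,
				  void *ptr)
	{
		struct demo_router *dr = container_of(nb, struct demo_router, fib_nb);
		struct fib_notifier_info *info = ptr;
		struct fib_entry_notifier_info *fen;

		if (info->family != AF_INET)
			return NOTIFY_DONE;
		fen = container_of(info, struct fib_entry_notifier_info, info);

		switch (event) {
		case FIB_EVENT_ENTRY_REPLACE:
			/* (re)program the prefix in hardware, then report it */
			demo_report_hw_flags(dr->net, fen, true, false);
			break;
		case FIB_EVENT_ENTRY_DEL:
			/* remove the prefix from hardware */
			break;
		}
		return NOTIFY_DONE;
	}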
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 30fa771d382a..dcc79ff54b41 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -662,8 +662,8 @@ static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
[FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, },
[FOU_ATTR_PEER_V4] = { .type = NLA_U32, },
- [FOU_ATTR_LOCAL_V6] = { .type = sizeof(struct in6_addr), },
- [FOU_ATTR_PEER_V6] = { .type = sizeof(struct in6_addr), },
+ [FOU_ATTR_LOCAL_V6] = { .len = sizeof(struct in6_addr), },
+ [FOU_ATTR_PEER_V6] = { .len = sizeof(struct in6_addr), },
[FOU_ATTR_PEER_PORT] = { .type = NLA_U16, },
[FOU_ATTR_IFINDEX] = { .type = NLA_S32, },
};
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 44bfeecac33e..5fd6e8ed02b5 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -127,7 +127,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr)))
return -EINVAL;
- ershdr = (struct erspan_base_hdr *)options;
+ ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len);
tpi->key = cpu_to_be32(get_session_id(ershdr));
}
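
The gre_demux fix recomputes the ERSPAN header pointer from skb->data after pskb_may_pull(), since that call may reallocate the linear data area and leave the earlier `options` pointer stale. The same hazard in plain C, with realloc() standing in for the possible reallocation (an illustrative stand-in, not the skb machinery):

/* Any call that may move the buffer invalidates pointers derived from
 * the old base; offsets must be re-applied to the new base afterwards.
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct hdr { unsigned char ver; unsigned char len; };

int main(void)
{
	size_t off = 8;				/* offset of the header */
	unsigned char *buf = malloc(16);

	if (!buf)
		return 1;
	memset(buf, 0, 16);

	struct hdr *h = (struct hdr *)(buf + off);	/* computed early */
	h->ver = 1;

	unsigned char *grown = realloc(buf, 64);	/* may move the block */
	if (!grown) {
		free(buf);
		return 1;
	}
	buf = grown;

	/* 'h' may now point into freed memory; recompute it from the new
	 * base instead of reusing it, which is what the hunk above does
	 * with skb->data after pskb_may_pull().
	 */
	h = (struct hdr *)(buf + off);
	printf("ver=%u len=%u\n", (unsigned)h->ver, (unsigned)h->len);

	free(buf);
	return 0;
}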
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 4de7e962d3da..2e6d1b7a7bc9 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -174,7 +174,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
if (skb_gro_checksum_simple_validate(skb))
goto out_unlock;
- skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ skb_gro_checksum_try_convert(skb, IPPROTO_GRE,
null_compute_pseudo);
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1510e951f451..f369e7ce685b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -249,10 +249,11 @@ bool icmp_global_allow(void)
bool rc = false;
/* Check if token bucket is empty and cannot be refilled
- * without taking the spinlock.
+ * without taking the spinlock. The READ_ONCE() are paired
+ * with the following WRITE_ONCE() in this same function.
*/
- if (!icmp_global.credit) {
- delta = min_t(u32, now - icmp_global.stamp, HZ);
+ if (!READ_ONCE(icmp_global.credit)) {
+ delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ);
if (delta < HZ / 50)
return false;
}
@@ -262,14 +263,14 @@ bool icmp_global_allow(void)
if (delta >= HZ / 50) {
incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
if (incr)
- icmp_global.stamp = now;
+ WRITE_ONCE(icmp_global.stamp, now);
}
credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
if (credit) {
credit--;
rc = true;
}
- icmp_global.credit = credit;
+ WRITE_ONCE(icmp_global.credit, credit);
spin_unlock(&icmp_global.lock);
return rc;
}
@@ -582,7 +583,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (!rt)
goto out;
- net = dev_net(rt->dst.dev);
+
+ if (rt->dst.dev)
+ net = dev_net(rt->dst.dev);
+ else if (skb_in->dev)
+ net = dev_net(skb_in->dev);
+ else
+ goto out;
/*
* Find the original header. It is expected to be valid, of course.
@@ -676,7 +683,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
if (dev)
- saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+ saddr = inet_select_addr(dev, iph->saddr,
+ RT_SCOPE_LINK);
else
saddr = 0;
rcu_read_unlock();
@@ -740,6 +748,39 @@ out:;
}
EXPORT_SYMBOL(__icmp_send);
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+ struct sk_buff *cloned_skb = NULL;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ __be32 orig_ip;
+
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(ct->status & IPS_SRC_NAT)) {
+ icmp_send(skb_in, type, code, info);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct iphdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct iphdr))))
+ goto out;
+
+ orig_ip = ip_hdr(skb_in)->saddr;
+ ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
+ icmp_send(skb_in, type, code, info);
+ ip_hdr(skb_in)->saddr = orig_ip;
+out:
+ consume_skb(cloned_skb);
+}
+EXPORT_SYMBOL(icmp_ndo_send);
+#endif
static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
{
@@ -902,7 +943,7 @@ static bool icmp_redirect(struct sk_buff *skb)
return false;
}
- icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
+ icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway));
return true;
}
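
The icmp_global_allow() hunks annotate a lockless fast path: the token bucket's credit and stamp are peeked without the spinlock, so the reads and the later updates are paired with READ_ONCE()/WRITE_ONCE(). A small userspace sketch of the same shape, using C11 atomics for the lockless fields and a mutex for the real update (the rate and burst numbers are illustrative, not the icmp sysctls):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>
#include <time.h>

#define RATE_PER_SEC	1000u		/* illustrative */
#define BURST		50u

static struct {
	pthread_mutex_t lock;
	_Atomic unsigned int credit;
	_Atomic long stamp;		/* seconds at last refill */
} bucket = { .lock = PTHREAD_MUTEX_INITIALIZER };

static bool rate_limit_allow(void)
{
	long now = time(NULL);
	bool ok = false;

	/* Lockless fast path, like the READ_ONCE() pre-check above: an
	 * empty bucket that cannot be refilled yet stays empty under the
	 * lock too, so bail out without taking it.
	 */
	if (!atomic_load(&bucket.credit) &&
	    now == atomic_load(&bucket.stamp))
		return false;

	pthread_mutex_lock(&bucket.lock);
	long delta = now - atomic_load(&bucket.stamp);
	unsigned int credit = atomic_load(&bucket.credit);

	if (delta > 0) {
		if (delta > 1)
			delta = 1;	/* cap the refill window */
		unsigned int incr = RATE_PER_SEC * (unsigned int)delta;

		credit = credit + incr < BURST ? credit + incr : BURST;
		atomic_store(&bucket.stamp, now);
	}
	if (credit) {
		credit--;
		ok = true;
	}
	atomic_store(&bucket.credit, credit);
	pthread_mutex_unlock(&bucket.lock);
	return ok;
}

int main(void)
{
	int allowed = 0;

	for (int i = 0; i < 100; i++)
		allowed += rate_limit_allow();
	printf("allowed %d of 100 requests\n", allowed);
	return 0;
}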
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 180f6896b98b..3b9c7a2725a9 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(__ip_mc_inc_group);
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
- __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+ __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
}
EXPORT_SYMBOL(ip_mc_inc_group);
@@ -1563,7 +1563,7 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb)
}
}
-static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+static __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
{
return skb_checksum_simple_validate(skb);
}
@@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
iml->sflist = NULL;
iml->sfmode = mode;
rcu_assign_pointer(inet->mc_list, iml);
- __ip_mc_inc_group(in_dev, addr, mode);
+ ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
err = 0;
done:
return err;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f5c163d4771b..a4db79b1b643 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -560,7 +560,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
- if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
rcu_read_unlock();
return &rt->dst;
@@ -598,7 +598,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
- if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
return &rt->dst;
@@ -610,12 +610,6 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
-#if IS_ENABLED(CONFIG_IPV6)
-#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
-#else
-#define AF_INET_FAMILY(fam) true
-#endif
-
/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
const int max_retries,
@@ -716,7 +710,7 @@ static void reqsk_timer_handler(struct timer_list *t)
* ones are about to clog our table.
*/
qlen = reqsk_queue_len(queue);
- if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
+ if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
int young = reqsk_queue_len_young(queue) << 1;
while (thresh > 2) {
@@ -770,6 +764,18 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
+ const gfp_t priority)
+{
+ struct inet_connection_sock *icsk = inet_csk(newsk);
+
+ if (!icsk->icsk_ulp_ops)
+ return;
+
+ if (icsk->icsk_ulp_ops->clone)
+ icsk->icsk_ulp_ops->clone(req, newsk, priority);
+}
+
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
* @sk: the socket to clone
@@ -810,6 +816,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
/* Deinitialize accept_queue to trap illegal accesses. */
memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+ inet_clone_ulp(req, newsk, priority);
+
security_inet_csk_clone(newsk, req);
}
return newsk;
@@ -906,7 +914,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
percpu_counter_inc(sk->sk_prot->orphan_count);
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
- BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
BUG_ON(sk != req->rsk_listener);
/* Paranoid, to prevent race condition if
@@ -915,7 +923,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
* Also to satisfy an assertion in
* tcp_v4_destroy_sock().
*/
- tcp_sk(child)->fastopen_rsk = NULL;
+ RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL);
}
inet_csk_destroy_sock(child);
}
@@ -934,7 +942,7 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
req->sk = child;
req->dl_next = NULL;
if (queue->rskq_accept_head == NULL)
- queue->rskq_accept_head = req;
+ WRITE_ONCE(queue->rskq_accept_head, req);
else
queue->rskq_accept_tail->dl_next = req;
queue->rskq_accept_tail = req;
@@ -1086,7 +1094,7 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
if (!dst)
goto out;
}
- dst->ops->update_pmtu(dst, sk, NULL, mtu);
+ dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
dst = __sk_dst_check(sk, 0);
if (!dst)
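
The new inet_clone_ulp() helper gives an attached upper-layer protocol the chance to copy its per-socket state onto the freshly cloned child, and both the ops table and its clone hook are optional. A small sketch of that optional-callback pattern, with illustrative types that are not the kernel's:

#include <stdio.h>

struct conn;

struct ulp_ops {
	const char *name;
	/* optional: copy ULP state from parent to child */
	void (*clone)(const struct conn *parent, struct conn *child);
};

struct conn {
	char label[16];
	const struct ulp_ops *ulp;	/* may be NULL */
};

static void demo_clone(const struct conn *parent, struct conn *child)
{
	snprintf(child->label, sizeof(child->label), "%s*", parent->label);
}

static const struct ulp_ops demo_ulp = { .name = "demo", .clone = demo_clone };

static void conn_clone(const struct conn *parent, struct conn *child)
{
	*child = *parent;
	/* invoke the hook only when both the table and the op exist,
	 * as inet_clone_ulp() does above
	 */
	if (parent->ulp && parent->ulp->clone)
		parent->ulp->clone(parent, child);
}

int main(void)
{
	struct conn listener = { .label = "listen", .ulp = &demo_ulp };
	struct conn child;

	conn_clone(&listener, &child);
	printf("child label: %s (ulp: %s)\n", child.label,
	       child.ulp ? child.ulp->name : "none");
	return 0;
}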
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index bbb005eb5218..f11e997e517b 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -193,7 +193,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
struct inet_diag_meminfo minfo = {
.idiag_rmem = sk_rmem_alloc_get(sk),
- .idiag_wmem = sk->sk_wmem_queued,
+ .idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
.idiag_fmem = sk->sk_forward_alloc,
.idiag_tmem = sk_wmem_alloc_get(sk),
};
@@ -226,17 +226,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
r->idiag_expires =
- jiffies_to_msecs(icsk->icsk_timeout - jiffies);
+ jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies);
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
r->idiag_retrans = icsk->icsk_probes_out;
r->idiag_expires =
- jiffies_to_msecs(icsk->icsk_timeout - jiffies);
+ jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies);
} else if (timer_pending(&sk->sk_timer)) {
r->idiag_timer = 2;
r->idiag_retrans = icsk->icsk_probes_out;
r->idiag_expires =
- jiffies_to_msecs(sk->sk_timer.expires - jiffies);
+ jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
} else {
r->idiag_timer = 0;
r->idiag_expires = 0;
@@ -342,16 +342,13 @@ static int inet_twsk_diag_fill(struct sock *sk,
r = nlmsg_data(nlh);
BUG_ON(tw->tw_state != TCP_TIME_WAIT);
- tmo = tw->tw_timer.expires - jiffies;
- if (tmo < 0)
- tmo = 0;
-
inet_diag_msg_common_fill(r, sk);
r->idiag_retrans = 0;
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
- r->idiag_expires = jiffies_to_msecs(tmo);
+ tmo = tw->tw_timer.expires - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = 0;
@@ -385,7 +382,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
offsetof(struct sock, sk_cookie));
tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
- r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = 0;
@@ -914,11 +911,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
num = 0;
ilb = &hashinfo->listening_hash[i];
spin_lock(&ilb->lock);
- sk_for_each(sk, &ilb->head) {
+ sk_nulls_for_each(sk, node, &ilb->nulls_head) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
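
The inet_diag hunks switch to jiffies_delta_to_msecs() because the timer being reported may already have fired, leaving a negative delta; converting that unclamped to an unsigned millisecond count produces a huge bogus expiry. A tiny sketch of the clamping helper, with TICK_HZ as an illustrative stand-in for HZ:

#include <stdio.h>

#define TICK_HZ 100L			/* illustrative tick rate */

static unsigned long ticks_to_msecs(long ticks)
{
	return (unsigned long)ticks * (1000L / TICK_HZ);
}

static unsigned long ticks_delta_to_msecs(long delta)
{
	return ticks_to_msecs(delta > 0 ? delta : 0);	/* clamp first */
}

int main(void)
{
	long expires = 500, now = 520;	/* timer fired 20 ticks ago */

	printf("unclamped: %lu ms\n", ticks_to_msecs(expires - now));
	printf("clamped:   %lu ms\n", ticks_delta_to_msecs(expires - now));
	return 0;
}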
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 97824864e40d..2bbaaf0c7176 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -240,7 +240,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score = sk->sk_family == PF_INET ? 2 : 1;
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
}
return score;
@@ -516,10 +516,11 @@ static int inet_reuseport_add_sock(struct sock *sk,
struct inet_listen_hashbucket *ilb)
{
struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
+ const struct hlist_nulls_node *node;
struct sock *sk2;
kuid_t uid = sock_i_uid(sk);
- sk_for_each_rcu(sk2, &ilb->head) {
+ sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
if (sk2 != sk &&
sk2->sk_family == sk->sk_family &&
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
@@ -555,9 +556,9 @@ int __inet_hash(struct sock *sk, struct sock *osk)
}
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
sk->sk_family == AF_INET6)
- hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
+ __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head);
else
- hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+ __sk_nulls_add_node_rcu(sk, &ilb->nulls_head);
inet_hash2(hashinfo, sk);
ilb->count++;
sock_set_flag(sk, SOCK_RCU_FREE);
@@ -606,11 +607,9 @@ void inet_unhash(struct sock *sk)
reuseport_detach_sock(sk);
if (ilb) {
inet_unhash2(hashinfo, sk);
- __sk_del_node_init(sk);
- ilb->count--;
- } else {
- __sk_nulls_del_node_init_rcu(sk);
+ ilb->count--;
}
+ __sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
unlock:
spin_unlock_bh(lock);
@@ -750,7 +749,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
- INIT_HLIST_HEAD(&h->listening_hash[i].head);
+ INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head,
+ i + LISTENING_NULLS_BASE);
h->listening_hash[i].count = 0;
}
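
The listening hash conversion above moves from plain hlist to hlist_nulls, with each bucket terminated by its own nulls value (i + LISTENING_NULLS_BASE). The usual rationale for nulls lists is that an RCU reader walking a chain without a lock may follow an entry that is concurrently moved to another bucket; because the terminator encodes the bucket it belongs to, a reader that finishes on the wrong marker knows it strayed and can restart. A single-threaded sketch of just the encoding:

#include <stdint.h>
#include <stdio.h>

#define NULLS_MARKER(id)	((uintptr_t)(((id) << 1) | 1))
#define IS_NULLS(p)		((p) & 1)
#define NULLS_ID(p)		((p) >> 1)

struct node {
	int key;
	uintptr_t next;		/* pointer or odd nulls marker */
};

struct bucket {
	uintptr_t first;	/* starts out as its own nulls marker */
};

static void bucket_init(struct bucket *b, unsigned int id)
{
	b->first = NULLS_MARKER(id);
}

static void bucket_add(struct bucket *b, struct node *n)
{
	n->next = b->first;
	b->first = (uintptr_t)n;
}

/* Returns 1 if the walk ended on this bucket's own marker, 0 if it
 * ended on another chain's marker (a lockless reader would restart).
 */
static int bucket_walk(const struct bucket *b, unsigned int id)
{
	uintptr_t p = b->first;

	while (!IS_NULLS(p)) {
		const struct node *n = (const struct node *)p;

		printf("key %d\n", n->key);
		p = n->next;
	}
	return NULLS_ID(p) == id;
}

int main(void)
{
	struct bucket b;
	struct node n1 = { .key = 1 }, n2 = { .key = 2 };

	bucket_init(&b, 7);
	bucket_add(&b, &n1);
	bucket_add(&b, &n2);
	printf("walk ok: %d\n", bucket_walk(&b, 7));
	return 0;
}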
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index be778599bfed..ff327a62c9ce 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -160,7 +160,12 @@ static void inet_peer_gc(struct inet_peer_base *base,
base->total / inet_peer_threshold * HZ;
for (i = 0; i < gc_cnt; i++) {
p = gc_stack[i];
- delta = (__u32)jiffies - p->dtime;
+
+ /* The READ_ONCE() pairs with the WRITE_ONCE()
+ * in inet_putpeer()
+ */
+ delta = (__u32)jiffies - READ_ONCE(p->dtime);
+
if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
gc_stack[i] = NULL;
}
@@ -237,7 +242,10 @@ EXPORT_SYMBOL_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
- p->dtime = (__u32)jiffies;
+ /* The WRITE_ONCE() pairs with itself (we run lockless)
+ * and the READ_ONCE() in inet_peer_gc()
+ */
+ WRITE_ONCE(p->dtime, (__u32)jiffies);
if (refcount_dec_and_test(&p->refcnt))
call_rcu(&p->rcu, inetpeer_free_rcu);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 06f6f280b9ff..00ec819f949b 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb)
rt = skb_rtable(skb);
- if (opt->is_strictroute && rt->rt_gw_family)
+ if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
IPCB(skb)->flags |= IPSKB_FORWARDED;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index a53a543fe055..8274f98c511c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -340,6 +340,8 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
+ const struct iphdr *tnl_params;
+
if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
raw_proto, false) < 0)
goto drop;
@@ -348,7 +350,9 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
skb_pop_mac_header(skb);
else
skb_reset_mac_header(skb);
- if (tunnel->collect_md) {
+
+ tnl_params = &tunnel->parms.iph;
+ if (tunnel->collect_md || tnl_params->daddr == 0) {
__be16 flags;
__be64 tun_id;
@@ -509,9 +513,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
key = &tun_info->key;
if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
goto err_free_skb;
- md = ip_tunnel_info_opts(tun_info);
- if (!md)
+ if (tun_info->options_len < sizeof(*md))
goto err_free_skb;
+ md = ip_tunnel_info_opts(tun_info);
/* ERSPAN has fixed 8 byte GRE header */
version = md->version;
@@ -1446,6 +1450,7 @@ static void erspan_setup(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
ether_setup(dev);
+ dev->max_mtu = 0;
dev->netdev_ops = &erspan_netdev_ops;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
@@ -1459,8 +1464,8 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
[IFLA_GRE_IKEY] = { .type = NLA_U32 },
[IFLA_GRE_OKEY] = { .type = NLA_U32 },
- [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
- [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
[IFLA_GRE_TTL] = { .type = NLA_U8 },
[IFLA_GRE_TOS] = { .type = NLA_U8 },
[IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
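
In the erspan_fb_xmit() hunk above, ip_tunnel_info_opts() points just past the tunnel info and so cannot be NULL; the meaningful check is that options_len is large enough for the metadata about to be read, which is what the fix validates before taking the pointer. A small sketch of that length-before-cast pattern (the two-byte metadata struct here is illustrative):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct erspan_md { unsigned char version; unsigned char dir_hwid; };

struct tun_opts {
	size_t options_len;
	unsigned char options[32];	/* variable-length in the kernel */
};

static const struct erspan_md *get_erspan_md(const struct tun_opts *o)
{
	if (o->options_len < sizeof(struct erspan_md))
		return NULL;		/* too short: reject, don't read */
	return (const struct erspan_md *)o->options;
}

int main(void)
{
	struct tun_opts o = { .options_len = 0 };
	const struct erspan_md *md = get_erspan_md(&o);

	printf("short buffer -> %s\n", md ? "accepted" : "rejected");

	o.options_len = sizeof(struct erspan_md);
	memcpy(o.options, "\x01\x00", 2);
	md = get_erspan_md(&o);
	if (md)
		printf("version %u\n", (unsigned)md->version);
	return 0;
}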
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 1e2392b7c64e..aa438c6758a7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -199,7 +199,7 @@ resubmit:
kfree_skb(skb);
return;
}
- nf_reset(skb);
+ nf_reset_ct(skb);
}
ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
skb);
@@ -302,16 +302,31 @@ drop:
return true;
}
+static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
+ const struct sk_buff *hint)
+{
+ return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
+ ip_hdr(hint)->tos == iph->tos;
+}
+
INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *));
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
- struct sk_buff *skb, struct net_device *dev)
+ struct sk_buff *skb, struct net_device *dev,
+ const struct sk_buff *hint)
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
struct rtable *rt;
int err;
+ if (ip_can_use_hint(skb, iph, hint)) {
+ err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
+ dev, hint);
+ if (unlikely(err))
+ goto drop_error;
+ }
+
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
@@ -408,7 +423,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- ret = ip_rcv_finish_core(net, sk, skb, dev);
+ ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -535,11 +550,20 @@ static void ip_sublist_rcv_finish(struct list_head *head)
}
}
+static struct sk_buff *ip_extract_route_hint(const struct net *net,
+ struct sk_buff *skb, int rt_type)
+{
+ if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST)
+ return NULL;
+
+ return skb;
+}
+
static void ip_list_rcv_finish(struct net *net, struct sock *sk,
struct list_head *head)
{
+ struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
- struct sk_buff *skb, *next;
struct list_head sublist;
INIT_LIST_HEAD(&sublist);
@@ -554,11 +578,14 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
continue;
dst = skb_dst(skb);
if (curr_dst != dst) {
+ hint = ip_extract_route_hint(net, skb,
+ ((struct rtable *)dst)->rt_type);
+
/* dispatch old sublist */
if (!list_empty(&sublist))
ip_sublist_rcv_finish(&sublist);
@@ -611,5 +638,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
list_add_tail(&skb->list, &sublist);
}
/* dispatch final sublist */
- ip_sublist_rcv(&sublist, curr_dev, curr_net);
+ if (!list_empty(&sublist))
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
}
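
The route-hint hunks above exploit the fact that packets in one receive batch often share destination address and TOS: the previous packet's routing result can then be reused instead of a full lookup, except when custom FIB rules or a broadcast route make that unsafe. A minimal sketch of the hint idea, with lookup_route() as a stand-in for the FIB lookup:

#include <stdint.h>
#include <stdio.h>

struct pkt { uint32_t daddr; uint8_t tos; };

static int lookups;

static int lookup_route(uint32_t daddr, uint8_t tos)
{
	(void)tos;
	lookups++;
	return (int)(daddr % 3);	/* fake next-hop id */
}

int main(void)
{
	struct pkt batch[] = {
		{ 0x0a000001, 0 }, { 0x0a000001, 0 }, { 0x0a000001, 0 },
		{ 0x0a000002, 0 }, { 0x0a000002, 0 },
	};
	const struct pkt *hint = NULL;
	int nh = -1;

	for (size_t i = 0; i < sizeof(batch) / sizeof(batch[0]); i++) {
		const struct pkt *p = &batch[i];

		if (hint && hint->daddr == p->daddr && hint->tos == p->tos) {
			/* reuse previous decision, as ip_can_use_hint() does */
		} else {
			nh = lookup_route(p->daddr, p->tos);
			hint = p;
		}
		printf("pkt %zu -> nexthop %d\n", i, nh);
	}
	printf("%d lookups for %zu packets\n", lookups,
	       sizeof(batch) / sizeof(batch[0]));
	return 0;
}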
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cc7ef0d05bbd..d84819893db9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -240,8 +240,8 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
static int ip_finish_output_gso(struct net *net, struct sock *sk,
struct sk_buff *skb, unsigned int mtu)
{
+ struct sk_buff *segs, *nskb;
netdev_features_t features;
- struct sk_buff *segs;
int ret = 0;
/* common case: seglen is <= mtu
@@ -272,8 +272,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
consume_skb(skb);
- do {
- struct sk_buff *nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
int err;
skb_mark_not_on_list(segs);
@@ -281,8 +280,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
if (err && ret == 0)
ret = err;
- segs = nskb;
- } while (segs);
+ }
return ret;
}
@@ -422,7 +420,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb_dst(skb)->dev;
+ struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
@@ -430,7 +428,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
skb->protocol = htons(ETH_P_IP);
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, dev,
+ net, sk, skb, indev, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -499,7 +497,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
- if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family)
+ if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
@@ -645,11 +643,12 @@ void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
EXPORT_SYMBOL(ip_fraglist_prepare);
void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
- unsigned int ll_rs, unsigned int mtu,
+ unsigned int ll_rs, unsigned int mtu, bool DF,
struct ip_frag_state *state)
{
struct iphdr *iph = ip_hdr(skb);
+ state->DF = DF;
state->hlen = hlen;
state->ll_rs = ll_rs;
state->mtu = mtu;
@@ -668,9 +667,6 @@ static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
/* Copy the flags to each fragment. */
IPCB(to)->flags = IPCB(from)->flags;
- if (IPCB(from)->flags & IPSKB_FRAG_PMTU)
- state->iph->frag_off |= htons(IP_DF);
-
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
@@ -738,6 +734,8 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
*/
iph = ip_hdr(skb2);
iph->frag_off = htons((state->offset >> 3));
+ if (state->DF)
+ iph->frag_off |= htons(IP_DF);
/*
* Added AC : If we are fragmenting a fragment that's not the
@@ -771,6 +769,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct rtable *rt = skb_rtable(skb);
unsigned int mtu, hlen, ll_rs;
struct ip_fraglist_iter iter;
+ ktime_t tstamp = skb->tstamp;
struct ip_frag_state state;
int err = 0;
@@ -846,6 +845,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
ip_fraglist_prepare(skb, &iter);
}
+ skb->tstamp = tstamp;
err = output(net, sk, skb);
if (!err)
@@ -881,7 +881,8 @@ slow_path:
* Fragment the datagram.
*/
- ip_frag_init(skb, hlen, ll_rs, mtu, &state);
+ ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
+ &state);
/*
* Keep copying data until we run out.
@@ -900,6 +901,7 @@ slow_path:
/*
* Put this fragment into the sending queue.
*/
+ skb2->tstamp = tstamp;
err = output(net, sk, skb2);
if (err)
goto fail;
@@ -1254,18 +1256,22 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->addr = ipc->addr;
}
- /*
- * We steal reference to this route, caller should not release it
- */
- *rtp = NULL;
cork->fragsize = ip_sk_use_pmtu(sk) ?
- dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+ dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
+
+ if (!inetdev_valid_mtu(cork->fragsize))
+ return -ENETUNREACH;
cork->gso_size = ipc->gso_size;
+
cork->dst = &rt->dst;
+ /* We stole this route, caller should not release it. */
+ *rtp = NULL;
+
cork->length = 0;
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
+ cork->mark = ipc->sockc.mark;
cork->priority = ipc->priority;
cork->transmit_time = ipc->sockc.transmit_time;
cork->tx_flags = 0;
@@ -1529,7 +1535,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
}
skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = cork->mark;
skb->tstamp = cork->transmit_time;
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
@@ -1693,7 +1699,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
inet_sk(sk)->tos = arg->tos;
- sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
sk->sk_sndbuf = sysctl_wmem_default;
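
The ip_finish_output_gso() hunk above replaces the hand-rolled "save next, send, advance" loop over GSO segments with skb_list_walk_safe(), which samples the next pointer before the current segment is handed off and must no longer be touched. The same safe-walk shape on a plain singly linked list (the macro here is a simplified stand-in, not the kernel's):

#include <stdlib.h>
#include <stdio.h>

struct seg {
	int id;
	struct seg *next;
};

#define seg_walk_safe(pos, n, head)				\
	for ((pos) = (head), (n) = (pos) ? (pos)->next : NULL;	\
	     (pos);						\
	     (pos) = (n), (n) = (pos) ? (pos)->next : NULL)

static struct seg *make_seg(int id, struct seg *next)
{
	struct seg *s = malloc(sizeof(*s));

	if (!s)
		abort();
	s->id = id;
	s->next = next;
	return s;
}

int main(void)
{
	struct seg *list = make_seg(1, make_seg(2, make_seg(3, NULL)));
	struct seg *s, *n;

	seg_walk_safe(s, n, list) {
		printf("sending segment %d\n", s->id);
		free(s);	/* s must not be dereferenced after this */
	}
	return 0;
}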
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 38c02bb62e2c..74e1d964a615 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -505,7 +505,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
if (skb_valid_dst(skb))
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IP)) {
if (!skb_is_gso(skb) &&
@@ -1236,10 +1236,8 @@ int ip_tunnel_init(struct net_device *dev)
iph->version = 4;
iph->ihl = 5;
- if (tunnel->collect_md) {
- dev->features |= NETIF_F_NETNS_LOCAL;
+ if (tunnel->collect_md)
netif_keep_dst(dev);
- }
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 1452a97914a0..47f8b947eef1 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -34,6 +34,9 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/dst_metadata.h>
+#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
const struct ip_tunnel_encap_ops __rcu *
iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
@@ -126,15 +129,14 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
if (!md || md->type != METADATA_IP_TUNNEL ||
md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
-
return NULL;
- res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags);
+ src = &md->u.tun_info;
+ res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags);
if (!res)
return NULL;
dst = &res->u.tun_info;
- src = &md->u.tun_info;
dst->key.tun_id = src->key.tun_id;
if (src->mode & IP_TUNNEL_INFO_IPV6)
memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
@@ -143,6 +145,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
dst->key.u.ipv4.dst = src->key.u.ipv4.src;
dst->key.tun_flags = src->key.tun_flags;
dst->mode = src->mode | IP_TUNNEL_INFO_TX;
+ ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
+ src->options_len, 0);
return res;
}
@@ -211,30 +215,243 @@ void ip_tunnel_get_stats64(struct net_device *dev,
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
+ [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS },
[LWTUNNEL_IP_ID] = { .type = NLA_U64 },
[LWTUNNEL_IP_DST] = { .type = NLA_U32 },
[LWTUNNEL_IP_SRC] = { .type = NLA_U32 },
[LWTUNNEL_IP_TTL] = { .type = NLA_U8 },
[LWTUNNEL_IP_TOS] = { .type = NLA_U8 },
[LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
+ [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
+ [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+};
+
+static const struct nla_policy
+vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
+ [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 },
};
+static const struct nla_policy
+erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
+ [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
+static int ip_tun_parse_opts_geneve(struct nlattr *attr,
+ struct ip_tunnel_info *info, int opts_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
+ int data_len, err;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr,
+ geneve_opt_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] ||
+ !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] ||
+ !tb[LWTUNNEL_IP_OPT_GENEVE_DATA])
+ return -EINVAL;
+
+ attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
+ data_len = nla_len(attr);
+ if (data_len % 4)
+ return -EINVAL;
+
+ if (info) {
+ struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len;
+
+ memcpy(opt->opt_data, nla_data(attr), data_len);
+ opt->length = data_len / 4;
+ attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS];
+ opt->opt_class = nla_get_be16(attr);
+ attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
+ opt->type = nla_get_u8(attr);
+ info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+ }
+
+ return sizeof(struct geneve_opt) + data_len;
+}
+
+static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
+ struct ip_tunnel_info *info, int opts_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr,
+ vxlan_opt_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
+ return -EINVAL;
+
+ if (info) {
+ struct vxlan_metadata *md =
+ ip_tunnel_info_opts(info) + opts_len;
+
+ attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
+ md->gbp = nla_get_u32(attr);
+ info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+ }
+
+ return sizeof(struct vxlan_metadata);
+}
+
+static int ip_tun_parse_opts_erspan(struct nlattr *attr,
+ struct ip_tunnel_info *info, int opts_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
+ int err;
+ u8 ver;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
+ erspan_opt_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
+ return -EINVAL;
+
+ ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
+ if (ver == 1) {
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
+ return -EINVAL;
+ } else if (ver == 2) {
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
+ !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
+ if (info) {
+ struct erspan_metadata *md =
+ ip_tunnel_info_opts(info) + opts_len;
+
+ md->version = ver;
+ if (ver == 1) {
+ attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(attr);
+ } else {
+ attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(attr);
+ attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(attr));
+ }
+
+ info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ }
+
+ return sizeof(struct erspan_metadata);
+}
+
+static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+ struct netlink_ext_ack *extack)
+{
+ int err, rem, opt_len, opts_len = 0, type = 0;
+ struct nlattr *nla;
+
+ if (!attr)
+ return 0;
+
+ err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
+ ip_opts_policy, extack);
+ if (err)
+ return err;
+
+ nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(nla)) {
+ case LWTUNNEL_IP_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX)
+ return -EINVAL;
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ case LWTUNNEL_IP_OPTS_VXLAN:
+ if (type)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case LWTUNNEL_IP_OPTS_ERSPAN:
+ if (type)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_ERSPAN_OPT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ return opts_len;
+}
+
+static int ip_tun_get_optlen(struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ return ip_tun_parse_opts(attr, NULL, extack);
+}
+
+static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+ struct netlink_ext_ack *extack)
+{
+ return ip_tun_parse_opts(attr, info, extack);
+}
+
static int ip_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_info *tun_info;
- struct lwtunnel_state *new_state;
struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
- int err;
+ struct lwtunnel_state *new_state;
+ struct ip_tunnel_info *tun_info;
+ int err, opt_len;
err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
ip_tun_policy, extack);
if (err < 0)
return err;
- new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
+ if (opt_len < 0)
+ return opt_len;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
if (!new_state)
return -ENOMEM;
@@ -242,6 +459,12 @@ static int ip_tun_build_state(struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
+ err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack);
+ if (err < 0) {
+ lwtstate_free(new_state);
+ return err;
+ }
+
#ifdef CONFIG_DST_CACHE
err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
if (err) {
@@ -266,10 +489,12 @@ static int ip_tun_build_state(struct nlattr *attr,
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
if (tb[LWTUNNEL_IP_FLAGS])
- tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]);
+ tun_info->key.tun_flags |=
+ (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) &
+ ~TUNNEL_OPTIONS_PRESENT);
tun_info->mode = IP_TUNNEL_INFO_TX;
- tun_info->options_len = 0;
+ tun_info->options_len = opt_len;
*ts = new_state;
@@ -285,6 +510,114 @@ static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
#endif
}
+static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
+{
+ struct geneve_opt *opt;
+ struct nlattr *nest;
+ int offset = 0;
+
+ nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
+ if (!nest)
+ return -ENOMEM;
+
+ while (tun_info->options_len > offset) {
+ opt = ip_tunnel_info_opts(tun_info) + offset;
+ if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
+ nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
+ opt->opt_data)) {
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+ }
+ offset += sizeof(*opt) + opt->length * 4;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
+static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
+{
+ struct vxlan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
+ if (!nest)
+ return -ENOMEM;
+
+ md = ip_tunnel_info_opts(tun_info);
+ if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
+static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
+{
+ struct erspan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
+ if (!nest)
+ return -ENOMEM;
+
+ md = ip_tunnel_info_opts(tun_info);
+ if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
+ goto err;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index))
+ goto err;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) ||
+ nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto err;
+
+ nla_nest_end(skb, nest);
+ return 0;
+err:
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+}
+
+static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
+ struct ip_tunnel_info *tun_info)
+{
+ struct nlattr *nest;
+ int err = 0;
+
+ if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
+ return 0;
+
+ nest = nla_nest_start_noflag(skb, type);
+ if (!nest)
+ return -ENOMEM;
+
+ if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT)
+ err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
+ else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT)
+ err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
+ else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)
+ err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
+
+ if (err) {
+ nla_nest_cancel(skb, nest);
+ return err;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
static int ip_tun_fill_encap_info(struct sk_buff *skb,
struct lwtunnel_state *lwtstate)
{
@@ -296,12 +629,52 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
- nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
+ nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) ||
+ ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
return -ENOMEM;
return 0;
}
+static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
+{
+ int opt_len;
+
+ if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
+ return 0;
+
+ opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */
+ if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+ struct geneve_opt *opt;
+ int offset = 0;
+
+ opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */
+ while (info->options_len > offset) {
+ opt = ip_tunnel_info_opts(info) + offset;
+ opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */
+ + nla_total_size(1) /* OPT_GENEVE_TYPE */
+ + nla_total_size(opt->length * 4);
+ /* OPT_GENEVE_DATA */
+ offset += sizeof(*opt) + opt->length * 4;
+ }
+ } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+ opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */
+ + nla_total_size(4); /* OPT_VXLAN_GBP */
+ } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+ struct erspan_metadata *md = ip_tunnel_info_opts(info);
+
+ opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */
+ + nla_total_size(1) /* OPT_ERSPAN_VER */
+ + (md->version == 1 ? nla_total_size(4)
+ /* OPT_ERSPAN_INDEX (v1) */
+ : nla_total_size(1) +
+ nla_total_size(1));
+ /* OPT_ERSPAN_DIR + HWID (v2) */
+ }
+
+ return opt_len;
+}
+
static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */
@@ -309,13 +682,21 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+ nla_total_size(4) /* LWTUNNEL_IP_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP_TOS */
+ nla_total_size(1) /* LWTUNNEL_IP_TTL */
- + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */
+ + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */
+ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+ /* LWTUNNEL_IP_OPTS */
}
static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
- return memcmp(lwt_tun_info(a), lwt_tun_info(b),
- sizeof(struct ip_tunnel_info));
+ struct ip_tunnel_info *info_a = lwt_tun_info(a);
+ struct ip_tunnel_info *info_b = lwt_tun_info(b);
+
+ return memcmp(info_a, info_b, sizeof(info_a->key)) ||
+ info_a->mode != info_b->mode ||
+ info_a->options_len != info_b->options_len ||
+ memcmp(ip_tunnel_info_opts(info_a),
+ ip_tunnel_info_opts(info_b), info_a->options_len);
}
static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
@@ -328,12 +709,14 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
};
static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
+ [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS },
[LWTUNNEL_IP6_ID] = { .type = NLA_U64 },
[LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) },
[LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) },
[LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 },
[LWTUNNEL_IP6_TC] = { .type = NLA_U8 },
[LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED },
};
static int ip6_tun_build_state(struct nlattr *attr,
@@ -341,17 +724,21 @@ static int ip6_tun_build_state(struct nlattr *attr,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_info *tun_info;
- struct lwtunnel_state *new_state;
struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
- int err;
+ struct lwtunnel_state *new_state;
+ struct ip_tunnel_info *tun_info;
+ int err, opt_len;
err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
ip6_tun_policy, extack);
if (err < 0)
return err;
- new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
+ if (opt_len < 0)
+ return opt_len;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
if (!new_state)
return -ENOMEM;
@@ -359,6 +746,12 @@ static int ip6_tun_build_state(struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
+ err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack);
+ if (err < 0) {
+ lwtstate_free(new_state);
+ return err;
+ }
+
if (tb[LWTUNNEL_IP6_ID])
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
@@ -375,10 +768,12 @@ static int ip6_tun_build_state(struct nlattr *attr,
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
if (tb[LWTUNNEL_IP6_FLAGS])
- tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
+ tun_info->key.tun_flags |=
+ (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) &
+ ~TUNNEL_OPTIONS_PRESENT);
tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
- tun_info->options_len = 0;
+ tun_info->options_len = opt_len;
*ts = new_state;
@@ -396,7 +791,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
- nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
+ nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) ||
+ ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
return -ENOMEM;
return 0;
@@ -409,7 +805,9 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+ nla_total_size(16) /* LWTUNNEL_IP6_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */
+ nla_total_size(1) /* LWTUNNEL_IP6_TC */
- + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */
+ + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */
+ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+ /* LWTUNNEL_IP6_OPTS */
}
static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
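
The LWT option support added above parses the same netlink attributes twice: ip_tun_get_optlen() is a dry run with a NULL destination that only sizes the geneve/vxlan/erspan options, the lwtunnel state is then allocated with that much trailing room, and ip_tun_set_opts() fills the trailing area on the second pass. A sketch of that two-pass parse into a flexible array member, using an illustrative type/length/value format rather than netlink attributes:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct tun_state {
	size_t options_len;
	unsigned char options[];	/* trailing, variable-length */
};

/* Each input option is "type, len, payload...".  Returns total payload
 * bytes needed (or written when dst != NULL), or -1 on malformed input.
 */
static int parse_opts(const unsigned char *in, size_t in_len,
		      unsigned char *dst)
{
	size_t off = 0, out = 0;

	while (off + 2 <= in_len) {
		unsigned char len = in[off + 1];

		if (off + 2 + len > in_len)
			return -1;
		if (dst)
			memcpy(dst + out, &in[off + 2], len);
		out += len;
		off += 2 + len;
	}
	return off == in_len ? (int)out : -1;
}

int main(void)
{
	const unsigned char raw[] = { 1, 2, 0xaa, 0xbb, 2, 1, 0xcc };
	int opt_len = parse_opts(raw, sizeof(raw), NULL);	/* pass 1 */

	if (opt_len < 0)
		return 1;

	struct tun_state *st = malloc(sizeof(*st) + opt_len);
	if (!st)
		return 1;
	st->options_len = (size_t)opt_len;
	parse_opts(raw, sizeof(raw), st->options);		/* pass 2 */

	printf("parsed %zu option bytes\n", st->options_len);
	free(st);
	return 0;
}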
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index cfb025606793..37cddd18f282 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -187,8 +187,17 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
int mtu;
if (!dst) {
- dev->stats.tx_carrier_errors++;
- goto tx_error_icmp;
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
}
dst_hold(dst);
@@ -214,7 +223,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
mtu = dst_mtu(dst);
if (skb->len > mtu) {
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IP)) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
@@ -580,8 +589,8 @@ static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
[IFLA_VTI_LINK] = { .type = NLA_U32 },
[IFLA_VTI_IKEY] = { .type = NLA_U32 },
[IFLA_VTI_OKEY] = { .type = NLA_U32 },
- [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
- [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_VTI_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) },
+ [IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
[IFLA_VTI_FWMARK] = { .type = NLA_U32 },
};
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 9bcca08efec9..4438f6b12335 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1334,7 +1334,7 @@ static int __init ipconfig_proc_net_init(void)
/* Create a new file under /proc/net/ipconfig */
static int ipconfig_proc_net_create(const char *name,
- const struct file_operations *fops)
+ const struct proc_ops *proc_ops)
{
char *pname;
struct proc_dir_entry *p;
@@ -1346,7 +1346,7 @@ static int ipconfig_proc_net_create(const char *name,
if (!pname)
return -ENOMEM;
- p = proc_create(pname, 0444, init_net.proc_net, fops);
+ p = proc_create(pname, 0444, init_net.proc_net, proc_ops);
kfree(pname);
if (!p)
return -ENOMEM;
@@ -1355,7 +1355,7 @@ static int ipconfig_proc_net_create(const char *name,
}
/* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */
-static int ntp_servers_seq_show(struct seq_file *seq, void *v)
+static int ntp_servers_show(struct seq_file *seq, void *v)
{
int i;
@@ -1365,7 +1365,7 @@ static int ntp_servers_seq_show(struct seq_file *seq, void *v)
}
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(ntp_servers_seq);
+DEFINE_PROC_SHOW_ATTRIBUTE(ntp_servers);
#endif /* CONFIG_PROC_FS */
/*
@@ -1412,6 +1412,9 @@ static int __init wait_for_devices(void)
struct net_device *dev;
int found = 0;
+ /* make sure deferred device probes are finished */
+ wait_for_device_probe();
+
rtnl_lock();
for_each_netdev(&init_net, dev) {
if (ic_is_init_dev(dev)) {
@@ -1453,7 +1456,7 @@ static int __init ip_auto_config(void)
proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show);
if (ipconfig_proc_net_init() == 0)
- ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops);
+ ipconfig_proc_net_create("ntp_servers", &ntp_servers_proc_ops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
@@ -1483,10 +1486,10 @@ static int __init ip_auto_config(void)
* missing values.
*/
if (ic_myaddr == NONE ||
-#ifdef CONFIG_ROOT_NFS
+#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT)
(root_server_addr == NONE &&
ic_servaddr == NONE &&
- ROOT_DEV == Root_NFS) ||
+ (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) ||
#endif
ic_first_dev->next) {
#ifdef IPCONFIG_DYNAMIC
@@ -1513,6 +1516,12 @@ static int __init ip_auto_config(void)
goto try_try_again;
}
#endif
+#ifdef CONFIG_CIFS_ROOT
+ if (ROOT_DEV == Root_CIFS) {
+ pr_err("IP-Config: Retrying forever (CIFS root)...\n");
+ goto try_try_again;
+ }
+#endif
if (--retries) {
pr_err("IP-Config: Reopening network devices...\n");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c07bc82cbbe9..6e68def66822 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -278,9 +278,10 @@ static void __net_exit ipmr_rules_exit(struct net *net)
rtnl_unlock();
}
-static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
+ return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack);
}
static unsigned int ipmr_rules_seq_read(struct net *net)
@@ -336,7 +337,8 @@ static void __net_exit ipmr_rules_exit(struct net *net)
rtnl_unlock();
}
-static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
@@ -1134,8 +1136,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
if (!found) {
/* Create a new entry if allowable */
- if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
- (c = ipmr_cache_alloc_unres()) == NULL) {
+ c = ipmr_cache_alloc_unres();
+ if (!c) {
spin_unlock_bh(&mfc_unres_lock);
kfree_skb(skb);
@@ -1794,7 +1796,7 @@ static void ip_encap(struct net *net, struct sk_buff *skb,
ip_send_check(iph);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- nf_reset(skb);
+ nf_reset_ct(skb);
}
static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
@@ -2140,7 +2142,7 @@ int ip_mr_input(struct sk_buff *skb)
mroute_sk = rcu_dereference(mrt->mroute_sk);
if (mroute_sk) {
- nf_reset(skb);
+ nf_reset_ct(skb);
raw_rcv(mroute_sk, skb);
return 0;
}
@@ -2289,7 +2291,8 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
rcu_read_unlock();
return -ENODEV;
}
- skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr));
if (!skb2) {
read_unlock(&mrt_lock);
rcu_read_unlock();
@@ -3040,10 +3043,11 @@ static unsigned int ipmr_seq_read(struct net *net)
return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
}
-static int ipmr_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
- ipmr_mr_table_iter, &mrt_lock);
+ ipmr_mr_table_iter, &mrt_lock, extack);
}
static const struct fib_notifier_ops ipmr_notifier_ops_template = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index ea48bd15a575..aa8738a91210 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -386,15 +386,17 @@ EXPORT_SYMBOL(mr_rtm_dumproute);
int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
int (*rules_dump)(struct net *net,
- struct notifier_block *nb),
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack),
struct mr_table *(*mr_iter)(struct net *net,
struct mr_table *mrt),
- rwlock_t *mrt_lock)
+ rwlock_t *mrt_lock,
+ struct netlink_ext_ack *extack)
{
struct mr_table *mrt;
int err;
- err = rules_dump(net, nb);
+ err = rules_dump(net, nb, extack);
if (err)
return err;
@@ -409,17 +411,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
if (!v->dev)
continue;
- mr_call_vif_notifier(nb, net, family,
- FIB_EVENT_VIF_ADD,
- v, vifi, mrt->id);
+ err = mr_call_vif_notifier(nb, family,
+ FIB_EVENT_VIF_ADD,
+ v, vifi, mrt->id, extack);
+ if (err)
+ break;
}
read_unlock(mrt_lock);
+ if (err)
+ return err;
+
/* Notify on table MFC entries */
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
- mr_call_mfc_notifier(nb, net, family,
- FIB_EVENT_ENTRY_ADD,
- mfc, mrt->id);
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ err = mr_call_mfc_notifier(nb, family,
+ FIB_EVENT_ENTRY_ADD,
+ mfc, mrt->id, extack);
+ if (err)
+ return err;
+ }
}
return 0;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 69e76d677f9e..f17b402111ce 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -272,7 +272,7 @@ config IP_NF_TARGET_CLUSTERIP
The CLUSTERIP target allows you to build load-balancing clusters of
network servers without having a dedicated load-balancing
router/server/switch.
-
+
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_ECN
@@ -281,7 +281,7 @@ config IP_NF_TARGET_ECN
depends on NETFILTER_ADVANCED
---help---
This option adds a `ECN' target, which can be used in the iptables mangle
- table.
+ table.
You can use this target to remove the ECN bits from the IPv4 header of
an IP packet. This is particularly useful, if you need to work around
@@ -306,7 +306,7 @@ config IP_NF_RAW
This option adds a `raw' table to iptables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
and OUTPUT chains.
-
+
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
@@ -318,7 +318,7 @@ config IP_NF_SECURITY
help
This option adds a `security' table to iptables, for use
with Mandatory Access Control (MAC) policy.
-
+
If unsure, say N.
endif # IP_NF_IPTABLES
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index c50e0ec095d2..7c497c78105f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
# flow table support
obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
-# generic IP tables
+# generic IP tables
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
# the three instances of ip_tables
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 214154b47d56..f1f78a742b36 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -384,10 +384,11 @@ next: ;
return 1;
}
-static inline int check_target(struct arpt_entry *e, const char *name)
+static int check_target(struct arpt_entry *e, struct net *net, const char *name)
{
struct xt_entry_target *t = arpt_get_target(e);
struct xt_tgchk_param par = {
+ .net = net,
.table = name,
.entryinfo = e,
.target = t->u.kernel.target,
@@ -399,8 +400,9 @@ static inline int check_target(struct arpt_entry *e, const char *name)
return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
}
-static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+static int
+find_check_entry(struct arpt_entry *e, struct net *net, const char *name,
+ unsigned int size,
struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
@@ -419,7 +421,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
}
t->u.kernel.target = target;
- ret = check_target(e, name);
+ ret = check_target(e, net, name);
if (ret)
goto err;
return 0;
@@ -494,12 +496,13 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
return 0;
}
-static inline void cleanup_entry(struct arpt_entry *e)
+static void cleanup_entry(struct arpt_entry *e, struct net *net)
{
struct xt_tgdtor_param par;
struct xt_entry_target *t;
t = arpt_get_target(e);
+ par.net = net;
par.target = t->u.kernel.target;
par.targinfo = t->data;
par.family = NFPROTO_ARP;
@@ -512,7 +515,9 @@ static inline void cleanup_entry(struct arpt_entry *e)
/* Checks and translates the user-supplied table segment (held in
* newinfo).
*/
-static int translate_table(struct xt_table_info *newinfo, void *entry0,
+static int translate_table(struct net *net,
+ struct xt_table_info *newinfo,
+ void *entry0,
const struct arpt_replace *repl)
{
struct xt_percpu_counter_alloc_state alloc_state = { 0 };
@@ -569,7 +574,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, repl->name, repl->size,
+ ret = find_check_entry(iter, net, repl->name, repl->size,
&alloc_state);
if (ret != 0)
break;
@@ -580,7 +585,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
xt_entry_foreach(iter, entry0, newinfo->size) {
if (i-- == 0)
break;
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
}
return ret;
}
@@ -923,7 +928,7 @@ static int __do_replace(struct net *net, const char *name,
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries;
xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
@@ -974,7 +979,7 @@ static int do_replace(struct net *net, const void __user *user,
goto free_newinfo;
}
- ret = translate_table(newinfo, loc_cpu_entry, &tmp);
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
@@ -986,7 +991,7 @@ static int do_replace(struct net *net, const void __user *user,
free_newinfo_untrans:
xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
@@ -1149,7 +1154,8 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
}
}
-static int translate_compat_table(struct xt_table_info **pinfo,
+static int translate_compat_table(struct net *net,
+ struct xt_table_info **pinfo,
void **pentry0,
const struct compat_arpt_replace *compatr)
{
@@ -1217,7 +1223,7 @@ static int translate_compat_table(struct xt_table_info **pinfo,
repl.num_counters = 0;
repl.counters = NULL;
repl.size = newinfo->size;
- ret = translate_table(newinfo, entry1, &repl);
+ ret = translate_table(net, newinfo, entry1, &repl);
if (ret)
goto free_newinfo;
@@ -1270,7 +1276,7 @@ static int compat_do_replace(struct net *net, void __user *user,
goto free_newinfo;
}
- ret = translate_compat_table(&newinfo, &loc_cpu_entry, &tmp);
+ ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
@@ -1282,7 +1288,7 @@ static int compat_do_replace(struct net *net, void __user *user,
free_newinfo_untrans:
xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
@@ -1509,7 +1515,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
return ret;
}
-static void __arpt_unregister_table(struct xt_table *table)
+static void __arpt_unregister_table(struct net *net, struct xt_table *table)
{
struct xt_table_info *private;
void *loc_cpu_entry;
@@ -1521,7 +1527,7 @@ static void __arpt_unregister_table(struct xt_table *table)
/* Decrease module usage counts and free resources */
loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
if (private->number > private->initial_entries)
module_put(table_owner);
xt_free_table_info(private);
@@ -1546,7 +1552,7 @@ int arpt_register_table(struct net *net,
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
- ret = translate_table(newinfo, loc_cpu_entry, repl);
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
if (ret != 0)
goto out_free;
@@ -1561,7 +1567,7 @@ int arpt_register_table(struct net *net,
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
if (ret != 0) {
- __arpt_unregister_table(new_table);
+ __arpt_unregister_table(net, new_table);
*res = NULL;
}
@@ -1576,7 +1582,7 @@ void arpt_unregister_table(struct net *net, struct xt_table *table,
const struct nf_hook_ops *ops)
{
nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
- __arpt_unregister_table(table);
+ __arpt_unregister_table(net, table);
}
/* The built-in targets: standard (NULL) and error. */
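[Annotation] The arp_tables hunks above thread the network namespace through entry teardown so that target destructors can see it via xt_tgdtor_param. A minimal sketch of what the resulting helper looks like after this change (illustrative, not a verbatim copy of the tree):

        static void cleanup_entry_sketch(struct arpt_entry *e, struct net *net)
        {
                struct xt_entry_target *t = arpt_get_target(e);
                struct xt_tgdtor_param par = {
                        .net      = net,        /* now visible to ->destroy() */
                        .target   = t->u.kernel.target,
                        .targinfo = t->data,
                        .family   = NFPROTO_ARP,
                };

                if (par.target->destroy)
                        par.target->destroy(&par);
                module_put(par.target->me);
        }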
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 6bdb1ab8af61..f8755a4ae9d4 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -58,7 +58,7 @@ struct clusterip_config {
};
#ifdef CONFIG_PROC_FS
-static const struct file_operations clusterip_proc_fops;
+static const struct proc_ops clusterip_proc_ops;
#endif
struct clusterip_net {
@@ -280,7 +280,7 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
mutex_lock(&cn->mutex);
c->pde = proc_create_data(buffer, 0600,
cn->procdir,
- &clusterip_proc_fops, c);
+ &clusterip_proc_ops, c);
mutex_unlock(&cn->mutex);
if (!c->pde) {
err = -ENOMEM;
@@ -804,12 +804,12 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
return size;
}
-static const struct file_operations clusterip_proc_fops = {
- .open = clusterip_proc_open,
- .read = seq_read,
- .write = clusterip_proc_write,
- .llseek = seq_lseek,
- .release = clusterip_proc_release,
+static const struct proc_ops clusterip_proc_ops = {
+ .proc_open = clusterip_proc_open,
+ .proc_read = seq_read,
+ .proc_write = clusterip_proc_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = clusterip_proc_release,
};
#endif /* CONFIG_PROC_FS */
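[Annotation] The file_operations -> proc_ops conversion above is the generic pattern for any seq_file-backed /proc entry; a minimal self-contained sketch, with a hypothetical show callback (single_open()/single_release() are the stock seq_file helpers):

        static int example_show(struct seq_file *m, void *v)
        {
                seq_puts(m, "example\n");
                return 0;
        }

        static int example_open(struct inode *inode, struct file *file)
        {
                return single_open(file, example_show, NULL);
        }

        static const struct proc_ops example_proc_ops = {
                .proc_open    = example_open,
                .proc_read    = seq_read,
                .proc_lseek   = seq_lseek,
                .proc_release = single_release,
        };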
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 0e70f3f65f6f..748dc3ce58d3 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -36,8 +36,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
opts.options |= XT_SYNPROXY_OPT_ECN;
opts.options &= info->options;
- opts.mss_encode = opts.mss;
- opts.mss = info->mss;
+ opts.mss_encode = opts.mss_option;
+ opts.mss_option = info->mss;
if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_init_timestamp_cookie(info, &opts);
else
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index af3fbf76dbd3..6cc5743c553a 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -65,7 +65,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Avoid counting cloned packets towards the original connection. */
- nf_reset(skb);
+ nf_reset_ct(skb);
nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
#endif
/*
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 012c4047c788..e32e41b99f0f 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -9,6 +9,8 @@
static struct nf_flowtable_type flowtable_ipv4 = {
.family = NFPROTO_IPV4,
.init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv4,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ip_hook,
.owner = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index 36a28d46149c..c94445b44d8c 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -31,16 +31,8 @@ extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
if (icmph == NULL)
return 1;
- switch (icmph->type) {
- case ICMP_DEST_UNREACH:
- case ICMP_SOURCE_QUENCH:
- case ICMP_REDIRECT:
- case ICMP_TIME_EXCEEDED:
- case ICMP_PARAMETERPROB:
- break;
- default:
+ if (!icmp_is_err(icmph->type))
return 1;
- }
inside_iph = skb_header_pointer(skb, outside_hdrlen +
sizeof(struct icmphdr),
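[Annotation] The open-coded switch is replaced by the icmp_is_err() helper; the helper is presumably equivalent to the removed switch, roughly (sketch, the exact definition lives in include/linux/icmp.h):

        static inline bool icmp_is_err_sketch(int type)
        {
                switch (type) {
                case ICMP_DEST_UNREACH:
                case ICMP_SOURCE_QUENCH:
                case ICMP_REDIRECT:
                case ICMP_TIME_EXCEEDED:
                case ICMP_PARAMETERPROB:
                        return true;
                }
                return false;
        }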
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 5fe5a3981d43..d072c326dd64 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -23,7 +23,6 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
- [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 },
[NHA_ID] = { .type = NLA_U32 },
[NHA_GROUP] = { .type = NLA_BINARY },
[NHA_GROUP_TYPE] = { .type = NLA_U16 },
@@ -322,7 +321,9 @@ static size_t nh_nlmsg_size_single(struct nexthop *nh)
static size_t nh_nlmsg_size(struct nexthop *nh)
{
- size_t sz = nla_total_size(4); /* NHA_ID */
+ size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));
+
+ sz += nla_total_size(4); /* NHA_ID */
if (nh->is_group)
sz += nh_nlmsg_size_grp(nh);
@@ -1151,7 +1152,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
.fc_encap_type = cfg->nh_encap_type,
};
u32 tb_id = l3mdev_fib_table(cfg->dev);
- int err = -EINVAL;
+ int err;
err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
if (err) {
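[Annotation] The nh_nlmsg_size() fix accounts for the nhmsg family header in addition to the attributes. The general netlink sizing pattern it follows, as a minimal sketch (attribute list illustrative only):

        static size_t example_nlmsg_size(void)
        {
                return NLMSG_ALIGN(sizeof(struct nhmsg)) +  /* family header */
                       nla_total_size(4) +                  /* NHA_ID (u32) */
                       nla_total_size(2);                   /* NHA_GROUP_TYPE (u16) */
        }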
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 9d24ef5c5d8f..535427292194 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -781,7 +781,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
} else if (!ipc.oif)
ipc.oif = inet->uc_index;
- flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
sk->sk_uid);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index cc90243ccf76..2580303249e2 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -289,6 +289,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
+ SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH),
+ SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 40a6abbc9cf6..3183413ebc6c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -332,7 +332,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
kfree_skb(skb);
return NET_RX_DROP;
}
- nf_reset(skb);
+ nf_reset_ct(skb);
skb_push(skb, skb->data - skb_network_header(skb));
@@ -375,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb_reserve(skb, hlen);
skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = sockc->mark;
skb->tstamp = sockc->transmit_time;
skb_dst_set(skb, &rt->dst);
*rtp = NULL;
@@ -623,7 +623,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
}
- flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE,
hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
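[Annotation] Both the ping and raw hunks switch the per-packet mark from sk->sk_mark to ipc.sockc.mark, so a SO_MARK control message can override the socket mark for a single sendmsg(). A userspace sketch of such a caller (assumes the usual CAP_NET_ADMIN/CAP_NET_RAW requirement for SO_MARK ancillary data):

        #include <sys/socket.h>
        #include <sys/uio.h>
        #include <string.h>

        static ssize_t send_with_mark(int fd, const void *buf, size_t len,
                                      const struct sockaddr *dst, socklen_t dlen,
                                      unsigned int mark)
        {
                char cbuf[CMSG_SPACE(sizeof(mark))];
                struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
                struct msghdr msg = {
                        .msg_name = (void *)dst, .msg_namelen = dlen,
                        .msg_iov = &iov, .msg_iovlen = 1,
                        .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
                };
                struct cmsghdr *cmsg;

                memset(cbuf, 0, sizeof(cbuf));
                cmsg = CMSG_FIRSTHDR(&msg);
                cmsg->cmsg_level = SOL_SOCKET;
                cmsg->cmsg_type  = SO_MARK;   /* privileged: per-packet fwmark */
                cmsg->cmsg_len   = CMSG_LEN(sizeof(mark));
                memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));

                return sendmsg(fd, &msg, 0);
        }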
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 517300d587a7..ebe7060d0fc9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -139,7 +139,8 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu);
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);
@@ -236,11 +237,11 @@ static int rt_cache_seq_open(struct inode *inode, struct file *file)
return seq_open(file, &rt_cache_seq_ops);
}
-static const struct file_operations rt_cache_seq_fops = {
- .open = rt_cache_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops rt_cache_proc_ops = {
+ .proc_open = rt_cache_seq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
@@ -270,6 +271,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu+1;
return &per_cpu(rt_cache_stat, cpu);
}
+ (*pos)++;
return NULL;
}
@@ -326,11 +328,11 @@ static int rt_cpu_seq_open(struct inode *inode, struct file *file)
return seq_open(file, &rt_cpu_seq_ops);
}
-static const struct file_operations rt_cpu_seq_fops = {
- .open = rt_cpu_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops rt_cpu_proc_ops = {
+ .proc_open = rt_cpu_seq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -364,12 +366,12 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
struct proc_dir_entry *pde;
pde = proc_create("rt_cache", 0444, net->proc_net,
- &rt_cache_seq_fops);
+ &rt_cache_proc_ops);
if (!pde)
goto err1;
pde = proc_create("rt_cache", 0444,
- net->proc_net_stat, &rt_cpu_seq_fops);
+ net->proc_net_stat, &rt_cpu_proc_ops);
if (!pde)
goto err2;
@@ -635,6 +637,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
if (fnhe->fnhe_gw) {
rt->rt_flags |= RTCF_REDIRECTED;
+ rt->rt_uses_gateway = 1;
rt->rt_gw_family = AF_INET;
rt->rt_gw4 = fnhe->fnhe_gw;
}
@@ -915,16 +918,15 @@ void ip_rt_send_redirect(struct sk_buff *skb)
if (peer->rate_tokens == 0 ||
time_after(jiffies,
(peer->rate_last +
- (ip_rt_redirect_load << peer->rate_tokens)))) {
+ (ip_rt_redirect_load << peer->n_redirects)))) {
__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
peer->rate_last = jiffies;
- ++peer->rate_tokens;
++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- peer->rate_tokens == ip_rt_redirect_number)
+ peer->n_redirects == ip_rt_redirect_number)
net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
&ip_hdr(skb)->saddr, inet_iif(skb),
&ip_hdr(skb)->daddr, &gw);
@@ -1043,7 +1045,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
}
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
struct rtable *rt = (struct rtable *) dst;
struct flowi4 fl4;
@@ -1313,7 +1316,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
mtu = READ_ONCE(dst->dev->mtu);
if (unlikely(ip_mtu_locked(dst))) {
- if (rt->rt_gw_family && mtu > 576)
+ if (rt->rt_uses_gateway && mtu > 576)
mtu = 576;
}
@@ -1482,7 +1485,7 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
prev = cmpxchg(p, orig, rt);
if (prev == orig) {
if (orig) {
- dst_dev_put(&orig->dst);
+ rt_add_uncached_list(orig);
dst_release(&orig->dst);
}
} else {
@@ -1569,6 +1572,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
struct fib_nh_common *nhc = FIB_RES_NHC(*res);
if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
+ rt->rt_uses_gateway = 1;
rt->rt_gw_family = nhc->nhc_gw_family;
/* only INET and INET6 are supported */
if (likely(nhc->nhc_gw_family == AF_INET))
@@ -1634,6 +1638,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
rt->rt_iif = 0;
rt->rt_pmtu = 0;
rt->rt_mtu_locked = 0;
+ rt->rt_uses_gateway = 0;
rt->rt_gw_family = 0;
rt->rt_gw4 = 0;
INIT_LIST_HEAD(&rt->rt_uncached);
@@ -1892,10 +1897,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb,
if (!icmph)
goto out;
- if (icmph->type != ICMP_DEST_UNREACH &&
- icmph->type != ICMP_REDIRECT &&
- icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB)
+ if (!icmp_is_err(icmph->type))
goto out;
inner_iph = skb_header_pointer(skb,
@@ -2020,10 +2022,52 @@ static int ip_mkroute_input(struct sk_buff *skb,
return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
+/* Implements all the saddr-related checks as ip_route_input_slow(),
+ * assuming daddr is valid and the destination is not a local broadcast one.
+ * Uses the provided hint instead of performing a route lookup.
+ */
+int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ const struct sk_buff *hint)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct rtable *rt = (struct rtable *)hint;
+ struct net *net = dev_net(dev);
+ int err = -EINVAL;
+ u32 tag = 0;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+ goto martian_source;
+
+ if (ipv4_is_zeronet(saddr))
+ goto martian_source;
+
+ if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_source;
+
+ if (rt->rt_type != RTN_LOCAL)
+ goto skip_validate_source;
+
+ tos &= IPTOS_RT_MASK;
+ err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
+ if (err < 0)
+ goto martian_source;
+
+skip_validate_source:
+ skb_dst_copy(skb, hint);
+ return 0;
+
+martian_source:
+ ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+ return err;
+}
+
/*
* NOTE. We drop all the packets that has local source
* addresses, because every properly looped back packet
* must have correct destination already attached by output routine.
+ * Changes in the enforced policies must be applied also to
+ * ip_route_use_hint().
*
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
@@ -2468,14 +2512,17 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
int orig_oif = fl4->flowi4_oif;
unsigned int flags = 0;
struct rtable *rth;
- int err = -ENETUNREACH;
+ int err;
if (fl4->saddr) {
- rth = ERR_PTR(-EINVAL);
if (ipv4_is_multicast(fl4->saddr) ||
ipv4_is_lbcast(fl4->saddr) ||
- ipv4_is_zeronet(fl4->saddr))
+ ipv4_is_zeronet(fl4->saddr)) {
+ rth = ERR_PTR(-EINVAL);
goto out;
+ }
+
+ rth = ERR_PTR(-ENETUNREACH);
/* I removed check for oif == dev_out->oif here.
It was wrong for two reasons:
@@ -2643,7 +2690,8 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
}
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
}
@@ -2694,6 +2742,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_genid = rt_genid_ipv4(net);
rt->rt_flags = ort->rt_flags;
rt->rt_type = ort->rt_type;
+ rt->rt_uses_gateway = ort->rt_uses_gateway;
rt->rt_gw_family = ort->rt_gw_family;
if (rt->rt_gw_family == AF_INET)
rt->rt_gw4 = ort->rt_gw4;
@@ -2728,7 +2777,8 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
struct rtable *rt, u32 table_id, struct flowi4 *fl4,
- struct sk_buff *skb, u32 portid, u32 seq)
+ struct sk_buff *skb, u32 portid, u32 seq,
+ unsigned int flags)
{
struct rtmsg *r;
struct nlmsghdr *nlh;
@@ -2736,7 +2786,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
if (!nlh)
return -EMSGSIZE;
@@ -2777,21 +2827,23 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
goto nla_put_failure;
}
- if (rt->rt_gw_family == AF_INET &&
- nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
- goto nla_put_failure;
- } else if (rt->rt_gw_family == AF_INET6) {
- int alen = sizeof(struct in6_addr);
- struct nlattr *nla;
- struct rtvia *via;
-
- nla = nla_reserve(skb, RTA_VIA, alen + 2);
- if (!nla)
+ if (rt->rt_uses_gateway) {
+ if (rt->rt_gw_family == AF_INET &&
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
goto nla_put_failure;
-
- via = nla_data(nla);
- via->rtvia_family = AF_INET6;
- memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+ } else if (rt->rt_gw_family == AF_INET6) {
+ int alen = sizeof(struct in6_addr);
+ struct nlattr *nla;
+ struct rtvia *via;
+
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
+ goto nla_put_failure;
+
+ via = nla_data(nla);
+ via->rtvia_family = AF_INET6;
+ memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+ }
}
expires = rt->dst.expires;
@@ -2860,7 +2912,7 @@ nla_put_failure:
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, u32 table_id,
struct fnhe_hash_bucket *bucket, int genid,
- int *fa_index, int fa_start)
+ int *fa_index, int fa_start, unsigned int flags)
{
int i;
@@ -2891,7 +2943,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
table_id, NULL, skb,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq);
+ cb->nlh->nlmsg_seq, flags);
if (err)
return err;
next:
@@ -2904,7 +2956,7 @@ next:
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
u32 table_id, struct fib_info *fi,
- int *fa_index, int fa_start)
+ int *fa_index, int fa_start, unsigned int flags)
{
struct net *net = sock_net(cb->skb->sk);
int nhsel, genid = fnhe_genid(net);
@@ -2922,7 +2974,8 @@ int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
err = 0;
if (bucket)
err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
- genid, fa_index, fa_start);
+ genid, fa_index, fa_start,
+ flags);
rcu_read_unlock();
if (err)
return err;
@@ -3171,19 +3224,45 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
skb_reset_mac_header(skb);
if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+ struct fib_rt_info fri;
+
if (!res.fi) {
err = fib_props[res.type].error;
if (!err)
err = -EHOSTUNREACH;
goto errout_rcu;
}
+ fri.fi = res.fi;
+ fri.tb_id = table_id;
+ fri.dst = res.prefix;
+ fri.dst_len = res.prefixlen;
+ fri.tos = fl4.flowi4_tos;
+ fri.type = rt->rt_type;
+ fri.offload = 0;
+ fri.trap = 0;
+ if (res.fa_head) {
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
+ u8 slen = 32 - fri.dst_len;
+
+ if (fa->fa_slen == slen &&
+ fa->tb_id == fri.tb_id &&
+ fa->fa_tos == fri.tos &&
+ fa->fa_info == res.fi &&
+ fa->fa_type == fri.type) {
+ fri.offload = fa->offload;
+ fri.trap = fa->trap;
+ break;
+ }
+ }
+ }
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
- rt->rt_type, res.prefix, res.prefixlen,
- fl4.flowi4_tos, res.fi, 0);
+ nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
} else {
err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
- NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
}
if (err < 0)
goto errout_rcu;
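[Annotation] ip_route_use_hint() is intended for the list-receive path: consecutive packets to the same destination can reuse the previous packet's dst and only revalidate the source address, instead of doing a full fib lookup per packet. A conceptual sketch of a caller (not the actual net/ipv4/ip_input.c code; the real hint-suitability check is stricter):

        static void list_rcv_hint_sketch(struct list_head *head, struct net_device *dev)
        {
                const struct sk_buff *hint = NULL;
                struct sk_buff *skb;

                list_for_each_entry(skb, head, list) {
                        const struct iphdr *iph = ip_hdr(skb);
                        int err = -1;

                        if (hint && ip_hdr(hint)->daddr == iph->daddr)
                                err = ip_route_use_hint(skb, iph->daddr, iph->saddr,
                                                        iph->tos, dev, hint);
                        if (err)        /* hint not usable: full lookup */
                                ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                                     iph->tos, dev);
                        hint = skb;
                }
        }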
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 535b69326f66..9a4f6b16c9bc 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -62,10 +62,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
* Since subsequent timestamps use the normal tcp_time_stamp value, we
* must make sure that the resulting initial timestamp is <= tcp_time_stamp.
*/
-u64 cookie_init_timestamp(struct request_sock *req)
+u64 cookie_init_timestamp(struct request_sock *req, u64 now)
{
struct inet_request_sock *ireq;
- u32 ts, ts_now = tcp_time_stamp_raw();
+ u32 ts, ts_now = tcp_ns_to_ts(now);
u32 options = 0;
ireq = inet_rsk(req);
@@ -349,6 +349,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
treq->snt_synack = 0;
treq->tfo_listener = false;
+
+ if (IS_ENABLED(CONFIG_MPTCP))
+ treq->is_mptcp = 0;
+
if (IS_ENABLED(CONFIG_SMC))
ireq->smc_ok = 0;
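[Annotation] cookie_init_timestamp() now receives the clock value from its caller instead of sampling it itself; tcp_ns_to_ts() is presumably just the nanoseconds-to-timestamp-tick conversion, along the lines of:

        static inline u32 tcp_ns_to_ts_sketch(u64 ns)
        {
                return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);   /* TCP_TS_HZ == 1000 */
        }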
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0b980e841927..9684af02e0a5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -340,6 +340,10 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
user_key[i * 4 + 1],
user_key[i * 4 + 2],
user_key[i * 4 + 3]);
+
+ if (WARN_ON_ONCE(off >= tbl.maxlen - 1))
+ break;
+
if (i + 1 < n_keys)
off += snprintf(tbl.data + off, tbl.maxlen - off, ",");
}
@@ -820,6 +824,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &tcp_min_snd_mss_max,
},
{
+ .procname = "tcp_mtu_probe_floor",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_min_snd_mss_min,
+ .extra2 = &tcp_min_snd_mss_max,
+ },
+ {
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
@@ -1028,7 +1041,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_fib_multipath_hash_policy,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = &two,
},
#endif
{
@@ -1180,6 +1193,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "tcp_no_ssthresh_metrics_save",
+ .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "tcp_moderate_rcvbuf",
.data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 77b485d60b9d..eb2d80519f8e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -271,6 +271,7 @@
#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
+#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -326,7 +327,7 @@ void tcp_enter_memory_pressure(struct sock *sk)
{
unsigned long val;
- if (tcp_memory_pressure)
+ if (READ_ONCE(tcp_memory_pressure))
return;
val = jiffies;
@@ -341,7 +342,7 @@ void tcp_leave_memory_pressure(struct sock *sk)
{
unsigned long val;
- if (!tcp_memory_pressure)
+ if (!READ_ONCE(tcp_memory_pressure))
return;
val = xchg(&tcp_memory_pressure, 0);
if (val)
@@ -443,15 +444,13 @@ void tcp_init_sock(struct sock *sk)
tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;
- sk->sk_state = TCP_CLOSE;
-
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
icsk->icsk_sync_mss = tcp_sync_mss;
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
+ WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
+ WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
sk_sockets_allocated_inc(sk);
sk->sk_route_forced_caps = NETIF_F_GSO;
@@ -477,7 +476,7 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
int target, struct sock *sk)
{
- return (tp->rcv_nxt - tp->copied_seq >= target) ||
+ return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) ||
(sk->sk_prot->stream_memory_read ?
sk->sk_prot->stream_memory_read(sk) : false);
}
@@ -543,10 +542,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
/* Connected or passive Fast Open socket? */
if (state != TCP_SYN_SENT &&
- (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
- if (tp->urg_seq == tp->copied_seq &&
+ if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
!sock_flag(sk, SOCK_URGINLINE) &&
tp->urg_data)
target++;
@@ -584,7 +583,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR;
return mask;
@@ -607,7 +606,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
- answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+ answ = tp->urg_data &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
break;
case SIOCOUTQ:
if (sk->sk_state == TCP_LISTEN)
@@ -616,7 +616,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else
- answ = tp->write_seq - tp->snd_una;
+ answ = READ_ONCE(tp->write_seq) - tp->snd_una;
break;
case SIOCOUTQNSD:
if (sk->sk_state == TCP_LISTEN)
@@ -625,7 +625,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else
- answ = tp->write_seq - tp->snd_nxt;
+ answ = READ_ONCE(tp->write_seq) -
+ READ_ONCE(tp->snd_nxt);
break;
default:
return -ENOIOCTLCMD;
@@ -657,7 +658,7 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb)
tcb->sacked = 0;
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
@@ -690,8 +691,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
-static void tcp_push(struct sock *sk, int flags, int mss_now,
- int nonagle, int size_goal)
+void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -925,7 +926,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
return max(size_goal, mss_now);
}
-static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
int mss_now;
@@ -935,6 +936,22 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
+/* In some cases, both sendpage() and sendmsg() could have added
+ * an skb to the write queue, but failed adding payload on it.
+ * We need to remove it to consume less memory, but more
+ * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
+ * users.
+ */
+static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb && !skb->len) {
+ tcp_unlink_write_queue(skb, sk);
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ sk_wmem_free_skb(sk, skb);
+ }
+}
+
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
@@ -1016,10 +1033,10 @@ new_segment:
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
- sk->sk_wmem_queued += copy;
+ sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
skb->ip_summed = CHECKSUM_PARTIAL;
- tp->write_seq += copy;
+ WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0);
@@ -1064,12 +1081,12 @@ out:
return copied;
do_error:
+ tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
if (copied)
goto out;
out_err:
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
- err == -EAGAIN)) {
+ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
@@ -1165,7 +1182,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
- bool process_backlog = false;
+ int process_backlog = 0;
bool zc = false;
long timeo;
@@ -1257,9 +1274,10 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- if (process_backlog && sk_flush_backlog(sk)) {
- process_backlog = false;
- goto restart;
+ if (unlikely(process_backlog >= 16)) {
+ process_backlog = 0;
+ if (sk_flush_backlog(sk))
+ goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
@@ -1267,7 +1285,7 @@ new_segment:
if (!skb)
goto wait_for_memory;
- process_backlog = true;
+ process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb);
@@ -1344,7 +1362,7 @@ new_segment:
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
- tp->write_seq += copy;
+ WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0);
@@ -1388,26 +1406,18 @@ out_nopush:
sock_zerocopy_put(uarg);
return copied + copied_syn;
+do_error:
+ skb = tcp_write_queue_tail(sk);
do_fault:
- if (!skb->len) {
- tcp_unlink_write_queue(skb, sk);
- /* It is the one place in all of TCP, except connection
- * reset, where we can be unlinking the send_head.
- */
- if (tcp_write_queue_empty(sk))
- tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
- sk_wmem_free_skb(sk, skb);
- }
+ tcp_remove_empty_skb(sk, skb);
-do_error:
if (copied + copied_syn)
goto out;
out_err:
sock_zerocopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
- err == -EAGAIN)) {
+ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
@@ -1657,9 +1667,9 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_eat_skb(sk, skb);
if (!desc->count)
break;
- tp->copied_seq = seq;
+ WRITE_ONCE(tp->copied_seq, seq);
}
- tp->copied_seq = seq;
+ WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
@@ -1688,7 +1698,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
else
cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
val = min(val, cap);
- sk->sk_rcvlowat = val ? : 1;
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
/* Check if we need to signal EPOLLIN right now */
tcp_data_ready(sk);
@@ -1698,7 +1708,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
val <<= 1;
if (val > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = val;
+ WRITE_ONCE(sk->sk_rcvbuf, val);
tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
}
return 0;
@@ -1728,8 +1738,8 @@ static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc)
{
unsigned long address = (unsigned long)zc->address;
+ u32 length = 0, seq, offset, zap_len;
const skb_frag_t *frags = NULL;
- u32 length = 0, seq, offset;
struct vm_area_struct *vma;
struct sk_buff *skb = NULL;
struct tcp_sock *tp;
@@ -1756,17 +1766,19 @@ static int tcp_zerocopy_receive(struct sock *sk,
seq = tp->copied_seq;
inq = tcp_inq(sk);
zc->length = min_t(u32, zc->length, inq);
- zc->length &= ~(PAGE_SIZE - 1);
- if (zc->length) {
- zap_page_range(vma, address, zc->length);
+ zap_len = zc->length & ~(PAGE_SIZE - 1);
+ if (zap_len) {
+ zap_page_range(vma, address, zap_len);
zc->recv_skip_hint = 0;
} else {
- zc->recv_skip_hint = inq;
+ zc->recv_skip_hint = zc->length;
}
ret = 0;
while (length + PAGE_SIZE <= zc->length) {
if (zc->recv_skip_hint < PAGE_SIZE) {
if (skb) {
+ if (zc->recv_skip_hint > 0)
+ break;
skb = skb->next;
offset = seq - TCP_SKB_CB(skb)->seq;
} else {
@@ -1779,18 +1791,18 @@ static int tcp_zerocopy_receive(struct sock *sk,
break;
frags = skb_shinfo(skb)->frags;
while (offset) {
- if (frags->size > offset)
+ if (skb_frag_size(frags) > offset)
goto out;
- offset -= frags->size;
+ offset -= skb_frag_size(frags);
frags++;
}
}
- if (frags->size != PAGE_SIZE || frags->page_offset) {
+ if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
int remaining = zc->recv_skip_hint;
- while (remaining && (frags->size != PAGE_SIZE ||
- frags->page_offset)) {
- remaining -= frags->size;
+ while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
+ skb_frag_off(frags))) {
+ remaining -= skb_frag_size(frags);
frags++;
}
zc->recv_skip_hint -= remaining;
@@ -1808,7 +1820,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
out:
up_read(&current->mm->mmap_sem);
if (length) {
- tp->copied_seq = seq;
+ WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
@@ -1851,29 +1863,33 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
if (sock_flag(sk, SOCK_RCVTSTAMP)) {
if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
if (new_tstamp) {
- struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec};
-
+ struct __kernel_timespec kts = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
sizeof(kts), &kts);
} else {
- struct timespec ts_old = timespec64_to_timespec(tss->ts[0]);
-
+ struct __kernel_old_timespec ts_old = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
sizeof(ts_old), &ts_old);
}
} else {
if (new_tstamp) {
- struct __kernel_sock_timeval stv;
-
- stv.tv_sec = tss->ts[0].tv_sec;
- stv.tv_usec = tss->ts[0].tv_nsec / 1000;
+ struct __kernel_sock_timeval stv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
sizeof(stv), &stv);
} else {
- struct __kernel_old_timeval tv;
-
- tv.tv_sec = tss->ts[0].tv_sec;
- tv.tv_usec = tss->ts[0].tv_nsec / 1000;
+ struct __kernel_old_timeval tv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
sizeof(tv), &tv);
}
@@ -1945,13 +1961,12 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
struct sk_buff *skb, *last;
u32 urg_hole = 0;
struct scm_timestamping_internal tss;
- bool has_tss = false;
- bool has_cmsg;
+ int cmsg_flags;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
- if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
+ if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
(sk->sk_state == TCP_ESTABLISHED))
sk_busy_loop(sk, nonblock);
@@ -1961,7 +1976,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (sk->sk_state == TCP_LISTEN)
goto out;
- has_cmsg = tp->recvmsg_inq;
+ cmsg_flags = tp->recvmsg_inq ? 1 : 0;
timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */
@@ -2034,7 +2049,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* Well, if we have backlog, try to process it now yet. */
- if (copied >= target && !sk->sk_backlog.tail)
+ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
break;
if (copied) {
@@ -2106,7 +2121,7 @@ found_ok_skb:
if (urg_offset < used) {
if (!urg_offset) {
if (!sock_flag(sk, SOCK_URGINLINE)) {
- ++*seq;
+ WRITE_ONCE(*seq, *seq + 1);
urg_hole++;
offset++;
used--;
@@ -2128,7 +2143,7 @@ found_ok_skb:
}
}
- *seq += used;
+ WRITE_ONCE(*seq, *seq + used);
copied += used;
len -= used;
@@ -2144,8 +2159,7 @@ skip_copy:
if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, &tss);
- has_tss = true;
- has_cmsg = true;
+ cmsg_flags |= 2;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
@@ -2155,7 +2169,7 @@ skip_copy:
found_fin_ok:
/* Process the FIN. */
- ++*seq;
+ WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
sk_eat_skb(sk, skb);
break;
@@ -2170,10 +2184,10 @@ found_fin_ok:
release_sock(sk);
- if (has_cmsg) {
- if (has_tss)
+ if (cmsg_flags) {
+ if (cmsg_flags & 2)
tcp_recv_timestamp(msg, sk, &tss);
- if (tp->recvmsg_inq) {
+ if (cmsg_flags & 1) {
inq = tcp_inq_hint(sk);
put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
}
@@ -2476,7 +2490,10 @@ adjudge_to_death:
}
if (sk->sk_state == TCP_CLOSE) {
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ struct request_sock *req;
+
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ lockdep_sock_is_held(sk));
/* We could get here with a non-NULL req if the socket is
* aborted (e.g., closed with unread data) before 3WHS
* finishes.
@@ -2508,6 +2525,7 @@ static void tcp_rtx_queue_purge(struct sock *sk)
{
struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+ tcp_sk(sk)->highest_sack = NULL;
while (p) {
struct sk_buff *skb = rb_to_skb(p);
@@ -2548,6 +2566,7 @@ int tcp_disconnect(struct sock *sk, int flags)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int old_state = sk->sk_state;
+ u32 seq;
if (old_state != TCP_CLOSE)
tcp_set_state(sk, TCP_CLOSE);
@@ -2574,7 +2593,7 @@ int tcp_disconnect(struct sock *sk, int flags)
__kfree_skb(sk->sk_rx_skb_cache);
sk->sk_rx_skb_cache = NULL;
}
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->urg_data = 0;
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
@@ -2590,21 +2609,25 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->srtt_us = 0;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
tp->rcv_rtt_last_tsecr = 0;
- tp->write_seq += tp->max_window + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
+
+ seq = tp->write_seq + tp->max_window + 2;
+ if (!seq)
+ seq = 1;
+ WRITE_ONCE(tp->write_seq, seq);
+
icsk->icsk_backoff = 0;
- tp->snd_cwnd = 2;
icsk->icsk_probes_out = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd = TCP_INIT_CWND;
tp->snd_cwnd_cnt = 0;
tp->window_clamp = 0;
+ tp->delivered = 0;
tp->delivered_ce = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
+ tp->total_retrans = 0;
inet_csk_delack_init(sk);
/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
* issue in __tcp_select_window()
@@ -2616,10 +2639,14 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
+ tp->segs_in = 0;
+ tp->segs_out = 0;
tp->bytes_sent = 0;
tp->bytes_acked = 0;
tp->bytes_received = 0;
tp->bytes_retrans = 0;
+ tp->data_segs_in = 0;
+ tp->data_segs_out = 0;
tp->duplicate_sack[0].start_seq = 0;
tp->duplicate_sack[0].end_seq = 0;
tp->dsack_dups = 0;
@@ -2640,11 +2667,13 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->rx_opt.saw_tstamp = 0;
tp->rx_opt.dsack = 0;
tp->rx_opt.num_sacks = 0;
+ tp->rcv_ooopack = 0;
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
inet->defer_connect = 0;
+ tp->fastopen_client_fail = 0;
WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
@@ -2918,9 +2947,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (sk->sk_state != TCP_CLOSE)
err = -EPERM;
else if (tp->repair_queue == TCP_SEND_QUEUE)
- tp->write_seq = val;
+ WRITE_ONCE(tp->write_seq, val);
else if (tp->repair_queue == TCP_RECV_QUEUE)
- tp->rcv_nxt = val;
+ WRITE_ONCE(tp->rcv_nxt, val);
else
err = -EINVAL;
break;
@@ -3203,8 +3232,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
* tcpi_unacked -> Number of children ready for accept()
* tcpi_sacked -> max backlog
*/
- info->tcpi_unacked = sk->sk_ack_backlog;
- info->tcpi_sacked = sk->sk_max_ack_backlog;
+ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
+ info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
return;
}
@@ -3282,6 +3311,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_bytes_retrans = tp->bytes_retrans;
info->tcpi_dsack_dups = tp->dsack_dups;
info->tcpi_reord_seen = tp->reord_seen;
+ info->tcpi_rcv_ooopack = tp->rcv_ooopack;
+ info->tcpi_snd_wnd = tp->snd_wnd;
+ info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3311,6 +3343,7 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
+ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
0;
}
@@ -3365,6 +3398,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
+ nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
return stats;
}
@@ -3784,8 +3818,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
return 1;
for (i = 0; i < shi->nr_frags; ++i) {
- const struct skb_frag_struct *f = &shi->frags[i];
- unsigned int offset = f->page_offset;
+ const skb_frag_t *f = &shi->frags[i];
+ unsigned int offset = skb_frag_off(f);
struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
sg_set_page(&sg, page, skb_frag_size(f),
@@ -3817,7 +3851,13 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
void tcp_done(struct sock *sk)
{
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ struct request_sock *req;
+
+ /* We might be called with a new socket, after
+ * inet_csk_prepare_forced_close() has been called
+ * so we can not use lockdep_sock_is_held(sk)
+ */
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
@@ -3916,7 +3956,7 @@ void __init tcp_init(void)
BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
- FIELD_SIZEOF(struct sk_buff, cb));
+ sizeof_field(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
@@ -3990,4 +4030,5 @@ void __init tcp_init(void)
tcp_metrics_init();
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
tcp_tasklet_init();
+ mptcp_init();
}
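[Annotation] Most of the tcp.c churn above is one idiom applied repeatedly: fields that are read without the socket lock (by tcp_poll(), tcp_diag, /proc, etc.) are written with WRITE_ONCE() on the locked writer side and read with READ_ONCE() on the lockless reader side, so the compiler (and KCSAN) treat the accesses as intentional. A minimal sketch of the pattern:

        struct example_counter {
                u32 seq;
        };

        static void example_writer(struct example_counter *c, u32 delta)
        {
                WRITE_ONCE(c->seq, c->seq + delta);     /* under the writer's lock */
        }

        static u32 example_reader(const struct example_counter *c)
        {
                return READ_ONCE(c->seq);               /* no lock held */
        }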
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 56be7d27f208..6c4d79baff26 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -306,7 +306,8 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
/* Sort of tcp_tso_autosize() but ignoring
* driver provided sk_gso_max_size.
*/
- bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+ bytes = min_t(unsigned long,
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
@@ -346,7 +347,7 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
*
- * bdp = bw * min_rtt * gain
+ * bdp = ceil(bw * min_rtt * gain)
*
* The key factor, gain, controls the amount of queue. While a small gain
* builds a smaller queue, it becomes more vulnerable to noise in RTT
@@ -370,7 +371,9 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
w = (u64)bw * bbr->min_rtt_us;
- /* Apply a gain to the given value, then remove the BW_SCALE shift. */
+ /* Apply a gain to the given value, remove the BW_SCALE shift, and
+ * round the value up to avoid a negative feedback loop.
+ */
bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
return bdp;
@@ -386,7 +389,7 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
* which allows 2 outstanding 2-packet sequences, to try to keep pipe
* full even with ACK-every-other-packet delayed ACKs.
*/
-static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
{
struct bbr *bbr = inet_csk_ca(sk);
@@ -397,7 +400,7 @@ static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
cwnd = (cwnd + 1) & ~1U;
/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
- if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
cwnd += 2;
return cwnd;
@@ -409,7 +412,7 @@ static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
u32 inflight;
inflight = bbr_bdp(sk, bw, gain);
- inflight = bbr_quantization_budget(sk, inflight, gain);
+ inflight = bbr_quantization_budget(sk, inflight);
return inflight;
}
@@ -529,7 +532,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
* due to aggregation (of data and/or ACKs) visible in the ACK stream.
*/
target_cwnd += bbr_ack_aggregation_cwnd(sk);
- target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain);
+ target_cwnd = bbr_quantization_budget(sk, target_cwnd);
/* If we're below target cwnd, slow start cwnd toward target cwnd. */
if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
@@ -776,8 +779,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
* bandwidth sample. Delivered is in packets and interval_us in uS and
* ratio will be <<1 for most connections. So delivered is first scaled.
*/
- bw = (u64)rs->delivered * BW_UNIT;
- do_div(bw, rs->interval_us);
+ bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
/* If this sample is application-limited, it is likely to have a very
* low delivered count that represents application behavior rather than
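[Annotation] The bbr_bdp() comment change documents that the BDP is now rounded up rather than truncated, so small rounding errors cannot feed back into an ever-shrinking cwnd. The "+ BW_UNIT - 1" in the existing code is the usual ceiling-division idiom, equivalent to:

        static u64 ceil_div_sketch(u64 num, u64 div)
        {
                return (num + div - 1) / div;   /* same as DIV_ROUND_UP(num, div) */
        }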
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 8a56e09cfb0e..8a01428f80c1 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -121,14 +121,14 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
struct sk_psock *psock;
int copied, ret;
- if (unlikely(flags & MSG_ERRQUEUE))
- return inet_recv_error(sk, msg, len, addr_len);
- if (!skb_queue_empty(&sk->sk_receive_queue))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-
psock = sk_psock_get(sk);
if (unlikely(!psock))
return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
+ sk_psock_queue_empty(psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
lock_sock(sk);
msg_bytes_ready:
copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
@@ -139,7 +139,7 @@ msg_bytes_ready:
timeo = sock_rcvtimeo(sk, nonblock);
data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
if (data) {
- if (skb_queue_empty(&sk->sk_receive_queue))
+ if (!sk_psock_queue_empty(psock))
goto msg_bytes_ready;
release_sock(sk);
sk_psock_put(sk, psock);
@@ -301,7 +301,7 @@ EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, int *copied, int flags)
{
- bool cork = false, enospc = msg->sg.start == msg->sg.end;
+ bool cork = false, enospc = sk_msg_full(msg);
struct sock *sk_redir;
u32 tosend, delta = 0;
int ret;
@@ -315,10 +315,7 @@ more_data:
*/
delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
- if (msg->sg.size < delta)
- delta -= msg->sg.size;
- else
- delta = 0;
+ delta -= msg->sg.size;
}
if (msg->cork_bytes &&
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index c445a81d144e..3172e31987be 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -21,7 +21,7 @@ static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
/* Simple linear search, don't expect many entries! */
-static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
struct tcp_congestion_ops *e;
@@ -162,7 +162,7 @@ void tcp_assign_congestion_control(struct sock *sk)
rcu_read_lock();
ca = rcu_dereference(net->ipv4.tcp_congestion_control);
- if (unlikely(!try_module_get(ca->owner)))
+ if (unlikely(!bpf_try_module_get(ca, ca->owner)))
ca = &tcp_reno;
icsk->icsk_ca_ops = ca;
rcu_read_unlock();
@@ -208,7 +208,7 @@ void tcp_cleanup_congestion_control(struct sock *sk)
if (icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
- module_put(icsk->icsk_ca_ops->owner);
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
}
/* Used by sysctl to change default congestion control */
@@ -222,12 +222,12 @@ int tcp_set_default_congestion_control(struct net *net, const char *name)
ca = tcp_ca_find_autoload(net, name);
if (!ca) {
ret = -ENOENT;
- } else if (!try_module_get(ca->owner)) {
+ } else if (!bpf_try_module_get(ca, ca->owner)) {
ret = -EBUSY;
} else {
prev = xchg(&net->ipv4.tcp_congestion_control, ca);
if (prev)
- module_put(prev->owner);
+ bpf_module_put(prev, prev->owner);
ca->flags |= TCP_CONG_NON_RESTRICTED;
ret = 0;
@@ -256,6 +256,9 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
@@ -285,6 +288,9 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
@@ -360,19 +366,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
} else if (!load) {
const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;
- if (try_module_get(ca->owner)) {
+ if (bpf_try_module_get(ca, ca->owner)) {
if (reinit) {
tcp_reinit_congestion_control(sk, ca);
} else {
icsk->icsk_ca_ops = ca;
- module_put(old_ca->owner);
+ bpf_module_put(old_ca, old_ca->owner);
}
} else {
err = -EBUSY;
}
} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) {
err = -EPERM;
- } else if (!try_module_get(ca->owner)) {
+ } else if (!bpf_try_module_get(ca, ca->owner)) {
err = -EBUSY;
} else {
tcp_reinit_congestion_control(sk, ca);
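[Annotation] try_module_get()/module_put() become bpf_try_module_get()/bpf_module_put() so that a congestion-control ops implemented as a BPF struct_ops (see bpf_tcp_ca.c in the diffstat) is refcounted like a module-backed one. A conceptual sketch of the dispatch only; the real helpers distinguish BPF-backed ops via a sentinel owner, and example_struct_ops_get/put() below are hypothetical stand-ins for the BPF side:

        static bool try_get_sketch(const void *ops, struct module *owner)
        {
                if (owner)                              /* ordinary kernel module */
                        return try_module_get(owner);
                return example_struct_ops_get(ops);     /* hypothetical: pin the BPF map */
        }

        static void put_sketch(const void *ops, struct module *owner)
        {
                if (owner)
                        module_put(owner);
                else
                        example_struct_ops_put(ops);    /* hypothetical */
        }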
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 1b3d032a4df2..8f8eefd3a3ce 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -40,8 +40,8 @@
/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
-#define HYSTART_DELAY_MIN (4U<<3)
-#define HYSTART_DELAY_MAX (16U<<3)
+#define HYSTART_DELAY_MIN (4000U) /* 4 ms */
+#define HYSTART_DELAY_MAX (16000U) /* 16 ms */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
static int fast_convergence __read_mostly = 1;
@@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1;
static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
-static int hystart_ack_delta __read_mostly = 2;
+static int hystart_ack_delta_us __read_mostly = 2000;
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
@@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
" 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
-module_param(hystart_ack_delta, int, 0644);
-MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+module_param(hystart_ack_delta_us, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
/* BIC TCP Parameters */
struct bictcp {
@@ -89,7 +89,7 @@ struct bictcp {
u32 bic_origin_point;/* origin point of bic function */
u32 bic_K; /* time to origin point
from the beginning of the current epoch */
- u32 delay_min; /* min delay (msec << 3) */
+ u32 delay_min; /* min delay (usec) */
u32 epoch_start; /* beginning of an epoch */
u32 ack_cnt; /* number of acks */
u32 tcp_cwnd; /* estimated tcp cwnd */
@@ -117,13 +117,9 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->found = 0;
}
-static inline u32 bictcp_clock(void)
+static inline u32 bictcp_clock_us(const struct sock *sk)
{
-#if HZ < 1000
- return ktime_to_ms(ktime_get_real());
-#else
- return jiffies_to_msecs(jiffies);
-#endif
+ return tcp_sk(sk)->tcp_mstamp;
}
static inline void bictcp_hystart_reset(struct sock *sk)
@@ -131,9 +127,9 @@ static inline void bictcp_hystart_reset(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- ca->round_start = ca->last_ack = bictcp_clock();
+ ca->round_start = ca->last_ack = bictcp_clock_us(sk);
ca->end_seq = tp->snd_nxt;
- ca->curr_rtt = 0;
+ ca->curr_rtt = ~0U;
ca->sample_cnt = 0;
}
@@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
*/
t = (s32)(tcp_jiffies32 - ca->epoch_start);
- t += msecs_to_jiffies(ca->delay_min >> 3);
+ t += usecs_to_jiffies(ca->delay_min);
/* change the unit from HZ to bictcp_HZ */
t <<= BICTCP_HZ;
do_div(t, HZ);
@@ -376,22 +372,54 @@ static void bictcp_state(struct sock *sk, u8 new_state)
}
}
+/* Account for TSO/GRO delays.
+ * Otherwise short RTT flows could get too small ssthresh, since during
+ * slow start we begin with small TSO packets and ca->delay_min would
+ * not account for long aggregation delay when TSO packets get bigger.
+ * Ideally even with a very small RTT we would like to have at least one
+ * TSO packet being sent and received by GRO, and another one in qdisc layer.
+ * We apply another 100% factor because @rate is doubled at this point.
+ * We cap the cushion to 1ms.
+ */
+static u32 hystart_ack_delay(struct sock *sk)
+{
+ unsigned long rate;
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ if (!rate)
+ return 0;
+ return min_t(u64, USEC_PER_MSEC,
+ div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
+}
+
static void hystart_update(struct sock *sk, u32 delay)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
-
- if (ca->found & hystart_detect)
- return;
+ u32 threshold;
if (hystart_detect & HYSTART_ACK_TRAIN) {
- u32 now = bictcp_clock();
+ u32 now = bictcp_clock_us(sk);
/* first detection parameter - ack-train detection */
- if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+ if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
ca->last_ack = now;
- if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
- ca->found |= HYSTART_ACK_TRAIN;
+
+ threshold = ca->delay_min + hystart_ack_delay(sk);
+
+ /* Hystart ack train triggers if we get ack past
+ * ca->delay_min/2.
+ * Pacing might have delayed packets up to RTT/2
+ * during slow start.
+ */
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ threshold >>= 1;
+
+ if ((s32)(now - ca->round_start) > threshold) {
+ ca->found = 1;
+ pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
+ now - ca->round_start, threshold,
+ ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINDETECT);
NET_ADD_STATS(sock_net(sk),
@@ -405,14 +433,14 @@ static void hystart_update(struct sock *sk, u32 delay)
if (hystart_detect & HYSTART_DELAY) {
/* obtain the minimum delay of more than sampling packets */
if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
- if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
+ if (ca->curr_rtt > delay)
ca->curr_rtt = delay;
ca->sample_cnt++;
} else {
if (ca->curr_rtt > ca->delay_min +
HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
- ca->found |= HYSTART_DELAY;
+ ca->found = 1;
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYDETECT);
NET_ADD_STATS(sock_net(sk),
@@ -424,9 +452,6 @@ static void hystart_update(struct sock *sk, u32 delay)
}
}
-/* Track delayed acknowledgment ratio using sliding window
- * ratio = (15*ratio + sample) / 16
- */
static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -441,7 +466,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
return;
- delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
+ delay = sample->rtt_us;
if (delay == 0)
delay = 1;
@@ -450,7 +475,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
ca->delay_min = delay;
/* hystart triggers when cwnd is larger than some threshold */
- if (hystart && tcp_in_slow_start(tp) &&
+ if (!ca->found && tcp_in_slow_start(tp) && hystart &&
tp->snd_cwnd >= hystart_low_window)
hystart_update(sk, delay);
}
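[Annotation] hystart_ack_delay() caps the TSO/GRO cushion at 1 ms and otherwise scales it with the pacing rate: min(1 ms, 4 * GSO_MAX_SIZE * USEC_PER_SEC / rate). Worked numbers (illustrative): at ~10 Gbit/s (1.25e9 B/s) the cushion is 262144 * 1e6 / 1.25e9 ~= 210 us; at ~1 Gbit/s it would be ~2097 us and is capped to 1000 us. A minimal sketch of the same arithmetic:

        static u64 hystart_cushion_us_sketch(u64 pacing_rate)  /* bytes per second */
        {
                u64 us = div64_u64(65536ULL * 4 * USEC_PER_SEC, pacing_rate);

                return min_t(u64, us, USEC_PER_MSEC);   /* cap at 1 ms */
        }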
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index a3a386236d93..0d08f9e2d8d0 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -21,13 +21,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
struct tcp_info *info = _info;
if (inet_sk_state_load(sk) == TCP_LISTEN) {
- r->idiag_rqueue = sk->sk_ack_backlog;
- r->idiag_wqueue = sk->sk_max_ack_backlog;
+ r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);
+ r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog);
} else if (sk->sk_type == SOCK_STREAM) {
const struct tcp_sock *tp = tcp_sk(sk);
- r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
- r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
+ r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una;
}
if (info)
tcp_get_info(sk, info);
@@ -81,13 +82,42 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb,
}
#endif
+static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
+ const struct tcp_ulp_ops *ulp_ops)
+{
+ struct nlattr *nest;
+ int err;
+
+ nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name);
+ if (err)
+ goto nla_failure;
+
+ if (ulp_ops->get_info)
+ err = ulp_ops->get_info(sk, skb);
+ if (err)
+ goto nla_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_failure:
+ nla_nest_cancel(skb, nest);
+ return err;
+}
+
static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int err = 0;
+
#ifdef CONFIG_TCP_MD5SIG
if (net_admin) {
struct tcp_md5sig_info *md5sig;
- int err = 0;
rcu_read_lock();
md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
@@ -99,11 +129,21 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
}
#endif
+ if (net_admin) {
+ const struct tcp_ulp_ops *ulp_ops;
+
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops)
+ err = tcp_diag_put_ulp(skb, sk, ulp_ops);
+ if (err)
+ return err;
+ }
return 0;
}
static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
size_t size = 0;
#ifdef CONFIG_TCP_MD5SIG
@@ -124,6 +164,17 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
}
#endif
+ if (net_admin && sk_fullsock(sk)) {
+ const struct tcp_ulp_ops *ulp_ops;
+
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops) {
+ size += nla_total_size(0) +
+ nla_total_size(TCP_ULP_NAME_MAX);
+ if (ulp_ops->get_info_size)
+ size += ulp_ops->get_info_size(sk);
+ }
+ }
return size;
}
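
The extra reservation is plain netlink attribute arithmetic; a rough stand-alone check, assuming NLA_HDRLEN of 4 and TCP_ULP_NAME_MAX of 16 as in current uapi headers, shows the fixed part of the ULP nest costs 24 bytes before any ulp_ops->get_info_size() contribution.

#include <stdio.h>

#define NLA_ALIGNTO             4
#define NLA_ALIGN(len)          (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))
#define NLA_HDRLEN              NLA_ALIGN(4)    /* sizeof(struct nlattr) */
#define TCP_ULP_NAME_MAX        16              /* assumed uapi value */

static int nla_total_size(int payload)
{
        return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
        /* empty nest header + name string attribute: 4 + 20 = 24 bytes */
        printf("%d\n", nla_total_size(0) + nla_total_size(TCP_ULP_NAME_MAX));
        return 0;
}
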
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 3fd451271a70..19ad9586c720 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -253,7 +253,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
*/
tp = tcp_sk(child);
- tp->fastopen_rsk = req;
+ rcu_assign_pointer(tp->fastopen_rsk, req);
tcp_rsk(req)->tfo_listener = true;
/* RFC1323: The window in SYN & SYN/ACK segments is never
@@ -422,7 +422,10 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
cookie->len = -1;
return true;
}
- return cookie->len > 0;
+ if (cookie->len > 0)
+ return true;
+ tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE;
+ return false;
}
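
As an aside, the failure reason recorded here is meant to be visible to applications through TCP_INFO; a minimal sketch, assuming uapi headers new enough to carry the tcpi_fastopen_client_fail field:

#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>

/* Returns the TFO_* failure reason recorded by the kernel (0 = none),
 * or -1 if TCP_INFO cannot be read.
 */
static int tfo_client_fail(int fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
                return -1;
        return info.tcpi_fastopen_client_fail;
}
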
/* This function checks if we want to defer sending SYN until the first
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c21e8a22fb3b..316ebdf8151d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,6 +79,7 @@
#include <trace/events/tcp.h>
#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
+#include <net/mptcp.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -266,7 +267,7 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}
static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
@@ -359,7 +360,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
- sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
+ WRITE_ONCE(sk->sk_sndbuf,
+ min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -483,8 +485,9 @@ static void tcp_clamp_window(struct sock *sk)
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- net->ipv4.sysctl_tcp_rmem[2]);
+ WRITE_ONCE(sk->sk_rcvbuf,
+ min(atomic_read(&sk->sk_rmem_alloc),
+ net->ipv4.sysctl_tcp_rmem[2]));
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -648,7 +651,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
rcvbuf = min_t(u64, rcvwin * rcvmem,
sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = rcvbuf;
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make the window clamp follow along. */
tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
@@ -913,9 +916,10 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
- if (!tp->retransmit_skb_hint ||
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
+ (tp->retransmit_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
tp->retransmit_skb_hint = skb;
}
@@ -1420,7 +1424,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
- if (!tcp_skb_can_collapse_to(prev))
+ if (!tcp_skb_can_collapse(prev, skb))
goto fallback;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
@@ -1725,8 +1729,11 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
/* Ignore very old stuff early */
- if (!after(sp[used_sacks].end_seq, prior_snd_una))
+ if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
+ if (i == 0)
+ first_sack_index = -1;
continue;
+ }
used_sacks++;
}
@@ -2666,7 +2673,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
- if ((flag & FLAG_SND_UNA_ADVANCED || tp->fastopen_rsk) &&
+ if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
tcp_try_undo_loss(sk, false))
return;
@@ -2990,7 +2997,7 @@ void tcp_rearm_rto(struct sock *sk)
/* If the retrans timer is currently being used by Fast Open
* for SYN-ACK retrans purpose, stay put.
*/
- if (tp->fastopen_rsk)
+ if (rcu_access_pointer(tp->fastopen_rsk))
return;
if (!tp->packets_out) {
@@ -3158,6 +3165,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->retransmit_skb_hint = NULL;
if (unlikely(skb == tp->lost_skb_hint))
tp->lost_skb_hint = NULL;
+ tcp_highest_sack_replace(sk, skb, next);
tcp_rtx_queue_unlink_and_free(skb, sk);
}
@@ -3362,7 +3370,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
sock_owned_by_me((struct sock *)tp);
tp->bytes_received += delta;
- tp->rcv_nxt = seq;
+ WRITE_ONCE(tp->rcv_nxt, seq);
}
/* Update our send window.
@@ -3548,7 +3556,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
return;
- if (unlikely(rexmit == 2)) {
+ if (unlikely(rexmit == REXMIT_NEW)) {
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
TCP_NAGLE_OFF);
if (after(tp->snd_nxt, tp->high_seq))
@@ -3782,6 +3790,49 @@ static void smc_parse_options(const struct tcphdr *th,
#endif
}
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, a clamped
+ * value on success.
+ */
+static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
+{
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+ u16 mss = 0;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return mss;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ if (length < 2)
+ return mss;
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return mss;
+ if (opsize > length)
+ return mss; /* fail on partial options */
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
+ u16 in_mss = get_unaligned_be16(ptr);
+
+ if (in_mss) {
+ if (user_mss && user_mss < in_mss)
+ in_mss = user_mss;
+ mss = in_mss;
+ }
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+ return mss;
+}
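
A stand-alone illustration of the option walk above, fed a hand-crafted MSS option (kind 2, length 4, value 1460); the option codepoints are the standard TCP ones and the clamp behaviour mirrors the helper:

#include <stdio.h>
#include <stdint.h>

#define TCPOPT_EOL      0
#define TCPOPT_NOP      1
#define TCPOPT_MSS      2
#define TCPOLEN_MSS     4

static uint16_t parse_mss(const uint8_t *ptr, int length, uint16_t user_mss)
{
        uint16_t mss = 0;

        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                switch (opcode) {
                case TCPOPT_EOL:
                        return mss;
                case TCPOPT_NOP:
                        length--;
                        continue;
                default:
                        if (length < 2)
                                return mss;
                        opsize = *ptr++;
                        if (opsize < 2 || opsize > length)
                                return mss;
                        if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
                                uint16_t in_mss = (ptr[0] << 8) | ptr[1];

                                if (in_mss)
                                        mss = (user_mss && user_mss < in_mss) ?
                                              user_mss : in_mss;
                        }
                        ptr += opsize - 2;
                        length -= opsize;
                }
        }
        return mss;
}

int main(void)
{
        uint8_t opts[] = { TCPOPT_MSS, TCPOLEN_MSS, 0x05, 0xb4, TCPOPT_NOP, TCPOPT_EOL };

        printf("%u\n", parse_mss(opts, sizeof(opts), 0));    /* 1460 */
        printf("%u\n", parse_mss(opts, sizeof(opts), 1200)); /* clamped to 1200 */
        return 0;
}
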
+
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
@@ -3875,6 +3926,10 @@ void tcp_parse_options(const struct net *net,
*/
break;
#endif
+ case TCPOPT_MPTCP:
+ mptcp_parse_option(skb, ptr, opsize, opt_rx);
+ break;
+
case TCPOPT_FASTOPEN:
tcp_parse_fastopen_option(
opsize - TCPOLEN_FASTOPEN_BASE,
@@ -4216,8 +4271,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
* The receiver remembers and reflects via DSACKs. Leverage the
* DSACK state and change the txhash to re-route speculatively.
*/
- if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq)
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) {
sk_rethink_txhash(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+ }
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -4375,6 +4432,9 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
+ if (!mptcp_skb_can_collapse(to, from))
+ return false;
+
#ifdef CONFIG_TLS_DEVICE
if (from->decrypted != to->decrypted)
return false;
@@ -4512,6 +4572,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4712,6 +4773,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
bool fragstolen;
int eaten;
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb, &tp->rx_opt);
+
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
return;
@@ -4883,7 +4947,7 @@ restart:
/* The first skb to collapse is:
* - not SYN/FIN and
* - bloated or contains data before "start" or
- * overlaps to the next one.
+ * overlaps to the next one and MPTCP allows collapsing.
*/
if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
(tcp_win_from_space(sk, skb->truesize) > skb->len ||
@@ -4892,7 +4956,7 @@ restart:
break;
}
- if (n && n != tail &&
+ if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
end_of_skbs = false;
break;
@@ -4925,6 +4989,7 @@ restart:
else
__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
skb_set_owner_r(nskb, sk);
+ mptcp_skb_ext_move(nskb, skb);
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
@@ -4944,6 +5009,7 @@ restart:
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
+ !mptcp_skb_can_collapse(nskb, skb) ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
#ifdef CONFIG_TLS_DEVICE
@@ -5312,7 +5378,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
}
tp->urg_data = TCP_URG_NOTYET;
- tp->urg_seq = ptr;
+ WRITE_ONCE(tp->urg_seq, ptr);
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -5768,6 +5834,10 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
+ if (tp->total_retrans)
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
+ else
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
skb_rbtree_walk_from(data) {
if (__tcp_retransmit_skb(sk, data, 1))
break;
@@ -5838,8 +5908,14 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* the segment and return)"
*/
if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
- after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
+ if (icsk->icsk_retransmits == 0)
+ inet_csk_reset_xmit_timer(sk,
+ ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_MIN, TCP_RTO_MAX);
goto reset_and_undo;
+ }
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
@@ -5888,7 +5964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -5914,10 +5990,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
+ if (sk_is_mptcp(sk))
+ mptcp_rcv_synsent(sk);
+
/* Remember, tcp_poll() does not lock socket!
* Change state from SYN-SENT only after copied_seq
* is initialized. */
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
smc_check_reset_syn(tp);
@@ -5991,8 +6070,8 @@ discard:
tp->tcp_header_len = sizeof(struct tcphdr);
}
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -6043,6 +6122,8 @@ reset_and_undo:
static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
+ struct request_sock *req;
+
tcp_try_undo_loss(sk, false);
/* Reset rtx states to prevent spurious retransmits_timed_out() */
@@ -6052,7 +6133,9 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
* we no longer need req so release it.
*/
- reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false);
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ reqsk_fastopen_remove(sk, req, false);
/* Re-arm the timer because data may have been sent out.
* This is similar to the regular data transmission case
@@ -6127,7 +6210,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_mstamp_refresh(tp);
tp->rx_opt.saw_tstamp = 0;
- req = tp->fastopen_rsk;
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
if (req) {
bool req_stolen;
@@ -6167,7 +6251,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_try_undo_spurious_syn(sk);
tp->retrans_stamp = 0;
tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
@@ -6274,8 +6358,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb, &tp->rx_opt);
break;
+ }
/* fall through */
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
@@ -6422,9 +6509,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(const struct sock *sk,
- const struct sk_buff *skb,
- const char *proto)
+static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
@@ -6444,7 +6529,7 @@ static bool tcp_syn_flood_action(const struct sock *sk,
net->ipv4.sysctl_tcp_syncookies != 2 &&
xchg(&queue->synflood_warned, 1) == 0)
net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
- proto, ntohs(tcp_hdr(skb)->dest), msg);
+ proto, sk->sk_num, msg);
return want_cookie;
}
@@ -6466,6 +6551,36 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
}
}
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
+ * used for SYN cookie generation.
+ */
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
+ !inet_csk_reqsk_queue_is_full(sk))
+ return 0;
+
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
+ return 0;
+
+ if (sk_acceptq_is_full(sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ return 0;
+ }
+
+ mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
+ if (!mss)
+ mss = af_ops->mss_clamp;
+
+ return mss;
+}
+EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
@@ -6487,7 +6602,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
*/
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+ want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
if (!want_cookie)
goto drop;
}
@@ -6503,6 +6618,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;
+#if IS_ENABLED(CONFIG_MPTCP)
+ tcp_rsk(req)->is_mptcp = 0;
+#endif
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6525,6 +6643,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
af_ops->init_req(req, sk, skb);
+ if (IS_ENABLED(CONFIG_MPTCP) && want_cookie)
+ tcp_rsk(req)->is_mptcp = 0;
+
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d57641cb3477..df1166b76126 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -121,11 +121,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
- (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
- (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
- (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
- (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
loopback = true;
} else
#endif
@@ -164,9 +162,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
* without appearing to create any others.
*/
if (likely(!tp->repair)) {
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
+ u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
+
+ if (!seq)
+ seq = 1;
+ WRITE_ONCE(tp->write_seq, seq);
tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
}
@@ -253,7 +253,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
if (likely(!tp->repair))
- tp->write_seq = 0;
+ WRITE_ONCE(tp->write_seq, 0);
}
inet->inet_dport = usin->sin_port;
@@ -291,16 +291,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (likely(!tp->repair)) {
if (!tp->write_seq)
- tp->write_seq = secure_tcp_seq(inet->inet_saddr,
- inet->inet_daddr,
- inet->inet_sport,
- usin->sin_port);
+ WRITE_ONCE(tp->write_seq,
+ secure_tcp_seq(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port));
tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
inet->inet_saddr,
inet->inet_daddr);
}
- inet->inet_id = tp->write_seq ^ jiffies;
+ inet->inet_id = prandom_u32();
if (tcp_fastopen_defer_connect(sk, &err))
return err;
@@ -478,7 +479,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk = inet_csk(sk);
tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
- fastopen = tp->fastopen_rsk;
+ fastopen = rcu_dereference(tp->fastopen_rsk);
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
@@ -700,9 +701,21 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr, AF_INET);
+ const union tcp_md5_addr *addr;
+ int l3index;
+
+ /* If sdif is set, the packet ingressed via a device
+ * in an L3 domain and inet_iif is set to it.
+ */
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
} else if (hash_location) {
+ const union tcp_md5_addr *addr;
+ int sdif = tcp_v4_sdif(skb);
+ int dif = inet_iif(skb);
+ int l3index;
+
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
@@ -713,14 +726,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
- ntohs(th->source), inet_iif(skb),
- tcp_v4_sdif(skb));
+ ntohs(th->source), dif, sdif);
/* don't send rst if it can't find key */
if (!sk1)
goto out;
- key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr, AF_INET);
+ /* If sdif is set, the packet ingressed via a device
+ * in an L3 domain and dif is set to it.
+ */
+ l3index = sdif ? dif : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
if (!key)
goto out;
@@ -771,6 +787,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
transmit_time = tcp_transmit_time(sk);
}
ip_send_unicast_reply(ctl_sk,
@@ -866,6 +884,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
transmit_time = tcp_transmit_time(sk);
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -900,6 +920,9 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
+ const union tcp_md5_addr *addr;
+ int l3index;
+
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
@@ -911,14 +934,15 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent,
0,
- tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
- AF_INET),
+ tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
ip_hdr(skb)->tos);
}
@@ -978,7 +1002,7 @@ DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
const union tcp_md5_addr *addr,
int family)
{
@@ -998,7 +1022,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
hlist_for_each_entry_rcu(key, &md5sig->head, node) {
if (key->family != family)
continue;
-
+ if (key->l3index && key->l3index != l3index)
+ continue;
if (family == AF_INET) {
mask = inet_make_mask(key->prefixlen);
match = (key->addr.a4.s_addr & mask) ==
@@ -1022,7 +1047,8 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
- int family, u8 prefixlen)
+ int family, u8 prefixlen,
+ int l3index)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -1041,6 +1067,8 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
hlist_for_each_entry_rcu(key, &md5sig->head, node) {
if (key->family != family)
continue;
+ if (key->l3index && key->l3index != l3index)
+ continue;
if (!memcmp(&key->addr, addr, size) &&
key->prefixlen == prefixlen)
return key;
@@ -1052,23 +1080,26 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
const union tcp_md5_addr *addr;
+ int l3index;
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
- return tcp_md5_do_lookup(sk, addr, AF_INET);
+ return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
- int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
- gfp_t gfp)
+ int family, u8 prefixlen, int l3index,
+ const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
/* Add Key to the list */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *md5sig;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
if (key) {
/* Pre-existing entry - just update that one. */
memcpy(key->key, newkey, newkeylen);
@@ -1100,6 +1131,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
key->keylen = newkeylen;
key->family = family;
key->prefixlen = prefixlen;
+ key->l3index = l3index;
memcpy(&key->addr, addr,
(family == AF_INET6) ? sizeof(struct in6_addr) :
sizeof(struct in_addr));
@@ -1109,11 +1141,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
- u8 prefixlen)
+ u8 prefixlen, int l3index)
{
struct tcp_md5sig_key *key;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
if (!key)
return -ENOENT;
hlist_del_rcu(&key->node);
@@ -1144,7 +1176,9 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+ const union tcp_md5_addr *addr;
u8 prefixlen = 32;
+ int l3index = 0;
if (optlen < sizeof(cmd))
return -EINVAL;
@@ -1162,16 +1196,34 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
return -EINVAL;
}
+ if (optname == TCP_MD5SIG_EXT &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
+
+ rcu_read_unlock();
+
+ /* It is ok to check whether dev and l3index were set outside of RCU;
+ * right now the device MUST be an L3 master.
+ */
+ if (!dev || !l3index)
+ return -EINVAL;
+ }
+
+ addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
+
if (!cmd.tcpm_keylen)
- return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, prefixlen);
+ return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
- return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
- GFP_KERNEL);
+ return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
+ cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
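
From user space, the new flag is exercised roughly as follows; a hedged sketch that assumes headers already exposing TCP_MD5SIG_FLAG_IFINDEX and the tcpm_ifindex field, with the VRF name purely illustrative:

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>
#include <net/if.h>

/* Bind an MD5 key for 'peer' to the L3 master device "vrf-blue". */
static int add_md5_key_on_vrf(int fd, const struct sockaddr_in *peer,
                              const char *key, int keylen)
{
        struct tcp_md5sig md5 = { 0 };

        memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
        md5.tcpm_flags = TCP_MD5SIG_FLAG_IFINDEX;
        md5.tcpm_ifindex = if_nametoindex("vrf-blue");  /* must be an l3mdev */
        md5.tcpm_keylen = keylen;
        memcpy(md5.tcpm_key, key, keylen);

        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG_EXT, &md5, sizeof(md5));
}
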
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
@@ -1281,7 +1333,8 @@ EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
- const struct sk_buff *skb)
+ const struct sk_buff *skb,
+ int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
/*
@@ -1296,11 +1349,17 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
struct tcp_md5sig_key *hash_expected;
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
- int genhash;
+ const union tcp_md5_addr *addr;
unsigned char newhash[16];
+ int genhash, l3index;
+
+ /* If sdif is set, the packet ingressed via a device
+ * in an L3 domain and dif is set to the l3mdev
+ */
+ l3index = sdif ? dif : 0;
- hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
- AF_INET);
+ addr = (union tcp_md5_addr *)&iph->saddr;
+ hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
hash_location = tcp_parse_md5sig_option(th);
/* We've parsed the options - do we have a hash? */
@@ -1326,11 +1385,11 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
- net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
&iph->saddr, ntohs(th->source),
&iph->daddr, ntohs(th->dest),
genhash ? " tcp_v4_calc_md5_hash failed"
- : "");
+ : "", l3index);
return true;
}
return false;
@@ -1367,7 +1426,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.syn_ack_timeout = tcp_syn_ack_timeout,
};
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
.req_md5_lookup = tcp_v4_md5_lookup,
@@ -1414,7 +1473,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
+ const union tcp_md5_addr *addr;
struct tcp_md5sig_key *key;
+ int l3index;
#endif
struct ip_options_rcu *inet_opt;
@@ -1443,7 +1504,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
- newinet->inet_id = newtp->write_seq ^ jiffies;
+ newinet->inet_id = prandom_u32();
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
@@ -1462,9 +1523,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
/* Copy over the MD5 key from the original socket */
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET);
+ addr = (union tcp_md5_addr *)&newinet->inet_daddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
if (key) {
/*
* We're using one, so create a matching key
@@ -1472,8 +1534,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
* memory, then we end up not copying the key
* across. Shucks.
*/
- tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
+ tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
+ key->key, key->keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
@@ -1515,6 +1577,21 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
return sk;
}
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
+ struct tcphdr *th, u32 *cookie)
+{
+ u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+ mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, th);
+ if (mss) {
+ *cookie = __cookie_v4_init_sequence(iph, th, &mss);
+ tcp_synq_overflow(sk);
+ }
+#endif
+ return mss;
+}
+
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
@@ -1625,7 +1702,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
- u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
+ u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
struct skb_shared_info *shinfo;
const struct tcphdr *th;
struct tcphdr *thtail;
@@ -1788,6 +1865,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
struct sk_buff *skb_to_free;
int sdif = inet_sdif(skb);
+ int dif = inet_iif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
bool refcounted;
@@ -1836,7 +1914,7 @@ process:
struct sock *nsk;
sk = req->rsk_listener;
- if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+ if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
@@ -1894,10 +1972,10 @@ process:
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- if (tcp_v4_inbound_md5_hash(sk, skb))
+ if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
goto discard_and_relse;
- nf_reset(skb);
+ nf_reset_ct(skb);
if (tcp_filter(sk, skb))
goto discard_and_relse;
@@ -2102,7 +2180,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
- BUG_ON(tp->fastopen_rsk);
+ BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
/* If socket is aborted during connect operation */
tcp_free_fastopen_req(tp);
@@ -2127,13 +2205,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
struct sock *sk = cur;
if (!sk) {
get_head:
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock(&ilb->lock);
- sk = sk_head(&ilb->head);
+ sk = sk_nulls_head(&ilb->nulls_head);
st->offset = 0;
goto get_sk;
}
@@ -2141,9 +2220,9 @@ get_head:
++st->num;
++st->offset;
- sk = sk_next(sk);
+ sk = sk_nulls_next(sk);
get_sk:
- sk_for_each_from(sk) {
+ sk_nulls_for_each_from(sk, node) {
if (!net_eq(sock_net(sk), net))
continue;
if (sk->sk_family == afinfo->family)
@@ -2431,17 +2510,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
- rx_queue = sk->sk_ack_backlog;
+ rx_queue = READ_ONCE(sk->sk_ack_backlog);
else
/* Because we don't lock the socket,
* we might find a transient negative value.
*/
- rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
i, src, srcp, dest, destp, state,
- tp->write_seq - tp->snd_una,
+ READ_ONCE(tp->write_seq) - tp->snd_una,
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
@@ -2598,7 +2678,8 @@ static void __net_exit tcp_sk_exit(struct net *net)
int cpu;
if (net->ipv4.tcp_congestion_control)
- module_put(net->ipv4.tcp_congestion_control->owner);
+ bpf_module_put(net->ipv4.tcp_congestion_control,
+ net->ipv4.tcp_congestion_control->owner);
for_each_possible_cpu(cpu)
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
@@ -2637,6 +2718,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
@@ -2652,12 +2734,13 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 2;
+ net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
cnt = tcp_hashinfo.ehash_mask + 1;
net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
- net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
+ net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
net->ipv4.sysctl_tcp_sack = 1;
net->ipv4.sysctl_tcp_window_scaling = 1;
net->ipv4.sysctl_tcp_timestamps = 1;
@@ -2703,7 +2786,8 @@ static int __net_init tcp_sk_init(struct net *net)
/* Reno is always built in */
if (!net_eq(net, &init_net) &&
- try_module_get(init_net.ipv4.tcp_congestion_control->owner))
+ bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
+ init_net.ipv4.tcp_congestion_control->owner))
net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
else
net->ipv4.tcp_congestion_control = &tcp_reno;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c4848e7a0aad..279db8822439 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -385,7 +385,8 @@ void tcp_update_metrics(struct sock *sk)
if (tcp_in_initial_slowstart(tp)) {
/* Slow start still did not finish. */
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val && (tp->snd_cwnd >> 1) > val)
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
@@ -400,7 +401,8 @@ void tcp_update_metrics(struct sock *sk)
} else if (!tcp_in_slow_start(tp) &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+ if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
@@ -416,7 +418,8 @@ void tcp_update_metrics(struct sock *sk)
tcp_metric_set(tm, TCP_METRIC_CWND,
(val + tp->snd_ssthresh) >> 1);
}
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val && tp->snd_ssthresh > val)
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
@@ -441,6 +444,7 @@ void tcp_init_metrics(struct sock *sk)
{
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct tcp_metrics_block *tm;
u32 val, crtt = 0; /* cached RTT scaled by 8 */
@@ -458,7 +462,8 @@ void tcp_init_metrics(struct sock *sk)
if (tcp_metric_locked(tm, TCP_METRIC_CWND))
tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
- val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ?
+ 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val) {
tp->snd_ssthresh = val;
if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 8bcaf2586b68..ad3b56d9fa71 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -266,6 +266,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_transparent = inet->transparent;
tw->tw_mark = sk->sk_mark;
+ tw->tw_priority = sk->sk_priority;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -413,7 +414,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
- if (likely(ca && try_module_get(ca->owner))) {
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
ca_got_dst = true;
@@ -424,7 +425,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
/* If no valid choice made yet, assign current system default ca. */
if (!ca_got_dst &&
(!icsk->icsk_ca_setsockopt ||
- !try_module_get(icsk->icsk_ca_ops->owner)))
+ !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
tcp_assign_congestion_control(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
@@ -461,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk;
struct tcp_sock *oldtp, *newtp;
+ u32 seq;
if (!newsk)
return NULL;
@@ -474,12 +476,16 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
/* Now setup tcp_sock */
newtp->pred_flags = 0;
- newtp->rcv_wup = newtp->copied_seq =
- newtp->rcv_nxt = treq->rcv_isn + 1;
+ seq = treq->rcv_isn + 1;
+ newtp->rcv_wup = seq;
+ WRITE_ONCE(newtp->copied_seq, seq);
+ WRITE_ONCE(newtp->rcv_nxt, seq);
newtp->segs_in = 1;
- newtp->snd_sml = newtp->snd_una =
- newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
+ seq = treq->snt_isn + 1;
+ newtp->snd_sml = newtp->snd_una = seq;
+ WRITE_ONCE(newtp->snd_nxt, seq);
+ newtp->snd_up = seq;
INIT_LIST_HEAD(&newtp->tsq_node);
INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
@@ -494,7 +500,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->total_retrans = req->num_retrans;
tcp_init_xmit_timers(newsk);
- newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
+ WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);
if (sock_flag(newsk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(newsk,
@@ -540,7 +546,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.mss_clamp = req->mss;
tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_req = NULL;
- newtp->fastopen_rsk = NULL;
+ RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 979520e46e33..306e25d743e8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -38,6 +38,7 @@
#define pr_fmt(fmt) "TCP: " fmt
#include <net/tcp.h>
+#include <net/mptcp.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
@@ -67,11 +68,14 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
__skb_unlink(skb, &sk->sk_write_queue);
tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+ if (tp->highest_sack == NULL)
+ tp->highest_sack = skb;
+
tp->packets_out += tcp_skb_pcount(skb);
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -411,6 +415,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_WSCALE (1 << 3)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
#define OPTION_SMC (1 << 9)
+#define OPTION_MPTCP (1 << 10)
static void smc_options_write(__be32 *ptr, u16 *options)
{
@@ -436,8 +441,17 @@ struct tcp_out_options {
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+ struct mptcp_out_options mptcp;
};
+static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
+{
+#if IS_ENABLED(CONFIG_MPTCP)
+ if (unlikely(OPTION_MPTCP & opts->options))
+ mptcp_write_options(ptr, &opts->mptcp);
+#endif
+}
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -546,6 +560,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
}
smc_options_write(ptr, &options);
+
+ mptcp_options_write(ptr, opts);
}
static void smc_set_option(const struct tcp_sock *tp,
@@ -581,6 +597,22 @@ static void smc_set_option_cond(const struct tcp_sock *tp,
#endif
}
+static void mptcp_set_option_cond(const struct request_sock *req,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ if (rsk_is_mptcp(req)) {
+ unsigned int size;
+
+ if (mptcp_synack_options(req, &size, &opts->mptcp)) {
+ if (*remaining >= size) {
+ opts->options |= OPTION_MPTCP;
+ *remaining -= size;
+ }
+ }
+ }
+}
+
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
@@ -650,6 +682,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
smc_set_option(tp, opts, &remaining);
+ if (sk_is_mptcp(sk)) {
+ unsigned int size;
+
+ if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
+ opts->options |= OPTION_MPTCP;
+ remaining -= size;
+ }
+ }
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -711,6 +752,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
}
}
+ mptcp_set_option_cond(req, opts, &remaining);
+
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
return MAX_TCP_OPTION_SPACE - remaining;
@@ -748,13 +791,35 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
size += TCPOLEN_TSTAMP_ALIGNED;
}
+ /* MPTCP options have precedence over SACK for the limited TCP
+ * option space because an MPTCP connection would be forced to
+ * fall back to regular TCP if a required multipath option is
+ * missing. SACK still gets a chance to use whatever space is
+ * left.
+ */
+ if (sk_is_mptcp(sk)) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ unsigned int opt_size = 0;
+
+ if (mptcp_established_options(sk, skb, &opt_size, remaining,
+ &opts->mptcp)) {
+ opts->options |= OPTION_MPTCP;
+ size += opt_size;
+ }
+ }
+
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
+ TCPOLEN_SACK_PERBLOCK))
+ return size;
+
opts->num_sack_blocks =
min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
+
size += TCPOLEN_SACK_BASE_ALIGNED +
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
}
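
The budget behind that ordering is small enough to do by hand; in the rough sketch below the 12-byte MPTCP option size is purely illustrative, while the TCPOLEN_* values are the usual kernel constants:

#include <stdio.h>

#define MAX_TCP_OPTION_SPACE        40
#define TCPOLEN_TSTAMP_ALIGNED      12
#define TCPOLEN_SACK_BASE_ALIGNED    4
#define TCPOLEN_SACK_PERBLOCK        8

int main(void)
{
        unsigned int size = TCPOLEN_TSTAMP_ALIGNED;  /* timestamps enabled */
        unsigned int mptcp_opt = 12;                 /* assumed DSS option size */
        unsigned int remaining, sack_blocks;

        size += mptcp_opt;                           /* MPTCP reserved first */
        remaining = MAX_TCP_OPTION_SPACE - size;
        sack_blocks = remaining < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK ?
                      0 : (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
        printf("SACK blocks left after MPTCP: %u\n", sack_blocks);  /* 1 */
        return 0;
}
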
@@ -1050,11 +1115,22 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
- else
+ } else {
tcp_options_size = tcp_established_options(sk, skb, &opts,
&md5);
+ /* Force a PSH flag on all (GSO) packets to expedite GRO flush
+ * at the receiver: this slightly improves GRO performance.
+ * Note that we do not force the PSH flag for non-GSO packets,
+ * because they might be sent under high congestion events,
+ * and in this case it is better to delay the delivery of 1-MSS
+ * packets and thus the corresponding ACK packet that would
+ * release the following packet.
+ */
+ if (tcp_skb_pcount(skb) > 1)
+ tcb->tcp_flags |= TCPHDR_PSH;
+ }
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
/* if no packet is in qdisc/device queue, then allow XPS to select
@@ -1185,10 +1261,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
/* Advance write_seq and place onto the write_queue. */
- tp->write_seq = TCP_SKB_CB(skb)->end_seq;
+ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
}
@@ -1322,7 +1398,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
return -ENOMEM; /* We'll just try again later. */
skb_copy_decrypted(buff, skb);
- sk->sk_wmem_queued += buff->truesize;
+ sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
nlen = skb->len - len - nsize;
buff->truesize += nlen;
@@ -1403,7 +1479,7 @@ static int __pskb_trim_head(struct sk_buff *skb, int len)
} else {
shinfo->frags[k] = shinfo->frags[i];
if (eat) {
- shinfo->frags[k].page_offset += eat;
+ skb_frag_off_add(&shinfo->frags[k], eat);
skb_frag_size_sub(&shinfo->frags[k], eat);
eat = 0;
}
@@ -1432,7 +1508,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
if (delta_truesize) {
skb->truesize -= delta_truesize;
- sk->sk_wmem_queued -= delta_truesize;
+ sk_wmem_queued_add(sk, -delta_truesize);
sk_mem_uncharge(sk, delta_truesize);
sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
}
@@ -1713,7 +1789,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
u32 bytes, segs;
bytes = min_t(unsigned long,
- sk->sk_pacing_rate >> sk->sk_pacing_shift,
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
/* Goal is to send at least one packet per ms,
@@ -1877,7 +1953,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
return -ENOMEM;
skb_copy_decrypted(buff, skb);
- sk->sk_wmem_queued += buff->truesize;
+ sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -2053,7 +2129,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
if (len <= skb->len)
break;
- if (unlikely(TCP_SKB_CB(skb)->eor))
+ if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
return false;
len -= skb->len;
@@ -2141,7 +2217,7 @@ static int tcp_mtu_probe(struct sock *sk)
nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
if (!nskb)
return -1;
- sk->sk_wmem_queued += nskb->truesize;
+ sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = tcp_send_head(sk);
@@ -2170,6 +2246,7 @@ static int tcp_mtu_probe(struct sock *sk)
* we need to propagate it to the new skb.
*/
TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
+ tcp_skb_collapse_tstamp(nskb, skb);
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
} else {
@@ -2247,7 +2324,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
- sk->sk_pacing_rate >> sk->sk_pacing_shift);
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
if (sk->sk_pacing_status == SK_PACING_NONE)
limit = min_t(unsigned long, limit,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
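
Both pacing-derived limits above boil down to the same shift; a quick numeric sketch, assuming the default sk_pacing_shift of 10 and an illustrative ~10 Gbit/s pacing rate:

#include <stdio.h>

int main(void)
{
        unsigned long pacing_rate = 1250000000UL;  /* ~10 Gbit/s in bytes/sec */
        int pacing_shift = 10;                     /* assumed per-socket default */
        unsigned int mss = 1448;
        unsigned long budget = pacing_rate >> pacing_shift;

        /* roughly 1/1024 s (about 1 ms) worth of bytes per burst budget */
        printf("budget %lu bytes, ~%lu segments\n", budget, budget / mss);
        return 0;
}
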
@@ -2425,6 +2502,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (tcp_small_queue_check(sk, skb, 0))
break;
+ /* Argh, we hit an empty skb, presumably a thread
+ * is sleeping in sendmsg()/sk_stream_wait_memory().
+ * We do not want to send a pure-ack packet and have
+ * a strange looking rtx queue with empty packet(s).
+ */
+ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
+ break;
+
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
@@ -2470,7 +2555,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
- if (tp->fastopen_rsk)
+ if (rcu_access_pointer(tp->fastopen_rsk))
return false;
early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
@@ -2841,7 +2926,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (!tcp_can_collapse(sk, skb))
break;
- if (!tcp_skb_can_collapse_to(to))
+ if (!tcp_skb_can_collapse(to, skb))
break;
space -= skb->len;
@@ -3108,7 +3193,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
*/
void tcp_send_fin(struct sock *sk)
{
- struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
+ struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* Optimization, tack on the FIN if we have one skb in write queue and
@@ -3116,6 +3201,7 @@ void tcp_send_fin(struct sock *sk)
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
+ tskb = tail;
if (!tskb && tcp_under_memory_pressure(sk))
tskb = skb_rb_last(&sk->tcp_rtx_queue);
@@ -3123,14 +3209,14 @@ void tcp_send_fin(struct sock *sk)
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
- if (tcp_write_queue_empty(sk)) {
+ if (!tail) {
/* This means tskb was already sent.
* Pretend we included the FIN on previous transmit.
* We need to set tp->snd_nxt to the value it would have
* if FIN had been sent. This is because retransmit path
* does not change tp->snd_nxt.
*/
- tp->snd_nxt++;
+ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
return;
}
} else {
@@ -3207,10 +3293,11 @@ int tcp_send_synack(struct sock *sk)
if (!nskb)
return -ENOMEM;
INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
+ tcp_highest_sack_replace(sk, skb, nskb);
tcp_rtx_queue_unlink_and_free(skb, sk);
__skb_header_release(nskb);
tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
- sk->sk_wmem_queued += nskb->truesize;
+ sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = nskb;
}
@@ -3278,7 +3365,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
- skb->skb_mstamp_ns = cookie_init_timestamp(req);
+ skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
else
#endif
{
@@ -3343,8 +3430,8 @@ static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
- if (likely(ca && try_module_get(ca->owner))) {
- module_put(icsk->icsk_ca_ops->owner);
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
}
@@ -3414,14 +3501,14 @@ static void tcp_connect_init(struct sock *sk)
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
- tp->snd_nxt = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
if (likely(!tp->repair))
tp->rcv_nxt = 0;
else
tp->rcv_tstamp = tcp_jiffies32;
tp->rcv_wup = tp->rcv_nxt;
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
inet_csk(sk)->icsk_retransmits = 0;
@@ -3435,9 +3522,9 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
tcb->end_seq += skb->len;
__skb_header_release(skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
- tp->write_seq = tcb->end_seq;
+ WRITE_ONCE(tp->write_seq, tcb->end_seq);
tp->packets_out += tcp_skb_pcount(skb);
}
@@ -3574,11 +3661,11 @@ int tcp_connect(struct sock *sk)
/* We change tp->snd_nxt after the tcp_transmit_skb() call
* in order to make this packet get counted in tcpOutSegs.
*/
- tp->snd_nxt = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
tp->pushed_seq = tp->write_seq;
buff = tcp_send_head(sk);
if (unlikely(buff)) {
- tp->snd_nxt = TCP_SKB_CB(buff)->seq;
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
tp->pushed_seq = TCP_SKB_CB(buff)->seq;
}
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c801cd37cc2a..c3f26dcd6704 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
} else {
mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
- mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
+ mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
}
@@ -198,8 +198,13 @@ static bool retransmits_timed_out(struct sock *sk,
return false;
start_ts = tcp_sk(sk)->retrans_stamp;
- if (likely(timeout == 0))
- timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN);
+ if (likely(timeout == 0)) {
+ unsigned int rto_base = TCP_RTO_MIN;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ rto_base = tcp_timeout_init(sk);
+ timeout = tcp_model_timeout(sk, boundary, rto_base);
+ }
return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
}
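
The practical effect of switching the SYN-state base from TCP_RTO_MIN to tcp_timeout_init() can be eyeballed with a rough doubling-series model; RTO_MIN of 200 ms, RTO_MAX of 120 s and an initial SYN RTO of 1 s are the usual defaults, assumed here rather than shown by the hunk:

#include <stdio.h>

static unsigned long total_timeout_ms(unsigned long rto_ms, int boundary)
{
        unsigned long total = 0;

        for (int i = 0; i <= boundary; i++) {
                total += rto_ms;                       /* wait, then double */
                rto_ms = rto_ms * 2 < 120000UL ? rto_ms * 2 : 120000UL;
        }
        return total;
}

int main(void)
{
        /* data path, tcp_retries2 = 15: ~924600 ms (~15.4 min) */
        printf("%lu\n", total_timeout_ms(200, 15));
        /* SYN path, tcp_syn_retries = 6, base from tcp_timeout_init(): ~127000 ms */
        printf("%lu\n", total_timeout_ms(1000, 6));
        return 0;
}
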
@@ -210,7 +215,7 @@ static int tcp_write_timeout(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- bool expired, do_reset;
+ bool expired = false, do_reset;
int retry_until;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
@@ -218,6 +223,9 @@ static int tcp_write_timeout(struct sock *sk)
dst_negative_advice(sk);
} else {
sk_rethink_txhash(sk);
+ tp->timeout_rehash++;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPTIMEOUTREHASH);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
expired = icsk->icsk_retransmits >= retry_until;
@@ -229,6 +237,9 @@ static int tcp_write_timeout(struct sock *sk)
dst_negative_advice(sk);
} else {
sk_rethink_txhash(sk);
+ tp->timeout_rehash++;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPTIMEOUTREHASH);
}
retry_until = net->ipv4.sysctl_tcp_retries2;
@@ -242,9 +253,10 @@ static int tcp_write_timeout(struct sock *sk)
if (tcp_out_of_resources(sk, do_reset))
return 1;
}
+ }
+ if (!expired)
expired = retransmits_timed_out(sk, retry_until,
icsk->icsk_user_timeout);
- }
tcp_fastopen_active_detect_blackhole(sk, expired);
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
@@ -380,15 +392,13 @@ abort: tcp_write_err(sk);
* Timer for Fast Open socket to retransmit SYNACK. Note that the
* sk here is the child socket, not the parent (listener) socket.
*/
-static void tcp_fastopen_synack_timer(struct sock *sk)
+static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int max_retries = icsk->icsk_syn_retries ? :
sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
struct tcp_sock *tp = tcp_sk(sk);
- struct request_sock *req;
- req = tcp_sk(sk)->fastopen_rsk;
req->rsk_ops->syn_ack_timeout(req);
if (req->num_timeout >= max_retries) {
@@ -429,17 +439,26 @@ void tcp_retransmit_timer(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *req;
+ struct sk_buff *skb;
- if (tp->fastopen_rsk) {
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ if (req) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- tcp_fastopen_synack_timer(sk);
+ tcp_fastopen_synack_timer(sk, req);
/* Before we receive ACK to our SYN-ACK don't retransmit
* anything else (e.g., data or FIN segments).
*/
return;
}
- if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk)))
+
+ if (!tp->packets_out)
+ return;
+
+ skb = tcp_rtx_queue_head(sk);
+ if (WARN_ON_ONCE(!skb))
return;
tp->tlp_high_seq = 0;
@@ -473,7 +492,7 @@ void tcp_retransmit_timer(struct sock *sk)
goto out;
}
tcp_enter_loss(sk);
- tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
+ tcp_retransmit_skb(sk, skb, 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
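
With the retransmits_timed_out() hunk above, tcp_model_timeout() is seeded with tcp_timeout_init(sk) while the socket is still in SYN_SENT/SYN_RECV, so a BPF-adjusted initial RTO also scales the overall give-up budget. A rough userspace re-derivation of that budget under exponential backoff with the 120 s ceiling; it ignores jiffies rounding and user timeouts, and the helper name is made up:

#include <stdio.h>

/* Cumulative give-up budget after 'boundary' retransmissions when the
 * retransmission timer starts at rto_base seconds and doubles on every
 * expiry, capped at a 120 s ceiling.  Rough illustration of the backoff
 * model only; not the kernel function.
 */
static double timeout_budget(double rto_base, int boundary)
{
	double rto = rto_base, total = 0.0;
	int i;

	/* include the wait after the last retransmission */
	for (i = 0; i <= boundary; i++) {
		total += rto;
		if (rto < 120.0)
			rto *= 2;
		if (rto > 120.0)
			rto = 120.0;
	}
	return total;
}

int main(void)
{
	/* 6 SYN retransmits, default 1 s initial RTO: ~127 s budget */
	printf("default  : %.1f s\n", timeout_budget(1.0, 6));
	/* same retry count with a BPF-lowered 300 ms initial RTO */
	printf("bpf 300ms: %.1f s\n", timeout_budget(0.3, 6));
	return 0;
}
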
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 4849edb62d52..38d3ad141161 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -92,21 +92,26 @@ void tcp_get_available_ulp(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ulp_ops->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
-void tcp_update_ulp(struct sock *sk, struct proto *proto)
+void tcp_update_ulp(struct sock *sk, struct proto *proto,
+ void (*write_space)(struct sock *sk))
{
struct inet_connection_sock *icsk = inet_csk(sk);
if (!icsk->icsk_ulp_ops) {
+ sk->sk_write_space = write_space;
sk->sk_prot = proto;
return;
}
if (icsk->icsk_ulp_ops->update)
- icsk->icsk_ulp_ops->update(sk, proto);
+ icsk->icsk_ulp_ops->update(sk, proto, write_space);
}
void tcp_cleanup_ulp(struct sock *sk)
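
The ->update() hook now receives the write_space callback to restore alongside the proto, so a ULP being displaced (for example by sockmap) does not leave its own sk_write_space behind. A minimal sketch of an update callback honouring the widened signature; the ULP name is invented and the init/release hooks a real ULP needs are omitted:

#include <linux/module.h>
#include <net/tcp.h>

/* Hypothetical ULP: the subsystem taking over the socket supplies both
 * the proto to install and the write_space callback it wants active,
 * and the ULP puts them in place instead of leaving its overrides behind.
 */
static void demo_ulp_update(struct sock *sk, struct proto *p,
			    void (*write_space)(struct sock *sk))
{
	sk->sk_prot = p;
	sk->sk_write_space = write_space;
}

static struct tcp_ulp_ops demo_ulp_ops __read_mostly = {
	.name	= "demo",
	.owner	= THIS_MODULE,
	.update	= demo_ulp_update,
};

A real ULP would also fill in .init/.release and register the ops with tcp_register_ulp().
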
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d88821c794fb..db76b9609299 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -388,7 +388,7 @@ static int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
return score;
}
@@ -423,12 +423,13 @@ static struct sock *udp4_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif);
if (score > badness) {
- if (sk->sk_reuseport) {
+ if (sk->sk_reuseport &&
+ sk->sk_state != TCP_ESTABLISHED) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (result)
+ if (result && !reuseport_has_conns(sk, false))
return result;
}
badness = score;
@@ -820,6 +821,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
int is_udplite = IS_UDPLITE(sk);
int offset = skb_transport_offset(skb);
int len = skb->len - offset;
+ int datalen = len - sizeof(*uh);
__wsum csum = 0;
/*
@@ -853,10 +855,12 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
return -EIO;
}
- skb_shinfo(skb)->gso_size = cork->gso_size;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh),
- cork->gso_size);
+ if (datalen > cork->gso_size) {
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+ cork->gso_size);
+ }
goto csum_partial;
}
@@ -1130,7 +1134,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4 = &fl4_stack;
- flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
+ flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
faddr, saddr, dport, inet->inet_sport,
@@ -1293,6 +1297,27 @@ out:
#define UDP_SKB_IS_STATELESS 0x80000000
+/* all head states (dst, sk, nf conntrack) except skb extensions are
+ * cleared by udp_rcv().
+ *
+ * We need to preserve secpath, if present, to eventually process
+ * IP_CMSG_PASSSEC at recvmsg() time.
+ *
+ * Other extensions can be cleared.
+ */
+static bool udp_try_make_stateless(struct sk_buff *skb)
+{
+ if (!skb_has_extensions(skb))
+ return true;
+
+ if (!secpath_exists(skb)) {
+ skb_ext_reset(skb);
+ return true;
+ }
+
+ return false;
+}
+
static void udp_set_dev_scratch(struct sk_buff *skb)
{
struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
@@ -1304,14 +1329,24 @@ static void udp_set_dev_scratch(struct sk_buff *skb)
scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
scratch->is_linear = !skb_is_nonlinear(skb);
#endif
- /* all head states execept sp (dst, sk, nf) are always cleared by
- * udp_rcv() and we need to preserve secpath, if present, to eventually
- * process IP_CMSG_PASSSEC at recvmsg() time
- */
- if (likely(!skb_sec_path(skb)))
+ if (udp_try_make_stateless(skb))
scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}
+static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
+{
+ /* We come here after udp_lib_checksum_complete() returned 0.
+ * This means that __skb_checksum_complete() might have
+ * set skb->csum_valid to 1.
+ * On 64bit platforms, we can set csum_unnecessary
+ * to true, but only if the skb is not shared.
+ */
+#if BITS_PER_LONG == 64
+ if (!skb_shared(skb))
+ udp_skb_scratch(skb)->csum_unnecessary = true;
+#endif
+}
+
static int udp_skb_truesize(struct sk_buff *skb)
{
return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
@@ -1333,7 +1368,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
if (likely(partial)) {
up->forward_deficit += size;
size = up->forward_deficit;
- if (size < (sk->sk_rcvbuf >> 2))
+ if (size < (sk->sk_rcvbuf >> 2) &&
+ !skb_queue_empty(&up->reader_queue))
return;
} else {
size += up->forward_deficit;
@@ -1440,7 +1476,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
* queue contains some other skb
*/
rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
- if (rmem > (size + sk->sk_rcvbuf))
+ if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
goto uncharge_drop;
spin_lock(&list->lock);
@@ -1546,10 +1582,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
*total += skb->truesize;
kfree_skb(skb);
} else {
- /* the csum related bits could be changed, refresh
- * the scratch area
- */
- udp_set_dev_scratch(skb);
+ udp_skb_csum_unnecessary_set(skb);
break;
}
}
@@ -1573,7 +1606,7 @@ static int first_packet_length(struct sock *sk)
spin_lock_bh(&rcvq->lock);
skb = __first_packet_length(sk, rcvq, &total);
- if (!skb && !skb_queue_empty(sk_queue)) {
+ if (!skb && !skb_queue_empty_lockless(sk_queue)) {
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, rcvq);
spin_unlock(&sk_queue->lock);
@@ -1646,7 +1679,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
return skb;
}
- if (skb_queue_empty(sk_queue)) {
+ if (skb_queue_empty_lockless(sk_queue)) {
spin_unlock_bh(&queue->lock);
goto busy_check;
}
@@ -1672,11 +1705,12 @@ busy_check:
break;
sk_busy_loop(sk, flags & MSG_DONTWAIT);
- } while (!skb_queue_empty(sk_queue));
+ } while (!skb_queue_empty_lockless(sk_queue));
/* sk_queue is empty, reader_queue may contain peeked packets */
} while (timeo &&
- !__skb_wait_for_more_packets(sk, &error, &timeo,
+ !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
+ &error, &timeo,
(struct sk_buff *)sk_queue));
*err = error;
@@ -1968,7 +2002,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
*/
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto drop;
- nf_reset(skb);
+ nf_reset_ct(skb);
if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
@@ -2072,8 +2106,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
__skb_push(skb, -skb_mac_offset(skb));
segs = udp_rcv_segment(sk, skb, true);
- for (skb = segs; skb; skb = next) {
- next = skb->next;
+ skb_list_walk_safe(segs, skb, next) {
__skb_pull(skb, skb_transport_offset(skb));
ret = udp_queue_rcv_one_skb(sk, skb);
if (ret > 0)
@@ -2297,7 +2330,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- nf_reset(skb);
+ nf_reset_ct(skb);
/* No socket. Drop packet silently, if checksum is wrong */
if (udp_lib_checksum_complete(skb))
@@ -2519,9 +2552,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
case UDP_ENCAP:
switch (val) {
case 0:
+#ifdef CONFIG_XFRM
case UDP_ENCAP_ESPINUDP:
case UDP_ENCAP_ESPINUDP_NON_IKE:
up->encap_rcv = xfrm4_udp_encap_rcv;
+#endif
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
@@ -2708,7 +2743,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
__poll_t mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
- if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
+ if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Check for false positives due to checksum errors */
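
The udp_send_skb() change earlier in this file only marks a send as SKB_GSO_UDP_L4 when the payload is larger than the configured segment size, which matters for sockets that request UDP GSO from userspace. A hedged example of such a sender using the UDP_SEGMENT socket option; the address, port and sizes are placeholders:

#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103		/* from linux/udp.h, for older uapi headers */
#endif

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port = htons(7777),			/* placeholder */
		.sin_addr.s_addr = inet_addr("192.0.2.1"),	/* placeholder */
	};
	char payload[4000];
	int gso_size = 1400;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	/* Ask the stack to slice each send into 1400-byte UDP segments. */
	if (setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT,
		       &gso_size, sizeof(gso_size)) < 0)
		return 1;

	memset(payload, 'x', sizeof(payload));
	/* 4000 bytes > gso_size, so this becomes three on-wire datagrams;
	 * a send that already fits in one segment stays a plain datagram
	 * and, with the fix above, is no longer marked as GSO.
	 */
	sendto(fd, payload, sizeof(payload), 0,
	       (struct sockaddr *)&dst, sizeof(dst));
	close(fd);
	return 0;
}
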
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a3908e55ed89..1a98583a79f4 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -184,6 +184,20 @@ out_unlock:
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);
+static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ unsigned int mss = skb_shinfo(skb)->gso_size;
+
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss);
+
+ return skb;
+}
+
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
netdev_features_t features)
{
@@ -196,6 +210,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
__sum16 check;
__be16 newlen;
+ if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)
+ return __udp_gso_segment_list(gso_skb, features);
+
mss = skb_shinfo(gso_skb)->gso_size;
if (gso_skb->len <= sizeof(*uh) + mss)
return ERR_PTR(-EINVAL);
@@ -354,6 +371,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
struct udphdr *uh2;
struct sk_buff *p;
unsigned int ulen;
+ int ret = 0;
/* requires non zero csum, for symmetry with GSO */
if (!uh->check) {
@@ -369,7 +387,6 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
}
/* pull encapsulating udp header */
skb_gro_pull(skb, sizeof(struct udphdr));
- skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
@@ -383,14 +400,40 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
continue;
}
+ if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return p;
+ }
+
/* Terminate the flow on len mismatch or if it grow "too much".
* Under small packet flood GRO count could elsewhere grow a lot
* leading to excessive truesize values.
* On len mismatch merge the first packet shorter than gso_size,
* otherwise complete the GRO packet.
*/
- if (ulen > ntohs(uh2->len) || skb_gro_receive(p, skb) ||
- ulen != ntohs(uh2->len) ||
+ if (ulen > ntohs(uh2->len)) {
+ pp = p;
+ } else {
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ if (!pskb_may_pull(skb, skb_gro_offset(skb))) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ if ((skb->ip_summed != p->ip_summed) ||
+ (skb->csum_level != p->csum_level)) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ ret = skb_gro_receive_list(p, skb);
+ } else {
+ skb_gro_postpull_rcsum(skb, uh,
+ sizeof(struct udphdr));
+
+ ret = skb_gro_receive(p, skb);
+ }
+ }
+
+ if (ret || ulen != ntohs(uh2->len) ||
NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
pp = p;
@@ -401,36 +444,29 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
return NULL;
}
-INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
- __be16 sport, __be16 dport));
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
- struct udphdr *uh, udp_lookup_t lookup)
+ struct udphdr *uh, struct sock *sk)
{
struct sk_buff *pp = NULL;
struct sk_buff *p;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
- struct sock *sk;
- rcu_read_lock();
- sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
- udp4_lib_lookup_skb, skb, uh->source, uh->dest);
- if (!sk)
- goto out_unlock;
+ if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
+ NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
- if (udp_sk(sk)->gro_enabled) {
+ if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) {
pp = call_gro_receive(udp_gro_receive_segment, head, skb);
- rcu_read_unlock();
return pp;
}
- if (NAPI_GRO_CB(skb)->encap_mark ||
+ if (!sk || NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid) ||
!udp_sk(sk)->gro_receive)
- goto out_unlock;
+ goto out;
/* mark that this skb passed once through the tunnel gro layer */
NAPI_GRO_CB(skb)->encap_mark = 1;
@@ -457,8 +493,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
-out_unlock:
- rcu_read_unlock();
+out:
skb_gro_flush_final(skb, pp, flush);
return pp;
}
@@ -468,8 +503,10 @@ INDIRECT_CALLABLE_SCOPE
struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sk_buff *pp;
+ struct sock *sk;
- if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key))
+ if (unlikely(!uh))
goto flush;
/* Don't bother verifying checksum if we're going to flush anyway. */
@@ -480,11 +517,15 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
inet_gro_compute_pseudo))
goto flush;
else if (uh->check)
- skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
inet_gro_compute_pseudo);
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 0;
- return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
+ rcu_read_lock();
+ sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
+ pp = udp_gro_receive(head, skb, uh, sk);
+ rcu_read_unlock();
+ return pp;
flush:
NAPI_GRO_CB(skb)->flush = 1;
@@ -517,9 +558,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
rcu_read_lock();
sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
udp4_lib_lookup_skb, skb, uh->source, uh->dest);
- if (sk && udp_sk(sk)->gro_enabled) {
- err = udp_gro_complete_segment(skb);
- } else if (sk && udp_sk(sk)->gro_complete) {
+ if (sk && udp_sk(sk)->gro_complete) {
skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
: SKB_GSO_UDP_TUNNEL;
@@ -529,6 +568,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
skb->encapsulation = 1;
err = udp_sk(sk)->gro_complete(sk, skb,
nhoff + sizeof(struct udphdr));
+ } else {
+ err = udp_gro_complete_segment(skb);
}
rcu_read_unlock();
@@ -544,6 +585,23 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ uh->len = htons(skb->len - nhoff);
+
+ skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+ if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
+ skb->csum_level++;
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->csum_level = 0;
+ }
+
+ return 0;
+ }
+
if (uh->check)
uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
iph->daddr, 0);
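
The is_flist / SKB_GSO_FRAGLIST handling above only kicks in when the receiving device advertises fraglist GRO. A small sketch of how a driver might expose the capability (the function name is hypothetical); the feature still has to be enabled at runtime, e.g. through ethtool's rx-gro-list knob:

#include <linux/netdevice.h>

/* Hypothetical driver probe fragment: advertise fraglist GRO so the
 * stack is allowed to coalesce UDP flows into frag_list skbs (the
 * is_flist path above).  It stays off until the administrator runs
 * something like "ethtool -K <dev> rx-gro-list on".
 */
static void demo_setup_gro_fraglist(struct net_device *dev)
{
	dev->hw_features |= NETIF_F_GRO_FRAGLIST;
}
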
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index ecff3fce9807..89ba7c87de5d 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -92,7 +92,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, skb->dev, skb_dst(skb)->dev,
__xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index cdef8f9a3b01..9ebd54752e03 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -85,6 +85,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
RTCF_LOCAL);
xdst->u.rt.rt_type = rt->rt_type;
+ xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
xdst->u.rt.rt_gw_family = rt->rt_gw_family;
if (rt->rt_gw_family == AF_INET)
xdst->u.rt.rt_gw4 = rt->rt_gw4;
@@ -99,12 +100,13 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
}
static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
struct dst_entry *path = xdst->route;
- path->ops->update_pmtu(path, sk, skb, mtu);
+ path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh);
}
static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
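
xfrm4_update_pmtu() now threads a confirm_neigh flag down to the underlying dst_ops, letting callers request the MTU update without the neighbour-confirmation side effect when the event does not actually prove the nexthop is reachable. A sketch of what an ->update_pmtu() implementation honouring the widened signature could look like; the function is hypothetical:

#include <linux/ip.h>
#include <net/dst.h>
#include <net/ip.h>

/* Hypothetical dst_ops method: clamp the cached MTU, and only confirm
 * the neighbour entry when the caller says the trigger was trustworthy.
 */
static void demo_update_pmtu(struct dst_entry *dst, struct sock *sk,
			     struct sk_buff *skb, u32 mtu,
			     bool confirm_neigh)
{
	if (mtu >= IPV4_MIN_MTU && mtu < dst_mtu(dst))
		dst_metric_set(dst, RTAX_MTU, mtu);

	if (confirm_neigh && skb)
		dst_confirm_neigh(dst, &ip_hdr(skb)->daddr);
}
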
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8a4285712808..ea595c8549c7 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -72,6 +72,14 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
if (!head)
goto out;
+ if (!skb_dst(skb)) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev))
+ goto drop;
+ }
+
for_each_protocol_rcu(*head, handler)
if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
return ret;
@@ -79,6 +87,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
out:
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
kfree_skb(skb);
return 0;
}