Diffstat (limited to 'net/ipv4'): 40 files changed, 983 insertions, 391 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 03381f3e12ba..fc816b187170 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -180,8 +180,8 @@ config NET_IPIP config NET_IPGRE_DEMUX tristate "IP: GRE demultiplexer" help - This is helper module to demultiplex GRE packets on GRE version field criteria. - Required by ip_gre and pptp modules. + This is helper module to demultiplex GRE packets on GRE version field criteria. + Required by ip_gre and pptp modules. config NET_IP_TUNNEL tristate @@ -459,200 +459,200 @@ config TCP_CONG_BIC tristate "Binary Increase Congestion (BIC) control" default m ---help--- - BIC-TCP is a sender-side only change that ensures a linear RTT - fairness under large windows while offering both scalability and - bounded TCP-friendliness. The protocol combines two schemes - called additive increase and binary search increase. When the - congestion window is large, additive increase with a large - increment ensures linear RTT fairness as well as good - scalability. Under small congestion windows, binary search - increase provides TCP friendliness. - See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + BIC-TCP is a sender-side only change that ensures a linear RTT + fairness under large windows while offering both scalability and + bounded TCP-friendliness. The protocol combines two schemes + called additive increase and binary search increase. When the + congestion window is large, additive increase with a large + increment ensures linear RTT fairness as well as good + scalability. Under small congestion windows, binary search + increase provides TCP friendliness. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ config TCP_CONG_CUBIC tristate "CUBIC TCP" default y ---help--- - This is version 2.0 of BIC-TCP which uses a cubic growth function - among other techniques. - See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + This is version 2.0 of BIC-TCP which uses a cubic growth function + among other techniques. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf config TCP_CONG_WESTWOOD tristate "TCP Westwood+" default m ---help--- - TCP Westwood+ is a sender-side only modification of the TCP Reno - protocol stack that optimizes the performance of TCP congestion - control. It is based on end-to-end bandwidth estimation to set - congestion window and slow start threshold after a congestion - episode. Using this estimation, TCP Westwood+ adaptively sets a - slow start threshold and a congestion window which takes into - account the bandwidth used at the time congestion is experienced. - TCP Westwood+ significantly increases fairness wrt TCP Reno in - wired networks and throughput over wireless links. + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. config TCP_CONG_HTCP tristate "H-TCP" default m ---help--- - H-TCP is a send-side only modifications of the TCP Reno - protocol stack that optimizes the performance of TCP - congestion control for high speed network links. 
It uses a - modeswitch to change the alpha and beta parameters of TCP Reno - based on network conditions and in a way so as to be fair with - other Reno and H-TCP flows. + H-TCP is a send-side only modifications of the TCP Reno + protocol stack that optimizes the performance of TCP + congestion control for high speed network links. It uses a + modeswitch to change the alpha and beta parameters of TCP Reno + based on network conditions and in a way so as to be fair with + other Reno and H-TCP flows. config TCP_CONG_HSTCP tristate "High Speed TCP" default n ---help--- - Sally Floyd's High Speed TCP (RFC 3649) congestion control. - A modification to TCP's congestion control mechanism for use - with large congestion windows. A table indicates how much to - increase the congestion window by when an ACK is received. - For more detail see http://www.icir.org/floyd/hstcp.html + Sally Floyd's High Speed TCP (RFC 3649) congestion control. + A modification to TCP's congestion control mechanism for use + with large congestion windows. A table indicates how much to + increase the congestion window by when an ACK is received. + For more detail see http://www.icir.org/floyd/hstcp.html config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" default n ---help--- - TCP-Hybla is a sender-side only change that eliminates penalization of - long-RTT, large-bandwidth connections, like when satellite legs are - involved, especially when sharing a common bottleneck with normal - terrestrial connections. + TCP-Hybla is a sender-side only change that eliminates penalization of + long-RTT, large-bandwidth connections, like when satellite legs are + involved, especially when sharing a common bottleneck with normal + terrestrial connections. config TCP_CONG_VEGAS tristate "TCP Vegas" default n ---help--- - TCP Vegas is a sender-side only change to TCP that anticipates - the onset of congestion by estimating the bandwidth. TCP Vegas - adjusts the sending rate by modifying the congestion - window. TCP Vegas should provide less packet loss, but it is - not as aggressive as TCP Reno. + TCP Vegas is a sender-side only change to TCP that anticipates + the onset of congestion by estimating the bandwidth. TCP Vegas + adjusts the sending rate by modifying the congestion + window. TCP Vegas should provide less packet loss, but it is + not as aggressive as TCP Reno. config TCP_CONG_NV - tristate "TCP NV" - default n - ---help--- - TCP NV is a follow up to TCP Vegas. It has been modified to deal with - 10G networks, measurement noise introduced by LRO, GRO and interrupt - coalescence. In addition, it will decrease its cwnd multiplicatively - instead of linearly. + tristate "TCP NV" + default n + ---help--- + TCP NV is a follow up to TCP Vegas. It has been modified to deal with + 10G networks, measurement noise introduced by LRO, GRO and interrupt + coalescence. In addition, it will decrease its cwnd multiplicatively + instead of linearly. - Note that in general congestion avoidance (cwnd decreased when # packets - queued grows) cannot coexist with congestion control (cwnd decreased only - when there is packet loss) due to fairness issues. One scenario when they - can coexist safely is when the CA flows have RTTs << CC flows RTTs. + Note that in general congestion avoidance (cwnd decreased when # packets + queued grows) cannot coexist with congestion control (cwnd decreased only + when there is packet loss) due to fairness issues. 
One scenario when they + can coexist safely is when the CA flows have RTTs << CC flows RTTs. - For further details see http://www.brakmo.org/networking/tcp-nv/ + For further details see http://www.brakmo.org/networking/tcp-nv/ config TCP_CONG_SCALABLE tristate "Scalable TCP" default n ---help--- - Scalable TCP is a sender-side only change to TCP which uses a - MIMD congestion control algorithm which has some nice scaling - properties, though is known to have fairness issues. - See http://www.deneholme.net/tom/scalable/ + Scalable TCP is a sender-side only change to TCP which uses a + MIMD congestion control algorithm which has some nice scaling + properties, though is known to have fairness issues. + See http://www.deneholme.net/tom/scalable/ config TCP_CONG_LP tristate "TCP Low Priority" default n ---help--- - TCP Low Priority (TCP-LP), a distributed algorithm whose goal is - to utilize only the excess network bandwidth as compared to the - ``fair share`` of bandwidth as targeted by TCP. - See http://www-ece.rice.edu/networks/TCP-LP/ + TCP Low Priority (TCP-LP), a distributed algorithm whose goal is + to utilize only the excess network bandwidth as compared to the + ``fair share`` of bandwidth as targeted by TCP. + See http://www-ece.rice.edu/networks/TCP-LP/ config TCP_CONG_VENO tristate "TCP Veno" default n ---help--- - TCP Veno is a sender-side only enhancement of TCP to obtain better - throughput over wireless networks. TCP Veno makes use of state - distinguishing to circumvent the difficult judgment of the packet loss - type. TCP Veno cuts down less congestion window in response to random - loss packets. - See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> + TCP Veno is a sender-side only enhancement of TCP to obtain better + throughput over wireless networks. TCP Veno makes use of state + distinguishing to circumvent the difficult judgment of the packet loss + type. TCP Veno cuts down less congestion window in response to random + loss packets. + See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> config TCP_CONG_YEAH tristate "YeAH TCP" select TCP_CONG_VEGAS default n ---help--- - YeAH-TCP is a sender-side high-speed enabled TCP congestion control - algorithm, which uses a mixed loss/delay approach to compute the - congestion window. It's design goals target high efficiency, - internal, RTT and Reno fairness, resilience to link loss while - keeping network elements load as low as possible. + YeAH-TCP is a sender-side high-speed enabled TCP congestion control + algorithm, which uses a mixed loss/delay approach to compute the + congestion window. It's design goals target high efficiency, + internal, RTT and Reno fairness, resilience to link loss while + keeping network elements load as low as possible. - For further details look here: - http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + For further details look here: + http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf config TCP_CONG_ILLINOIS tristate "TCP Illinois" default n ---help--- - TCP-Illinois is a sender-side modification of TCP Reno for - high speed long delay links. It uses round-trip-time to - adjust the alpha and beta parameters to achieve a higher average - throughput and maintain fairness. + TCP-Illinois is a sender-side modification of TCP Reno for + high speed long delay links. It uses round-trip-time to + adjust the alpha and beta parameters to achieve a higher average + throughput and maintain fairness. 
- For further details see: - http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html + For further details see: + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html config TCP_CONG_DCTCP tristate "DataCenter TCP (DCTCP)" default n ---help--- - DCTCP leverages Explicit Congestion Notification (ECN) in the network to - provide multi-bit feedback to the end hosts. It is designed to provide: + DCTCP leverages Explicit Congestion Notification (ECN) in the network to + provide multi-bit feedback to the end hosts. It is designed to provide: - - High burst tolerance (incast due to partition/aggregate), - - Low latency (short flows, queries), - - High throughput (continuous data updates, large file transfers) with - commodity, shallow-buffered switches. + - High burst tolerance (incast due to partition/aggregate), + - Low latency (short flows, queries), + - High throughput (continuous data updates, large file transfers) with + commodity, shallow-buffered switches. - All switches in the data center network running DCTCP must support - ECN marking and be configured for marking when reaching defined switch - buffer thresholds. The default ECN marking threshold heuristic for - DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets - (~100KB) at 10Gbps, but might need further careful tweaking. + All switches in the data center network running DCTCP must support + ECN marking and be configured for marking when reaching defined switch + buffer thresholds. The default ECN marking threshold heuristic for + DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets + (~100KB) at 10Gbps, but might need further careful tweaking. - For further details see: - http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + For further details see: + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf config TCP_CONG_CDG tristate "CAIA Delay-Gradient (CDG)" default n ---help--- - CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies - the TCP sender in order to: + CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies + the TCP sender in order to: o Use the delay gradient as a congestion signal. o Back off with an average probability that is independent of the RTT. o Coexist with flows that use loss-based congestion control. o Tolerate packet loss unrelated to congestion. - For further details see: - D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using - delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg + For further details see: + D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg config TCP_CONG_BBR tristate "BBR TCP" default n ---help--- - BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to - maximize network utilization and minimize queues. It builds an explicit - model of the the bottleneck delivery rate and path round-trip - propagation delay. It tolerates packet loss and delay unrelated to - congestion. It can operate over LAN, WAN, cellular, wifi, or cable - modem links. It can coexist with flows that use loss-based congestion - control, and can operate with shallow buffers, deep buffers, - bufferbloat, policers, or AQM schemes that do not provide a delay - signal. It requires the fq ("Fair Queue") pacing packet scheduler. + BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to + maximize network utilization and minimize queues. 
It builds an explicit + model of the the bottleneck delivery rate and path round-trip + propagation delay. It tolerates packet loss and delay unrelated to + congestion. It can operate over LAN, WAN, cellular, wifi, or cable + modem links. It can coexist with flows that use loss-based congestion + control, and can operate with shallow buffers, deep buffers, + bufferbloat, policers, or AQM schemes that do not provide a delay + signal. It requires the fq ("Fair Queue") pacing packet scheduler. choice prompt "Default TCP congestion control" diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70f92aaca411..2fe295432c24 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -208,7 +208,7 @@ int inet_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) goto out; - sk->sk_max_ack_backlog = backlog; + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ @@ -495,7 +495,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && snum < inet_prot_sock(net) && + if (snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 9a0fe0c2fa02..4a8550c49202 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -73,7 +73,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len reuseport_has_conns(sk, true); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); - inet->inet_id = jiffies; + inet->inet_id = prandom_u32(); sk_dst_set(sk, &rt->dst); err = 0; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index dde77f72e03e..577db1d50a24 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -70,11 +70,6 @@ fail: fib_free_table(main_table); return -ENOMEM; } - -static bool fib4_has_custom_rules(struct net *net) -{ - return false; -} #else struct fib_table *fib_new_table(struct net *net, u32 id) @@ -131,11 +126,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } return NULL; } - -static bool fib4_has_custom_rules(struct net *net) -{ - return net->ipv4.fib_has_custom_rules; -} #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, @@ -1148,7 +1138,7 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || - prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32) + (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index b804ccbdb241..0c28bd469a68 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -9,12 +9,12 @@ #include <net/netns/ipv4.h> #include <net/ip_fib.h> -int call_fib4_notifier(struct notifier_block *nb, struct net *net, +int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, @@ -34,17 +34,16 @@ static unsigned int fib4_seq_read(struct net *net) return net->ipv4.fib_seq + fib4_rules_seq_read(net); } -static int fib4_dump(struct net *net, struct 
notifier_block *nb) +static int fib4_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib4_rules_dump(net, nb); + err = fib4_rules_dump(net, nb, extack); if (err) return err; - fib_notify(net, nb); - - return 0; + return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b43a7ba5c6a4..f99e3bac5cab 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -65,9 +65,10 @@ bool fib4_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib4_rule_default); -int fib4_rules_dump(struct net *net, struct notifier_block *nb) +int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET); + return fib_rules_dump(net, nb, AF_INET, extack); } unsigned int fib4_rules_seq_read(struct net *net) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0913a090b2bf..f1888c683426 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1814,8 +1814,8 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; struct net *net = dev_net(dev); - int tb_id = l3mdev_fib_table(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1ab2fb6bb37d..b9df9c09b84e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -74,11 +74,13 @@ #include <trace/events/fib.h> #include "fib_lookup.h" -static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -86,7 +88,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .type = fa->fa_type, .tb_id = fa->tb_id, }; - return call_fib4_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, @@ -2015,10 +2017,12 @@ void fib_info_notify_update(struct net *net, struct nl_info *info) } } -static void fib_leaf_notify(struct net *net, struct key_vector *l, - struct fib_table *tb, struct notifier_block *nb) +static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, + struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_alias *fa; + int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; @@ -2032,39 +2036,53 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fa); + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, + KEYLENGTH - fa->fa_slen, + fa, extack); + if (err) + return err; } + return 0; } -static void fib_table_notify(struct net *net, struct fib_table *tb, - struct notifier_block *nb) +static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector 
*l, *tp = t->kv; t_key key = 0; + int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(net, l, tb, nb); + err = fib_leaf_notify(l, tb, nb, extack); + if (err) + return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } + return 0; } -void fib_notify(struct net *net, struct notifier_block *nb) +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { unsigned int h; + int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(net, tb, nb); + hlist_for_each_entry_rcu(tb, head, tb_hlist) { + err = fib_table_notify(tb, nb, extack); + if (err) + return err; + } } + return 0; } static void __trie_free_rcu(struct rcu_head *head) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4298aae74e0e..18068ed42f25 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -249,10 +249,11 @@ bool icmp_global_allow(void) bool rc = false; /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. + * without taking the spinlock. The READ_ONCE() are paired + * with the following WRITE_ONCE() in this same function. */ - if (!icmp_global.credit) { - delta = min_t(u32, now - icmp_global.stamp, HZ); + if (!READ_ONCE(icmp_global.credit)) { + delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); if (delta < HZ / 50) return false; } @@ -262,14 +263,14 @@ bool icmp_global_allow(void) if (delta >= HZ / 50) { incr = sysctl_icmp_msgs_per_sec * delta / HZ ; if (incr) - icmp_global.stamp = now; + WRITE_ONCE(icmp_global.stamp, now); } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { credit--; rc = true; } - icmp_global.credit = credit; + WRITE_ONCE(icmp_global.credit, credit); spin_unlock(&icmp_global.lock); return rc; } @@ -682,7 +683,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) - saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); + saddr = inet_select_addr(dev, iph->saddr, + RT_SCOPE_LINK); else saddr = 0; rcu_read_unlock(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 480d0b22db1a..3b9c7a2725a9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1563,7 +1563,7 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb) } } -static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a9183543ca30..e4c6e8b40490 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -716,7 +716,7 @@ static void reqsk_timer_handler(struct timer_list *t) * ones are about to clog our table. 
*/ qlen = reqsk_queue_len(queue); - if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { + if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { @@ -906,7 +906,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, percpu_counter_inc(sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { - BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if @@ -915,7 +915,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, * Also to satisfy an assertion in * tcp_v4_destroy_sock(). */ - tcp_sk(child)->fastopen_rsk = NULL; + RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL); } inet_csk_destroy_sock(child); } @@ -934,7 +934,7 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) - queue->rskq_accept_head = req; + WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index bbb005eb5218..af154977904c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -193,7 +193,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), - .idiag_wmem = sk->sk_wmem_queued, + .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), .idiag_fmem = sk->sk_forward_alloc, .idiag_tmem = sk_wmem_alloc_get(sk), }; @@ -226,17 +226,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_to_msecs(sk->sk_timer.expires - jiffies); + jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } else { r->idiag_timer = 0; r->idiag_expires = 0; @@ -342,16 +342,13 @@ static int inet_twsk_diag_fill(struct sock *sk, r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_timer.expires - jiffies; - if (tmo < 0) - tmo = 0; - inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; r->idiag_state = tw->tw_substate; r->idiag_timer = 3; - r->idiag_expires = jiffies_to_msecs(tmo); + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; @@ -385,7 +382,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, offsetof(struct sock, sk_cookie)); tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = (tmo >= 0) ? 
jiffies_to_msecs(tmo) : 0; + r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 97824864e40d..83fb00153018 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -240,7 +240,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score = sk->sk_family == PF_INET ? 2 : 1; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index be778599bfed..ff327a62c9ce 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -160,7 +160,12 @@ static void inet_peer_gc(struct inet_peer_base *base, base->total / inet_peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; - delta = (__u32)jiffies - p->dtime; + + /* The READ_ONCE() pairs with the WRITE_ONCE() + * in inet_putpeer() + */ + delta = (__u32)jiffies - READ_ONCE(p->dtime); + if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } @@ -237,7 +242,10 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { - p->dtime = (__u32)jiffies; + /* The WRITE_ONCE() pairs with itself (we run lockless) + * and the READ_ONCE() in inet_peer_gc() + */ + WRITE_ONCE(p->dtime, (__u32)jiffies); if (refcount_dec_and_test(&p->refcnt)) call_rcu(&p->rcu, inetpeer_free_rcu); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 52690bb3e40f..572b6307a2df 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -340,6 +340,8 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + const struct iphdr *tnl_params; + if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, raw_proto, false) < 0) goto drop; @@ -348,7 +350,9 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, skb_pop_mac_header(skb); else skb_reset_mac_header(skb); - if (tunnel->collect_md) { + + tnl_params = &tunnel->parms.iph; + if (tunnel->collect_md || tnl_params->daddr == 0) { __be16 flags; __be64 tun_id; @@ -509,9 +513,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) key = &tun_info->key; if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) goto err_free_skb; - md = ip_tunnel_info_opts(tun_info); - if (!md) + if (tun_info->options_len < sizeof(*md)) goto err_free_skb; + md = ip_tunnel_info_opts(tun_info); /* ERSPAN has fixed 8 byte GRE header */ version = md->version; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c59a78a267c3..aa438c6758a7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -302,16 +302,31 @@ drop: return true; } +static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, + const struct sk_buff *hint) +{ + return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && + ip_hdr(hint)->tos == iph->tos; +} + INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); static int ip_rcv_finish_core(struct net *net, struct sock *sk, - struct sk_buff *skb, struct net_device *dev) + struct sk_buff *skb, struct net_device *dev, + const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); struct rtable *rt; int err; + if (ip_can_use_hint(skb, iph, hint)) { + err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, + dev, 
hint); + if (unlikely(err)) + goto drop_error; + } + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -408,7 +423,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) if (!skb) return NET_RX_SUCCESS; - ret = ip_rcv_finish_core(net, sk, skb, dev); + ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -535,11 +550,20 @@ static void ip_sublist_rcv_finish(struct list_head *head) } } +static struct sk_buff *ip_extract_route_hint(const struct net *net, + struct sk_buff *skb, int rt_type) +{ + if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) + return NULL; + + return skb; +} + static void ip_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { + struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); @@ -554,11 +578,14 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip_rcv(skb); if (!skb) continue; - if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) + if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { + hint = ip_extract_route_hint(net, skb, + ((struct rtable *)dst)->rt_type); + /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); @@ -611,5 +638,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip_sublist_rcv(&sublist, curr_dev, curr_net); + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 28fca408812c..9d83cb320dcb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -422,7 +422,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); @@ -430,7 +430,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, dev, + net, sk, skb, indev, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -645,11 +645,12 @@ void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) EXPORT_SYMBOL(ip_fraglist_prepare); void ip_frag_init(struct sk_buff *skb, unsigned int hlen, - unsigned int ll_rs, unsigned int mtu, + unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state) { struct iphdr *iph = ip_hdr(skb); + state->DF = DF; state->hlen = hlen; state->ll_rs = ll_rs; state->mtu = mtu; @@ -668,9 +669,6 @@ static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; - if (IPCB(from)->flags & IPSKB_FRAG_PMTU) - state->iph->frag_off |= htons(IP_DF); - /* ANK: dirty, but effective trick. 
Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE @@ -738,6 +736,8 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) */ iph = ip_hdr(skb2); iph->frag_off = htons((state->offset >> 3)); + if (state->DF) + iph->frag_off |= htons(IP_DF); /* * Added AC : If we are fragmenting a fragment that's not the @@ -771,6 +771,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; + ktime_t tstamp = skb->tstamp; struct ip_frag_state state; int err = 0; @@ -846,6 +847,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ip_fraglist_prepare(skb, &iter); } + skb->tstamp = tstamp; err = output(net, sk, skb); if (!err) @@ -881,7 +883,8 @@ slow_path: * Fragment the datagram. */ - ip_frag_init(skb, hlen, ll_rs, mtu, &state); + ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU, + &state); /* * Keep copying data until we run out. @@ -900,6 +903,7 @@ slow_path: /* * Put this fragment into the sending queue. */ + skb2->tstamp = tstamp; err = output(net, sk, skb2); if (err) goto fail; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 1452a97914a0..47f8b947eef1 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -34,6 +34,9 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/dst_metadata.h> +#include <net/geneve.h> +#include <net/vxlan.h> +#include <net/erspan.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; @@ -126,15 +129,14 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, if (!md || md->type != METADATA_IP_TUNNEL || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) - return NULL; - res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags); + src = &md->u.tun_info; + res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags); if (!res) return NULL; dst = &res->u.tun_info; - src = &md->u.tun_info; dst->key.tun_id = src->key.tun_id; if (src->mode & IP_TUNNEL_INFO_IPV6) memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, @@ -143,6 +145,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, dst->key.u.ipv4.dst = src->key.u.ipv4.src; dst->key.tun_flags = src->key.tun_flags; dst->mode = src->mode | IP_TUNNEL_INFO_TX; + ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), + src->options_len, 0); return res; } @@ -211,30 +215,243 @@ void ip_tunnel_get_stats64(struct net_device *dev, EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { + [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS }, [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = { + [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED }, + [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { + [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + 
[LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, +}; + +static const struct nla_policy +vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 }, }; +static const struct nla_policy +erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = { + [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, + [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, + [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, +}; + +static int ip_tun_parse_opts_geneve(struct nlattr *attr, + struct ip_tunnel_info *info, int opts_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; + int data_len, err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr, + geneve_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] || + !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] || + !tb[LWTUNNEL_IP_OPT_GENEVE_DATA]) + return -EINVAL; + + attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA]; + data_len = nla_len(attr); + if (data_len % 4) + return -EINVAL; + + if (info) { + struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len; + + memcpy(opt->opt_data, nla_data(attr), data_len); + opt->length = data_len / 4; + attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]; + opt->opt_class = nla_get_be16(attr); + attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; + opt->type = nla_get_u8(attr); + info->key.tun_flags |= TUNNEL_GENEVE_OPT; + } + + return sizeof(struct geneve_opt) + data_len; +} + +static int ip_tun_parse_opts_vxlan(struct nlattr *attr, + struct ip_tunnel_info *info, int opts_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr, + vxlan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP]) + return -EINVAL; + + if (info) { + struct vxlan_metadata *md = + ip_tunnel_info_opts(info) + opts_len; + + attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; + md->gbp = nla_get_u32(attr); + info->key.tun_flags |= TUNNEL_VXLAN_OPT; + } + + return sizeof(struct vxlan_metadata); +} + +static int ip_tun_parse_opts_erspan(struct nlattr *attr, + struct ip_tunnel_info *info, int opts_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; + int err; + u8 ver; + + err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr, + erspan_opt_policy, extack); + if (err) + return err; + + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER]) + return -EINVAL; + + ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]); + if (ver == 1) { + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) + return -EINVAL; + } else if (ver == 2) { + if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] || + !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) + return -EINVAL; + } else { + return -EINVAL; + } + + if (info) { + struct erspan_metadata *md = + ip_tunnel_info_opts(info) + opts_len; + + md->version = ver; + if (ver == 1) { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]; + md->u.index = nla_get_be32(attr); + } else { + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]; + md->u.md2.dir = nla_get_u8(attr); + attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]; + set_hwid(&md->u.md2, nla_get_u8(attr)); + } + + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + } + + return sizeof(struct erspan_metadata); +} + +static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + int err, rem, opt_len, opts_len = 0, type = 0; + struct nlattr *nla; + + if (!attr) + return 0; + + err = 
nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX, + ip_opts_policy, extack); + if (err) + return err; + + nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(nla)) { + case LWTUNNEL_IP_OPTS_GENEVE: + if (type && type != TUNNEL_GENEVE_OPT) + return -EINVAL; + opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + if (opts_len > IP_TUNNEL_OPTS_MAX) + return -EINVAL; + type = TUNNEL_GENEVE_OPT; + break; + case LWTUNNEL_IP_OPTS_VXLAN: + if (type) + return -EINVAL; + opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_VXLAN_OPT; + break; + case LWTUNNEL_IP_OPTS_ERSPAN: + if (type) + return -EINVAL; + opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len, + extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + type = TUNNEL_ERSPAN_OPT; + break; + default: + return -EINVAL; + } + } + + return opts_len; +} + +static int ip_tun_get_optlen(struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, NULL, extack); +} + +static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info, + struct netlink_ext_ack *extack) +{ + return ip_tun_parse_opts(attr, info, extack); +} + static int ip_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -242,6 +459,12 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + #ifdef CONFIG_DST_CACHE err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL); if (err) { @@ -266,10 +489,12 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]); + tun_info->key.tun_flags |= + (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) & + ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -285,6 +510,114 @@ static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate) #endif } +static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct geneve_opt *opt; + struct nlattr *nest; + int offset = 0; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE); + if (!nest) + return -ENOMEM; + + while (tun_info->options_len > offset) { + opt = ip_tunnel_info_opts(tun_info) + offset; + if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, + opt->opt_class) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || + nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, + opt->opt_data)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + 
} + offset += sizeof(*opt) + opt->length * 4; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct vxlan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) { + nla_nest_cancel(skb, nest); + return -ENOMEM; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct erspan_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN); + if (!nest) + return -ENOMEM; + + md = ip_tunnel_info_opts(tun_info); + if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) + goto err; + + if (md->version == 1 && + nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index)) + goto err; + + if (md->version == 2 && + (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) || + nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID, + get_hwid(&md->u.md2)))) + goto err; + + nla_nest_end(skb, nest); + return 0; +err: + nla_nest_cancel(skb, nest); + return -ENOMEM; +} + +static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, + struct ip_tunnel_info *tun_info) +{ + struct nlattr *nest; + int err = 0; + + if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + return 0; + + nest = nla_nest_start_noflag(skb, type); + if (!nest) + return -ENOMEM; + + if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) + err = ip_tun_fill_encap_opts_geneve(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT) + err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); + else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT) + err = ip_tun_fill_encap_opts_erspan(skb, tun_info); + + if (err) { + nla_nest_cancel(skb, nest); + return err; + } + + nla_nest_end(skb, nest); + return 0; +} + static int ip_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { @@ -296,12 +629,52 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; return 0; } +static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) +{ + int opt_len; + + if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + return 0; + + opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + struct geneve_opt *opt; + int offset = 0; + + opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */ + while (info->options_len > offset) { + opt = ip_tunnel_info_opts(info) + offset; + opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */ + + nla_total_size(1) /* OPT_GENEVE_TYPE */ + + nla_total_size(opt->length * 4); + /* OPT_GENEVE_DATA */ + offset += sizeof(*opt) + opt->length * 4; + } + } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + + nla_total_size(4); /* OPT_VXLAN_GBP */ + } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + struct erspan_metadata *md = ip_tunnel_info_opts(info); + + opt_len += nla_total_size(0) /* 
LWTUNNEL_IP_OPTS_ERSPAN */ + + nla_total_size(1) /* OPT_ERSPAN_VER */ + + (md->version == 1 ? nla_total_size(4) + /* OPT_ERSPAN_INDEX (v1) */ + : nla_total_size(1) + + nla_total_size(1)); + /* OPT_ERSPAN_DIR + HWID (v2) */ + } + + return opt_len; +} + static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */ @@ -309,13 +682,21 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP_OPTS */ } static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { - return memcmp(lwt_tun_info(a), lwt_tun_info(b), - sizeof(struct ip_tunnel_info)); + struct ip_tunnel_info *info_a = lwt_tun_info(a); + struct ip_tunnel_info *info_b = lwt_tun_info(b); + + return memcmp(info_a, info_b, sizeof(info_a->key)) || + info_a->mode != info_b->mode || + info_a->options_len != info_b->options_len || + memcmp(ip_tunnel_info_opts(info_a), + ip_tunnel_info_opts(info_b), info_a->options_len); } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { @@ -328,12 +709,14 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { }; static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { + [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS }, [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED }, }; static int ip6_tun_build_state(struct nlattr *attr, @@ -341,17 +724,21 @@ static int ip6_tun_build_state(struct nlattr *attr, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { - struct ip_tunnel_info *tun_info; - struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; - int err; + struct lwtunnel_state *new_state; + struct ip_tunnel_info *tun_info; + int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, extack); if (err < 0) return err; - new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack); + if (opt_len < 0) + return opt_len; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; @@ -359,6 +746,12 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); + err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack); + if (err < 0) { + lwtstate_free(new_state); + return err; + } + if (tb[LWTUNNEL_IP6_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]); @@ -375,10 +768,12 @@ static int ip6_tun_build_state(struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); if (tb[LWTUNNEL_IP6_FLAGS]) - tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); + tun_info->key.tun_flags |= + (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) & + ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; - tun_info->options_len = 0; + tun_info->options_len = opt_len; *ts = new_state; @@ -396,7 +791,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, 
&tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) || + ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; return 0; @@ -409,7 +805,9 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ + + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */ + + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); + /* LWTUNNEL_IP6_OPTS */ } static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 9bcca08efec9..f35308ff84c3 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1412,6 +1412,9 @@ static int __init wait_for_devices(void) struct net_device *dev; int found = 0; + /* make sure deferred device probes are finished */ + wait_for_device_probe(); + rtnl_lock(); for_each_netdev(&init_net, dev) { if (ic_is_init_dev(dev)) { @@ -1483,10 +1486,10 @@ static int __init ip_auto_config(void) * missing values. */ if (ic_myaddr == NONE || -#ifdef CONFIG_ROOT_NFS +#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT) (root_server_addr == NONE && ic_servaddr == NONE && - ROOT_DEV == Root_NFS) || + (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC @@ -1513,6 +1516,12 @@ static int __init ip_auto_config(void) goto try_try_again; } #endif +#ifdef CONFIG_CIFS_ROOT + if (ROOT_DEV == Root_CIFS) { + pr_err("IP-Config: Retrying forever (CIFS root)...\n"); + goto try_try_again; + } +#endif if (--retries) { pr_err("IP-Config: Reopening network devices...\n"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 716d5472c022..6e68def66822 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -278,9 +278,10 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } static unsigned int ipmr_rules_seq_read(struct net *net) @@ -336,7 +337,8 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -2289,7 +2291,8 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, rcu_read_unlock(); return -ENODEV; } - skb2 = skb_clone(skb, GFP_ATOMIC); + + skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr)); if (!skb2) { read_unlock(&mrt_lock); rcu_read_unlock(); @@ -3040,10 +3043,11 @@ static unsigned int ipmr_seq_read(struct net *net) return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); } -static int ipmr_dump(struct net *net, struct notifier_block *nb) +static int ipmr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, - ipmr_mr_table_iter, &mrt_lock); + ipmr_mr_table_iter, &mrt_lock, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git 
a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index ea48bd15a575..aa8738a91210 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -386,15 +386,17 @@ EXPORT_SYMBOL(mr_rtm_dumproute); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock) + rwlock_t *mrt_lock, + struct netlink_ext_ack *extack) { struct mr_table *mrt; int err; - err = rules_dump(net, nb); + err = rules_dump(net, nb, extack); if (err) return err; @@ -409,17 +411,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, if (!v->dev) continue; - mr_call_vif_notifier(nb, net, family, - FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); + err = mr_call_vif_notifier(nb, family, + FIB_EVENT_VIF_ADD, + v, vifi, mrt->id, extack); + if (err) + break; } read_unlock(mrt_lock); + if (err) + return err; + /* Notify on table MFC entries */ - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - mr_call_mfc_notifier(nb, net, family, - FIB_EVENT_ENTRY_ADD, - mfc, mrt->id); + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + err = mr_call_mfc_notifier(nb, family, + FIB_EVENT_ENTRY_ADD, + mfc, mrt->id, extack); + if (err) + return err; + } } return 0; diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 012c4047c788..e32e41b99f0f 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -9,6 +9,8 @@ static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_ipv4, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 36a28d46149c..c94445b44d8c 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -31,16 +31,8 @@ extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, if (icmph == NULL) return 1; - switch (icmph->type) { - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_REDIRECT: - case ICMP_TIME_EXCEEDED: - case ICMP_PARAMETERPROB: - break; - default: + if (!icmp_is_err(icmph->type)) return 1; - } inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(struct icmphdr), diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index fc34fd1668d6..511eaa94e2d1 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -23,7 +23,6 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { - [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 }, [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 14654876127e..f88c93c38f11 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1482,7 +1482,7 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { - dst_dev_put(&orig->dst); + rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { @@ -1894,10 +1894,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, if (!icmph) goto out; - if (icmph->type != 
ICMP_DEST_UNREACH && - icmph->type != ICMP_REDIRECT && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB) + if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, @@ -2022,10 +2019,52 @@ static int ip_mkroute_input(struct sk_buff *skb, return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } +/* Implements all the saddr-related checks as ip_route_input_slow(), + * assuming daddr is valid and the destination is not a local broadcast one. + * Uses the provided hint instead of performing a route lookup. + */ +int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + const struct sk_buff *hint) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct rtable *rt = (struct rtable *)hint; + struct net *net = dev_net(dev); + int err = -EINVAL; + u32 tag = 0; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + goto martian_source; + + if (ipv4_is_zeronet(saddr)) + goto martian_source; + + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_source; + + if (rt->rt_type != RTN_LOCAL) + goto skip_validate_source; + + tos &= IPTOS_RT_MASK; + err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); + if (err < 0) + goto martian_source; + +skip_validate_source: + skb_dst_copy(skb, hint); + return 0; + +martian_source: + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); + return err; +} + /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. + * Changes in the enforced policies must be applied also to + * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. @@ -2470,14 +2509,17 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; - int err = -ENETUNREACH; + int err; if (fl4->saddr) { - rth = ERR_PTR(-EINVAL); if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr) || - ipv4_is_zeronet(fl4->saddr)) + ipv4_is_zeronet(fl4->saddr)) { + rth = ERR_PTR(-EINVAL); goto out; + } + + rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 535b69326f66..345b2b0ff618 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -62,10 +62,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. 
*/ -u64 cookie_init_timestamp(struct request_sock *req) +u64 cookie_init_timestamp(struct request_sock *req, u64 now) { struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_time_stamp_raw(); + u32 ts, ts_now = tcp_ns_to_ts(now); u32 options = 0; ireq = inet_rsk(req); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 59ded25acd04..fcb2cd167f64 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -340,6 +340,10 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, user_key[i * 4 + 1], user_key[i * 4 + 2], user_key[i * 4 + 3]); + + if (WARN_ON_ONCE(off >= tbl.maxlen - 1)) + break; + if (i + 1 < n_keys) off += snprintf(tbl.data + off, tbl.maxlen - off, ","); } @@ -1037,7 +1041,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = &two, }, #endif { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f98a1882e537..8a39ee794891 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -326,7 +326,7 @@ void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; - if (tcp_memory_pressure) + if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; @@ -341,7 +341,7 @@ void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; - if (!tcp_memory_pressure) + if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) @@ -450,8 +450,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; + WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); + WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); sk_sockets_allocated_inc(sk); sk->sk_route_forced_caps = NETIF_F_GSO; @@ -477,7 +477,7 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, int target, struct sock *sk) { - return (tp->rcv_nxt - tp->copied_seq >= target) || + return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) || (sk->sk_prot->stream_memory_read ? sk->sk_prot->stream_memory_read(sk) : false); } @@ -543,10 +543,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Connected or passive Fast Open socket? 
*/ if (state != TCP_SYN_SENT && - (state != TCP_SYN_RECV || tp->fastopen_rsk)) { + (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); - if (tp->urg_seq == tp->copied_seq && + if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE) && tp->urg_data) target++; @@ -584,7 +584,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; @@ -607,7 +607,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) unlock_sock_fast(sk, slow); break; case SIOCATMARK: - answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + answ = tp->urg_data && + READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) @@ -616,7 +617,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_una; + answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) @@ -625,7 +626,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_nxt; + answ = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; @@ -657,7 +659,7 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->sacked = 0; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; @@ -1032,10 +1034,10 @@ new_segment: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - sk->sk_wmem_queued += copy; + sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -1362,7 +1364,7 @@ new_segment: if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -1668,9 +1670,9 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_eat_skb(sk, skb); if (!desc->count) break; - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); } - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); @@ -1699,7 +1701,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) else cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; val = min(val, cap); - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); @@ -1709,7 +1711,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) val <<= 1; if (val > sk->sk_rcvbuf) { - sk->sk_rcvbuf = val; + WRITE_ONCE(sk->sk_rcvbuf, val); tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); } return 0; @@ -1739,8 +1741,8 @@ static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { unsigned long address = (unsigned long)zc->address; + u32 length = 0, seq, offset, zap_len; const skb_frag_t *frags = NULL; - u32 length = 0, seq, offset; struct vm_area_struct *vma; struct sk_buff *skb = NULL; struct tcp_sock *tp; @@ -1767,12 +1769,12 @@ static int tcp_zerocopy_receive(struct sock *sk, seq = tp->copied_seq; inq = tcp_inq(sk); zc->length = min_t(u32, zc->length, inq); - zc->length &= ~(PAGE_SIZE - 1); - if (zc->length) { - zap_page_range(vma, address, zc->length); + zap_len = zc->length & ~(PAGE_SIZE - 1); + if (zap_len) { + zap_page_range(vma, address, zap_len); zc->recv_skip_hint = 0; } else { - zc->recv_skip_hint = inq; + zc->recv_skip_hint = zc->length; } ret = 0; while (length + PAGE_SIZE <= zc->length) { @@ -1819,7 +1821,7 @@ static int tcp_zerocopy_receive(struct sock *sk, out: up_read(&current->mm->mmap_sem); if (length) { - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ @@ -1862,29 +1864,33 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, if (sock_flag(sk, SOCK_RCVTSTAMP)) { if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { - struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec}; - + struct __kernel_timespec kts = { + .tv_sec = tss->ts[0].tv_sec, + .tv_nsec = tss->ts[0].tv_nsec, + }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { - struct timespec ts_old = timespec64_to_timespec(tss->ts[0]); - + struct __kernel_old_timespec ts_old = { + .tv_sec = tss->ts[0].tv_sec, + .tv_nsec = tss->ts[0].tv_nsec, + }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); } } else { if (new_tstamp) { - struct __kernel_sock_timeval stv; - - stv.tv_sec = tss->ts[0].tv_sec; - stv.tv_usec = tss->ts[0].tv_nsec / 1000; + struct __kernel_sock_timeval stv = { + .tv_sec = tss->ts[0].tv_sec, + .tv_usec = tss->ts[0].tv_nsec / 1000, + }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { - struct __kernel_old_timeval tv; - - tv.tv_sec = tss->ts[0].tv_sec; - tv.tv_usec = tss->ts[0].tv_nsec / 1000; + struct __kernel_old_timeval tv = { + .tv_sec = tss->ts[0].tv_sec, + .tv_usec = tss->ts[0].tv_nsec / 1000, + }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } @@ -1956,13 +1962,12 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, struct sk_buff *skb, *last; u32 urg_hole = 0; struct scm_timestamping_internal tss; - bool has_tss = false; - bool has_cmsg; + int cmsg_flags; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); - if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) sk_busy_loop(sk, nonblock); @@ -1972,7 +1977,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (sk->sk_state == TCP_LISTEN) goto out; - has_cmsg = tp->recvmsg_inq; + cmsg_flags = tp->recvmsg_inq ? 
1 : 0; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -2045,7 +2050,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { @@ -2117,7 +2122,7 @@ found_ok_skb: if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { - ++*seq; + WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; @@ -2139,7 +2144,7 @@ found_ok_skb: } } - *seq += used; + WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; @@ -2155,8 +2160,7 @@ skip_copy: if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, &tss); - has_tss = true; - has_cmsg = true; + cmsg_flags |= 2; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; @@ -2166,7 +2170,7 @@ skip_copy: found_fin_ok: /* Process the FIN. */ - ++*seq; + WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; @@ -2181,10 +2185,10 @@ found_fin_ok: release_sock(sk); - if (has_cmsg) { - if (has_tss) + if (cmsg_flags) { + if (cmsg_flags & 2) tcp_recv_timestamp(msg, sk, &tss); - if (tp->recvmsg_inq) { + if (cmsg_flags & 1) { inq = tcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } @@ -2487,7 +2491,10 @@ adjudge_to_death: } if (sk->sk_state == TCP_CLOSE) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. @@ -2559,6 +2566,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); @@ -2585,7 +2593,7 @@ int tcp_disconnect(struct sock *sk, int flags) __kfree_skb(sk->sk_rx_skb_cache); sk->sk_rx_skb_cache = NULL; } - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); @@ -2601,9 +2609,12 @@ int tcp_disconnect(struct sock *sk, int flags) tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; - tp->write_seq += tp->max_window + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + + seq = tp->write_seq + tp->max_window + 2; + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); + icsk->icsk_backoff = 0; tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; @@ -2657,6 +2668,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); inet->defer_connect = 0; + tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); @@ -2930,9 +2942,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (sk->sk_state != TCP_CLOSE) err = -EPERM; else if (tp->repair_queue == TCP_SEND_QUEUE) - tp->write_seq = val; + WRITE_ONCE(tp->write_seq, val); else if (tp->repair_queue == TCP_RECV_QUEUE) - tp->rcv_nxt = val; + WRITE_ONCE(tp->rcv_nxt, val); else err = -EINVAL; break; @@ -3215,8 +3227,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ - info->tcpi_unacked = sk->sk_ack_backlog; - info->tcpi_sacked = 
sk->sk_max_ack_backlog; + info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); + info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } @@ -3296,6 +3308,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3831,7 +3844,13 @@ EXPORT_SYMBOL(tcp_md5_hash_key); void tcp_done(struct sock *sk) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + /* We might be called with a new socket, after + * inet_csk_prepare_forced_close() has been called + * so we can not use lockdep_sock_is_held(sk) + */ + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 8a56e09cfb0e..e38705165ac9 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -301,7 +301,7 @@ EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = msg->sg.start == msg->sg.end; + bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; u32 tosend, delta = 0; int ret; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c445a81d144e..3737ec096650 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -256,6 +256,9 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } @@ -285,6 +288,9 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? 
"" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 81a8221d650a..0d08f9e2d8d0 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -21,13 +21,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, struct tcp_info *info = _info; if (inet_sk_state_load(sk) == TCP_LISTEN) { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; + r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); - r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); - r->idiag_wqueue = tp->write_seq - tp->snd_una; + r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); + r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una; } if (info) tcp_get_info(sk, info); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 3fd451271a70..19ad9586c720 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -253,7 +253,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, */ tp = tcp_sk(child); - tp->fastopen_rsk = req; + rcu_assign_pointer(tp->fastopen_rsk, req); tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never @@ -422,7 +422,10 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, cookie->len = -1; return true; } - return cookie->len > 0; + if (cookie->len > 0) + return true; + tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE; + return false; } /* This function checks if we want to defer sending SYN until the first diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3578357abe30..88b987ca9ebb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -359,7 +359,8 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); + WRITE_ONCE(sk->sk_sndbuf, + min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -483,8 +484,9 @@ static void tcp_clamp_window(struct sock *sk) !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - net->ipv4.sysctl_tcp_rmem[2]); + WRITE_ONCE(sk->sk_rcvbuf, + min(atomic_read(&sk->sk_rmem_alloc), + net->ipv4.sysctl_tcp_rmem[2])); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -648,7 +650,7 @@ void tcp_rcv_space_adjust(struct sock *sk) rcvbuf = min_t(u64, rcvwin * rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { - sk->sk_rcvbuf = rcvbuf; + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make the window clamp follow along. 
*/ tp->window_clamp = tcp_win_from_space(sk, rcvbuf); @@ -2666,7 +2668,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); - if ((flag & FLAG_SND_UNA_ADVANCED || tp->fastopen_rsk) && + if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && tcp_try_undo_loss(sk, false)) return; @@ -2990,7 +2992,7 @@ void tcp_rearm_rto(struct sock *sk) /* If the retrans timer is currently being used by Fast Open * for SYN-ACK retrans purpose, stay put. */ - if (tp->fastopen_rsk) + if (rcu_access_pointer(tp->fastopen_rsk)) return; if (!tp->packets_out) { @@ -3362,7 +3364,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) sock_owned_by_me((struct sock *)tp); tp->bytes_received += delta; - tp->rcv_nxt = seq; + WRITE_ONCE(tp->rcv_nxt, seq); } /* Update our send window. @@ -5356,7 +5358,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) } tp->urg_data = TCP_URG_NOTYET; - tp->urg_seq = ptr; + WRITE_ONCE(tp->urg_seq, ptr); /* Disable header prediction. */ tp->pred_flags = 0; @@ -5812,6 +5814,10 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ + if (tp->total_retrans) + tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; + else + tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; skb_rbtree_walk_from(data) { if (__tcp_retransmit_skb(sk, data, 1)) break; @@ -5932,7 +5938,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* Ok.. it's good. Set up sequence numbers and * move to established. */ - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -5961,7 +5967,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); smc_check_reset_syn(tp); @@ -6035,8 +6041,8 @@ discard: tp->tcp_header_len = sizeof(struct tcphdr); } - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -6087,6 +6093,8 @@ reset_and_undo: static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { + struct request_sock *req; + tcp_try_undo_loss(sk, false); /* Reset rtx states to prevent spurious retransmits_timed_out() */ @@ -6096,7 +6104,9 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. */ - reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false); + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); + reqsk_fastopen_remove(sk, req, false); /* Re-arm the timer because data may have been sent out. 
* This is similar to the regular data transmission case @@ -6171,7 +6181,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_mstamp_refresh(tp); tp->rx_opt.saw_tstamp = 0; - req = tp->fastopen_rsk; + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); if (req) { bool req_stolen; @@ -6211,7 +6222,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bf124b1742df..92282f98dc82 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -121,11 +121,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { if (ipv6_addr_loopback(&tw->tw_v6_daddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && - (tw->tw_v6_daddr.s6_addr[12] == 127)) || + ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || - (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && - (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) + ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) loopback = true; } else #endif @@ -164,9 +162,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) * without appearing to create any others. */ if (likely(!tp->repair)) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + u32 seq = tcptw->tw_snd_nxt + 65535 + 2; + + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); tp->rx_opt.ts_recent = tcptw->tw_ts_recent; tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; } @@ -253,7 +253,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; if (likely(!tp->repair)) - tp->write_seq = 0; + WRITE_ONCE(tp->write_seq, 0); } inet->inet_dport = usin->sin_port; @@ -291,16 +291,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (likely(!tp->repair)) { if (!tp->write_seq) - tp->write_seq = secure_tcp_seq(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port); + WRITE_ONCE(tp->write_seq, + secure_tcp_seq(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port)); tp->tsoffset = secure_tcp_ts_off(sock_net(sk), inet->inet_saddr, inet->inet_daddr); } - inet->inet_id = tp->write_seq ^ jiffies; + inet->inet_id = prandom_u32(); if (tcp_fastopen_defer_connect(sk, &err)) return err; @@ -478,7 +479,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ - fastopen = tp->fastopen_rsk; + fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? 
tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { @@ -1447,7 +1448,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - newinet->inet_id = newtp->write_seq ^ jiffies; + newinet->inet_id = prandom_u32(); if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); @@ -1644,7 +1645,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { - u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; + u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -2121,7 +2122,7 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - BUG_ON(tp->fastopen_rsk); + BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); @@ -2450,17 +2451,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) state = inet_sk_state_load(sk); if (state == TCP_LISTEN) - rx_queue = sk->sk_ack_backlog; + rx_queue = READ_ONCE(sk->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ - rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, state, - tp->write_seq - tp->snd_una, + READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), @@ -2677,7 +2679,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; - net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); + net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bb140a5db8c0..c802bc80c400 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -462,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; struct tcp_sock *oldtp, *newtp; + u32 seq; if (!newsk) return NULL; @@ -475,12 +476,16 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, /* Now setup tcp_sock */ newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = - newtp->rcv_nxt = treq->rcv_isn + 1; + seq = treq->rcv_isn + 1; + newtp->rcv_wup = seq; + WRITE_ONCE(newtp->copied_seq, seq); + WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + seq = treq->snt_isn + 1; + newtp->snd_sml = newtp->snd_una = seq; + WRITE_ONCE(newtp->snd_nxt, seq); + newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); @@ -495,7 +500,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if 
(sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, @@ -541,7 +546,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; - newtp->fastopen_rsk = NULL; + RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fec6d67bfd14..be6d22b8190f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -67,7 +67,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); @@ -1196,10 +1196,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ - tp->write_seq = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } @@ -1333,7 +1333,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len - nsize; buff->truesize += nlen; @@ -1443,7 +1443,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; - sk->sk_wmem_queued -= delta_truesize; + sk_wmem_queued_add(sk, -delta_truesize); sk_mem_uncharge(sk, delta_truesize); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); } @@ -1888,7 +1888,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, return -ENOMEM; skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -2152,7 +2152,7 @@ static int tcp_mtu_probe(struct sock *sk) nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); @@ -2482,7 +2482,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (tp->fastopen_rsk) + if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; @@ -3142,7 +3142,7 @@ void tcp_send_fin(struct sock *sk) * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. 
*/ - tp->snd_nxt++; + WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { @@ -3222,7 +3222,7 @@ int tcp_send_synack(struct sock *sk) tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } @@ -3290,7 +3290,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp_ns = cookie_init_timestamp(req); + skb->skb_mstamp_ns = cookie_init_timestamp(req, now); else #endif { @@ -3426,14 +3426,14 @@ static void tcp_connect_init(struct sock *sk) tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; @@ -3447,9 +3447,9 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); - tp->write_seq = tcb->end_seq; + WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } @@ -3586,11 +3586,11 @@ int tcp_connect(struct sock *sk) /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { - tp->snd_nxt = TCP_SKB_CB(buff)->seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 05be564414e9..dd5a6317a801 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -386,15 +386,13 @@ abort: tcp_write_err(sk); * Timer for Fast Open socket to retransmit SYNACK. Note that the * sk here is the child socket, not the parent (listener) socket. */ -static void tcp_fastopen_synack_timer(struct sock *sk) +static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) { struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct tcp_sock *tp = tcp_sk(sk); - struct request_sock *req; - req = tcp_sk(sk)->fastopen_rsk; req->rsk_ops->syn_ack_timeout(req); if (req->num_timeout >= max_retries) { @@ -435,11 +433,14 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *req; - if (tp->fastopen_rsk) { + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - tcp_fastopen_synack_timer(sk); + tcp_fastopen_synack_timer(sk, req); /* Before we receive ACK to our SYN-ACK don't retransmit * anything else (e.g., data or FIN segments). 
*/ diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 4849edb62d52..12ab5db2b71c 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -92,6 +92,9 @@ void tcp_get_available_ulp(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ulp_ops->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 14bc654b6842..4da5758cc718 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -388,7 +388,7 @@ static int compute_score(struct sock *sk, struct net *net, return -1; score += 4; - if (sk->sk_incoming_cpu == raw_smp_processor_id()) + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; } @@ -1297,6 +1297,27 @@ out: #define UDP_SKB_IS_STATELESS 0x80000000 +/* all head states (dst, sk, nf conntrack) except skb extensions are + * cleared by udp_rcv(). + * + * We need to preserve secpath, if present, to eventually process + * IP_CMSG_PASSSEC at recvmsg() time. + * + * Other extensions can be cleared. + */ +static bool udp_try_make_stateless(struct sk_buff *skb) +{ + if (!skb_has_extensions(skb)) + return true; + + if (!secpath_exists(skb)) { + skb_ext_reset(skb); + return true; + } + + return false; +} + static void udp_set_dev_scratch(struct sk_buff *skb) { struct udp_dev_scratch *scratch = udp_skb_scratch(skb); @@ -1308,14 +1329,24 @@ static void udp_set_dev_scratch(struct sk_buff *skb) scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); #endif - /* all head states execept sp (dst, sk, nf) are always cleared by - * udp_rcv() and we need to preserve secpath, if present, to eventually - * process IP_CMSG_PASSSEC at recvmsg() time - */ - if (likely(!skb_sec_path(skb))) + if (udp_try_make_stateless(skb)) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } +static void udp_skb_csum_unnecessary_set(struct sk_buff *skb) +{ + /* We come here after udp_lib_checksum_complete() returned 0. + * This means that __skb_checksum_complete() might have + * set skb->csum_valid to 1. + * On 64bit platforms, we can set csum_unnecessary + * to true, but only if the skb is not shared. 
+ */ +#if BITS_PER_LONG == 64 + if (!skb_shared(skb)) + udp_skb_scratch(skb)->csum_unnecessary = true; +#endif +} + static int udp_skb_truesize(struct sk_buff *skb) { return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; @@ -1550,10 +1581,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, *total += skb->truesize; kfree_skb(skb); } else { - /* the csum related bits could be changed, refresh - * the scratch area - */ - udp_set_dev_scratch(skb); + udp_skb_csum_unnecessary_set(skb); break; } } @@ -1577,7 +1605,7 @@ static int first_packet_length(struct sock *sk) spin_lock_bh(&rcvq->lock); skb = __first_packet_length(sk, rcvq, &total); - if (!skb && !skb_queue_empty(sk_queue)) { + if (!skb && !skb_queue_empty_lockless(sk_queue)) { spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, rcvq); spin_unlock(&sk_queue->lock); @@ -1650,7 +1678,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, return skb; } - if (skb_queue_empty(sk_queue)) { + if (skb_queue_empty_lockless(sk_queue)) { spin_unlock_bh(&queue->lock); goto busy_check; } @@ -1676,7 +1704,7 @@ busy_check: break; sk_busy_loop(sk, flags & MSG_DONTWAIT); - } while (!skb_queue_empty(sk_queue)); + } while (!skb_queue_empty_lockless(sk_queue)); /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && @@ -2523,9 +2551,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, case UDP_ENCAP: switch (val) { case 0: +#ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->encap_rcv = xfrm4_udp_encap_rcv; +#endif /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; @@ -2712,7 +2742,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; - if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) + if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index ecff3fce9807..89ba7c87de5d 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -92,7 +92,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); }
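
A recurring pattern in the tcp.c, tcp_diag.c, tcp_ipv4.c and tcp_output.c hunks above is the conversion of plain loads and stores of fields such as tp->write_seq, tp->copied_seq, tp->rcv_nxt, tp->snd_nxt, sk->sk_rcvbuf and sk->sk_ack_backlog into READ_ONCE()/WRITE_ONCE() pairs, because tcp_poll(), the SIOCOUTQ/SIOCINQ ioctls, tcp_diag and the /proc/net/tcp dump read these fields without holding the socket lock. The following is a minimal userspace sketch of that annotation pattern only, not kernel code: READ_ONCE/WRITE_ONCE here are simplified volatile-based stand-ins for the real helpers in <linux/compiler.h>, and struct toy_tcp_sock with its toy_* functions is purely illustrative.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(): force a
 * single access that the compiler may not merge, duplicate or refetch. */
#define READ_ONCE(x)       (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))

struct toy_tcp_sock {
    uint32_t write_seq; /* next sequence to queue; written under the socket lock */
    uint32_t snd_una;   /* first unacknowledged sequence */
};

/* Writer side: runs with the lock held, but lockless readers may race with
 * it, so the store is annotated. */
static void toy_queue_data(struct toy_tcp_sock *tp, uint32_t len)
{
    WRITE_ONCE(tp->write_seq, tp->write_seq + len);
}

/* Lockless reader in the spirit of SIOCOUTQ / tcp_diag: the annotated load
 * pairs with the annotated store above. */
static uint32_t toy_outq(struct toy_tcp_sock *tp)
{
    return READ_ONCE(tp->write_seq) - tp->snd_una;
}

int main(void)
{
    struct toy_tcp_sock tp = { .write_seq = 1000, .snd_una = 1000 };

    toy_queue_data(&tp, 100);
    printf("queued but unacked bytes: %u\n", (unsigned int)toy_outq(&tp));
    return 0;
}

The same reader/writer pairing accounts for the otherwise mechanical-looking changes above to sk->sk_sndbuf, sk->sk_rcvlowat, tp->urg_seq and the ack-backlog counters.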