From 4688eb7cf3ae2c2721d1dacff5c1384cba47d176 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Dec 2017 18:22:52 -0800
Subject: tcp: refresh tcp_mstamp from timers callbacks

Only the retransmit timer currently refreshes tcp_mstamp

We should do the same for delayed acks and keepalives.

Even if RFC 7323 does not request it, this is consistent to what linux
did in the past, when TS values were based on jiffies.

Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Mike Maloney <maloney@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by:  Mike Maloney <maloney@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 16df6dd44b98..968fda198376 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk)
 			icsk->icsk_ack.pingpong = 0;
 			icsk->icsk_ack.ato      = TCP_ATO_MIN;
 		}
+		tcp_mstamp_refresh(tcp_sk(sk));
 		tcp_send_ack(sk);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
 	}
@@ -632,6 +633,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
 		goto out;
 	}
 
+	tcp_mstamp_refresh(tp);
 	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 		if (tp->linger2 >= 0) {
 			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
-- 
cgit v1.2.3


From 4ee806d51176ba7b8ff1efd81f271d7252e03a1d Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Thu, 18 Jan 2018 16:14:26 -0500
Subject: net: tcp: close sock if net namespace is exiting

When a tcp socket is closed, if it detects that its net namespace is
exiting, close immediately and do not wait for FIN sequence.

For normal sockets, a reference is taken to their net namespace, so it will
never exit while the socket is open.  However, kernel sockets do not take a
reference to their net namespace, so it may begin exiting while the kernel
socket is still open.  In this case if the kernel socket is a tcp socket,
it will stay open trying to complete its close sequence.  The sock's dst(s)
hold a reference to their interface, which are all transferred to the
namespace's loopback interface when the real interfaces are taken down.
When the namespace tries to take down its loopback interface, it hangs
waiting for all references to the loopback interface to release, which
results in messages like:

unregister_netdevice: waiting for lo to become free. Usage count = 1

These messages continue until the socket finally times out and closes.
Since the net namespace cleanup holds the net_mutex while calling its
registered pernet callbacks, any new net namespace initialization is
blocked until the current net namespace finishes exiting.

After this change, the tcp socket notices the exiting net namespace, and
closes immediately, releasing its dst(s) and their reference to the
loopback interface, which lets the net namespace continue exiting.

Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
Signed-off-by: Dan Streetman <ddstreet@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h | 10 ++++++++++
 net/ipv4/tcp.c              |  3 +++
 net/ipv4/tcp_timer.c        | 15 +++++++++++++++
 3 files changed, 28 insertions(+)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 10f99dafd5ac..049008493faf 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -223,6 +223,11 @@ int net_eq(const struct net *net1, const struct net *net2)
 	return net1 == net2;
 }
 
+static inline int check_net(const struct net *net)
+{
+	return atomic_read(&net->count) != 0;
+}
+
 void net_drop_ns(void *);
 
 #else
@@ -247,6 +252,11 @@ int net_eq(const struct net *net1, const struct net *net2)
 	return 1;
 }
 
+static inline int check_net(const struct net *net)
+{
+	return 1;
+}
+
 #define net_drop_ns NULL
 #endif
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f08eebe60446..8e053ad7cae2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2298,6 +2298,9 @@ adjudge_to_death:
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			__NET_INC_STATS(sock_net(sk),
 					LINUX_MIB_TCPABORTONMEMORY);
+		} else if (!check_net(sock_net(sk))) {
+			/* Not possible to send reset; just close */
+			tcp_set_state(sk, TCP_CLOSE);
 		}
 	}
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 968fda198376..388158c9d9f6 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -48,11 +48,19 @@ static void tcp_write_err(struct sock *sk)
  *  to prevent DoS attacks. It is called when a retransmission timeout
  *  or zero probe timeout occurs on orphaned socket.
  *
+ *  Also close if our net namespace is exiting; in that case there is no
+ *  hope of ever communicating again since all netns interfaces are already
+ *  down (or about to be down), and we need to release our dst references,
+ *  which have been moved to the netns loopback interface, so the namespace
+ *  can finish exiting.  This condition is only possible if we are a kernel
+ *  socket, as those do not hold references to the namespace.
+ *
  *  Criteria is still not confirmed experimentally and may change.
  *  We kill the socket, if:
  *  1. If number of orphaned sockets exceeds an administratively configured
  *     limit.
  *  2. If we have strong memory pressure.
+ *  3. If our net namespace is exiting.
  */
 static int tcp_out_of_resources(struct sock *sk, bool do_reset)
 {
@@ -81,6 +89,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
 		return 1;
 	}
+
+	if (!check_net(sock_net(sk))) {
+		/* Not possible to send reset; just close */
+		tcp_done(sk);
+		return 1;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3