From 391bb6be6578829540bc466fc85da44a68148b84 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 23 Aug 2016 11:39:28 -0700
Subject: ipv6: tcp: get rid of tcp_v6_clear_sk()

Now that RCU lookups of IPv6 TCP sockets no longer dereference pinet6,
we do not need tcp_v6_clear_sk() anymore.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 33df8b8575cc..e0f46439e391 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1862,17 +1862,6 @@ void tcp6_proc_exit(struct net *net)
 }
 #endif
 
-static void tcp_v6_clear_sk(struct sock *sk, int size)
-{
-        struct inet_sock *inet = inet_sk(sk);
-
-        /* we do not want to clear pinet6 field, because of RCU lookups */
-        sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
-
-        size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
-        memset(&inet->pinet6 + 1, 0, size);
-}
-
 struct proto tcpv6_prot = {
         .name                   = "TCPv6",
         .owner                  = THIS_MODULE,
@@ -1914,7 +1903,6 @@ struct proto tcpv6_prot = {
         .compat_setsockopt      = compat_tcp_setsockopt,
         .compat_getsockopt      = compat_tcp_getsockopt,
 #endif
-        .clear_sk               = tcp_v6_clear_sk,
         .diag_destroy           = tcp_abort,
 };

From e65c332de8a0c9f570847ec490b60effffcd5320 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 24 Aug 2016 08:50:24 -0700
Subject: tcp: md5: increment sk_drops on syn_recv state

TCP MD5 mismatches increment the sk_drops counter in all states except
SYN_RECV. A mismatch there is very unlikely to happen in the real world,
but it is worth counting to help diagnostics. We increase the parent
(listener) sk_drops.

Signed-off-by: Eric Dumazet
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 1 +
 net/ipv6/tcp_ipv6.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 32b048e524d6..436d978c6c39 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1602,6 +1602,7 @@ process:
                 sk = req->rsk_listener;
                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+                        sk_drops_add(sk, skb);
                         reqsk_put(req);
                         goto discard_it;
                 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e0f46439e391..ac0ed7bda406 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1409,6 +1409,7 @@ process:
                 sk = req->rsk_listener;
                 tcp_v6_fill_cb(skb, hdr, th);
                 if (tcp_v6_inbound_md5_hash(sk, skb)) {
+                        sk_drops_add(sk, skb);
                         reqsk_put(req);
                         goto discard_it;
                 }
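For context, sk_drops_add() is a pre-existing helper (include/net/sock.h), not
something introduced by this patch. A simplified sketch of what it does, with
the caveat that the real definition may differ in detail:

        /* Simplified sketch of the existing helper used above
         * (see include/net/sock.h). One drop is charged per GSO segment
         * carried by the skb, so an aggregated skb is not under-counted.
         */
        static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
        {
                int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);

                atomic_add(segs, &sk->sk_drops);
        }

Charging the parent listener keeps the drop visible on a long-lived socket
(for example via sock_diag/ss socket memory info) rather than on the
short-lived request socket that is about to be released by reqsk_put().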
From 72145a68e4ee116533df49af4b87aca0aacc179c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 24 Aug 2016 09:01:23 -0700
Subject: tcp: md5: add LINUX_MIB_TCPMD5FAILURE counter

Adds an SNMP counter for drops caused by MD5 mismatches. The current
syslog message might help, but a counter is more precise and helps
monitoring.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/uapi/linux/snmp.h | 1 +
 net/ipv4/proc.c           | 1 +
 net/ipv4/tcp_ipv4.c       | 1 +
 net/ipv6/tcp_ipv6.c       | 1 +
 4 files changed, 4 insertions(+)

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 25a9ad8bcef1..e7a31f830690 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -235,6 +235,7 @@ enum
         LINUX_MIB_TCPSPURIOUSRTOS,              /* TCPSpuriousRTOs */
         LINUX_MIB_TCPMD5NOTFOUND,               /* TCPMD5NotFound */
         LINUX_MIB_TCPMD5UNEXPECTED,             /* TCPMD5Unexpected */
+        LINUX_MIB_TCPMD5FAILURE,                /* TCPMD5Failure */
         LINUX_MIB_SACKSHIFTED,
         LINUX_MIB_SACKMERGED,
         LINUX_MIB_SACKSHIFTFALLBACK,
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 9f665b63a927..1ed015e4bc79 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -257,6 +257,7 @@ static const struct snmp_mib snmp4_net_list[] = {
         SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
         SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
         SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+        SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE),
         SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
         SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
         SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 436d978c6c39..ad41e8ecf796 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1169,6 +1169,7 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                       NULL, skb);
 
         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                      &iph->saddr, ntohs(th->source),
                                      &iph->daddr, ntohs(th->dest),
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ac0ed7bda406..e4f55683af31 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -671,6 +671,7 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
                                       NULL, skb);
 
         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                 net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
                                      genhash ? "failed" : "mismatch",
                                      &ip6h->saddr, ntohs(th->source),
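Once this counter is wired up it appears alongside the other TcpExt counters
in /proc/net/netstat (nstat reports it as TcpExtTCPMD5Failure). A small,
hypothetical user-space helper to read it, shown only as a best-effort sketch
and not part of the patch:

        #include <stdio.h>
        #include <string.h>

        /* Hypothetical helper: find a counter in the "TcpExt:" name/value
         * line pair of /proc/net/netstat; returns its value or -1.
         */
        static long read_tcpext_counter(const char *name)
        {
                FILE *f = fopen("/proc/net/netstat", "r");
                char hdr[4096], val[4096];
                long result = -1;

                if (!f)
                        return -1;
                /* Each section is exported as two lines: names, then values. */
                while (fgets(hdr, sizeof(hdr), f) && fgets(val, sizeof(val), f)) {
                        char *hs, *vs;
                        char *h = strtok_r(hdr, " \n", &hs);
                        char *v = strtok_r(val, " \n", &vs);

                        if (!h || strcmp(h, "TcpExt:"))
                                continue;
                        while ((h = strtok_r(NULL, " \n", &hs)) &&
                               (v = strtok_r(NULL, " \n", &vs))) {
                                if (!strcmp(h, name)) {
                                        sscanf(v, "%ld", &result);
                                        break;
                                }
                        }
                        break;
                }
                fclose(f);
                return result;
        }

        int main(void)
        {
                printf("TCPMD5Failure = %ld\n",
                       read_tcpext_counter("TCPMD5Failure"));
                return 0;
        }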
From c9c3321257e1b95be9b375f811fb250162af8d39 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 27 Aug 2016 07:37:54 -0700
Subject: tcp: add tcp_add_backlog()

When TCP operates in lossy environments (between 1 and 10% packet loss),
many SACK blocks can be exchanged, and I noticed we could drop them on
busy senders if these SACK blocks have to be queued into the socket
backlog.

While the main cause is the poor performance of RACK/SACK processing, we
can try to avoid these drops of valuable information, which can lead to
spurious timeouts and retransmits.

The drops are caused by skb->truesize overestimation, due to:

- drivers allocating ~2048 (or more) bytes as a fragment to hold an
  Ethernet frame.

- various pskb_may_pull() calls bringing the headers into skb->head
  might have pulled all the frame content, but skb->truesize could not
  be lowered, as the stack has no idea of each fragment's truesize.

The backlog drops are also more visible on bidirectional flows, since
their sk_rmem_alloc can be quite big.

Let's add some room for the backlog, as only the socket owner can
selectively take action to lower memory needs, like collapsing receive
queues or partial ofo pruning.

Signed-off-by: Eric Dumazet
Cc: Yuchung Cheng
Cc: Neal Cardwell
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 include/net/tcp.h   |  1 +
 net/ipv4/tcp_ipv4.c | 33 +++++++++++++++++++++++++++++----
 net/ipv6/tcp_ipv6.c |  5 +----
 3 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a5af6be3a572..dd99679e2e51 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1161,6 +1161,7 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp)
 }
 
 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb);
+bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
 
 #undef STATE_TRACE
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ad41e8ecf796..53e80cd004b6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1532,6 +1532,34 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(tcp_prequeue);
 
+bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
+{
+        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
+
+        /* Only socket owner can try to collapse/prune rx queues
+         * to reduce memory overhead, so add a little headroom here.
+         * Few sockets backlog are possibly concurrently non empty.
+         */
+        limit += 64*1024;
+
+        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
+         * we can fix skb->truesize to its real value to avoid future drops.
+         * This is valid because skb is not yet charged to the socket.
+         * It has been noticed pure SACK packets were sometimes dropped
+         * (if cooked by drivers without copybreak feature).
+         */
+        if (!skb->data_len)
+                skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+
+        if (unlikely(sk_add_backlog(sk, skb, limit))) {
+                bh_unlock_sock(sk);
+                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+                return true;
+        }
+        return false;
+}
+EXPORT_SYMBOL(tcp_add_backlog);
+
 /*
  *      From tcp_input.c
  */
@@ -1662,10 +1690,7 @@ process:
         if (!sock_owned_by_user(sk)) {
                 if (!tcp_prequeue(sk, skb))
                         ret = tcp_v4_do_rcv(sk, skb);
-        } else if (unlikely(sk_add_backlog(sk, skb,
-                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
-                bh_unlock_sock(sk);
-                __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
+        } else if (tcp_add_backlog(sk, skb)) {
                 goto discard_and_relse;
         }
         bh_unlock_sock(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e4f55683af31..5bf460bd299f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1467,10 +1467,7 @@ process:
         if (!sock_owned_by_user(sk)) {
                 if (!tcp_prequeue(sk, skb))
                         ret = tcp_v6_do_rcv(sk, skb);
-        } else if (unlikely(sk_add_backlog(sk, skb,
-                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
-                bh_unlock_sock(sk);
-                __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
+        } else if (tcp_add_backlog(sk, skb)) {
                 goto discard_and_relse;
         }
         bh_unlock_sock(sk);
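The SKB_TRUESIZE() line above is the key to why this helps. For reference,
the macro (from include/linux/skbuff.h, shown here slightly simplified)
charges the linear buffer plus the cache-aligned metadata:

        /* Reference sketch: the "true" memory cost of an skb is its buffer
         * plus the aligned sizes of struct sk_buff and struct skb_shared_info.
         */
        #define SKB_TRUESIZE(X) ((X) + \
                                 SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \
                                 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

So a roughly 66-byte pure ACK or SACK segment that arrived in a driver-attached
2048-byte fragment can carry a truesize well above 2 KB. Once __pskb_pull_tail()
has copied everything into skb->head (skb->data_len == 0), recomputing truesize
from skb_end_offset() lets many more such packets fit under the
sk_rcvbuf + sk_sndbuf + 64 KB backlog limit instead of being dropped.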
From e0d56fdd734224666e7bd5fafbc620286d2a7ee8 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Sat, 10 Sep 2016 12:09:57 -0700
Subject: net: l3mdev: remove redundant calls

A previous patch added the l3mdev flow update, making these hooks
redundant. Remove them.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 net/ipv4/ip_output.c    |  3 +--
 net/ipv4/route.c        | 12 ++----------
 net/ipv4/xfrm4_policy.c |  2 +-
 net/ipv6/ip6_output.c   |  2 --
 net/ipv6/ndisc.c        | 11 ++---------
 net/ipv6/route.c        |  7 +------
 net/ipv6/tcp_ipv6.c     |  8 ++------
 net/ipv6/xfrm6_policy.c |  2 +-
 8 files changed, 10 insertions(+), 37 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 41e10e34769c..05d105832bdb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1582,8 +1582,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
         }
 
         oif = arg->bound_dev_if;
-        if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
-                oif = skb->skb_iif;
+        oif = oif ? : skb->skb_iif;
 
         flowi4_init_output(&fl4, oif,
                            IP4_REPLY_MARK(net, skb->mark),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ad83f85fb240..b52496fd5107 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1831,7 +1831,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
          *      Now we are ready to route packet.
          */
         fl4.flowi4_oif = 0;
-        fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
+        fl4.flowi4_iif = dev->ifindex;
         fl4.flowi4_mark = skb->mark;
         fl4.flowi4_tos = tos;
         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
@@ -2150,7 +2150,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
         unsigned int flags = 0;
         struct fib_result res;
         struct rtable *rth;
-        int master_idx;
         int orig_oif;
         int err = -ENETUNREACH;
@@ -2160,9 +2159,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 
         orig_oif = fl4->flowi4_oif;
 
-        master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
-        if (master_idx)
-                fl4->flowi4_oif = master_idx;
         fl4->flowi4_iif = LOOPBACK_IFINDEX;
         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
@@ -2263,8 +2259,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
         if (err) {
                 res.fi = NULL;
                 res.table = NULL;
-                if (fl4->flowi4_oif &&
-                    !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
+                if (fl4->flowi4_oif) {
                         /* Apparently, routing tables are wrong. Assume,
                            that the destination is on link.
@@ -2577,9 +2572,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
         fl4.flowi4_mark = mark;
 
-        if (netif_index_is_l3_master(net, fl4.flowi4_oif))
-                fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
-
         if (iif) {
                 struct net_device *dev;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b644a23c3db0..3155ed73d3b3 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -112,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
         int oif = 0;
 
         if (skb_dst(skb))
-                oif = l3mdev_fib_oif(skb_dst(skb)->dev);
+                oif = skb_dst(skb)->dev->ifindex;
 
         memset(fl4, 0, sizeof(struct flowi4));
         fl4->flowi4_mark = skb->mark;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6ea6caace3a8..1cb41b365048 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1070,8 +1070,6 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                 return ERR_PTR(err);
         if (final_dst)
                 fl6->daddr = *final_dst;
-        if (!fl6->flowi6_oif)
-                fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
 
         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 }
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index fe65cdc28a45..d8e671457d10 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -67,7 +67,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
@@ -457,11 +456,9 @@ static void ndisc_send_skb(struct sk_buff *skb,
 
         if (!dst) {
                 struct flowi6 fl6;
-                int oif = l3mdev_fib_oif(skb->dev);
+                int oif = skb->dev->ifindex;
 
                 icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
-                if (oif != skb->dev->ifindex)
-                        fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
                 dst = icmp6_dst_alloc(skb->dev, &fl6);
                 if (IS_ERR(dst)) {
                         kfree_skb(skb);
@@ -1538,7 +1535,6 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
         int rd_len;
         u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
            ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
-        int oif = l3mdev_fib_oif(dev);
         bool ret;
 
         if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
@@ -1555,10 +1551,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
         }
 
         icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
-                         &saddr_buf, &ipv6_hdr(skb)->saddr, oif);
-
-        if (oif != skb->dev->ifindex)
-                fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
+                         &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
 
         dst = ip6_route_output(net, NULL, &fl6);
         if (dst->error) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 87e0a01ce744..ad4a7ff301fc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1164,7 +1164,7 @@ void ip6_route_input(struct sk_buff *skb)
         int flags = RT6_LOOKUP_F_HAS_SADDR;
         struct ip_tunnel_info *tun_info;
         struct flowi6 fl6 = {
-                .flowi6_iif = l3mdev_fib_oif(skb->dev),
+                .flowi6_iif = skb->dev->ifindex,
                 .daddr = iph->daddr,
                 .saddr = iph->saddr,
                 .flowlabel = ip6_flowinfo(iph),
@@ -3349,11 +3349,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
         } else {
                 fl6.flowi6_oif = oif;
 
-                if (netif_index_is_l3_master(net, oif)) {
-                        fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
-                                           FLOWI_FLAG_SKIP_NH_OIF;
-                }
-
                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
         }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 04529a3d42cb..54cf7197c7ab 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -818,12 +818,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
         fl6.flowi6_proto = IPPROTO_TCP;
         if (rt6_need_strict(&fl6.daddr) && !oif)
                 fl6.flowi6_oif = tcp_v6_iif(skb);
-        else {
-                if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
-                        oif = skb->skb_iif;
-
-                fl6.flowi6_oif = oif;
-        }
+        else
+                fl6.flowi6_oif = oif ? : skb->skb_iif;
 
         fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
         fl6.fl6_dport = t1->dest;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 6cc97003e4a9..b7b7e863a2bb 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -134,7 +134,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
         nexthdr = nh[nhoff];
 
         if (skb_dst(skb))
-                oif = l3mdev_fib_oif(skb_dst(skb)->dev);
+                oif = skb_dst(skb)->dev->ifindex;
 
         memset(fl6, 0, sizeof(struct flowi6));
         fl6->flowi6_mark = skb->mark;
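Several hunks above replace an explicit l3mdev check with the form
"oif = oif ? : skb->skb_iif;". This is the GNU C conditional with an omitted
middle operand: "a ? : b" means "a ? a : b", except that a is evaluated only
once. A tiny standalone illustration, using made-up interface indexes and not
kernel code (requires GCC or Clang):

        #include <stdio.h>

        int main(void)
        {
                int bound_dev_if = 0;   /* no explicit binding */
                int skb_iif = 4;        /* ingress interface index (made up) */

                /* GNU extension: "a ? : b" is "a ? a : b", a evaluated once. */
                int oif = bound_dev_if ? : skb_iif;

                printf("oif = %d\n", oif);      /* prints 4 */

                bound_dev_if = 7;               /* explicit binding wins */
                oif = bound_dev_if ? : skb_iif;
                printf("oif = %d\n", oif);      /* prints 7 */
                return 0;
        }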
From 8ce48623f0cf3d632e32448411feddccb693d351 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 12 Oct 2016 19:01:45 +0200
Subject: ipv6: tcp: restore IP6CB for pktoptions skbs

Baozeng Ding reported the following KASAN splat:

BUG: KASAN: use-after-free in ip6_datagram_recv_specific_ctl+0x13f1/0x15c0 at addr ffff880029c84ec8
Read of size 1 by task poc/25548
Call Trace:
 [] dump_stack+0x12e/0x185 /lib/dump_stack.c:15
 [< inline >] print_address_description /mm/kasan/report.c:204
 [] kasan_report_error+0x48b/0x4b0 /mm/kasan/report.c:283
 [< inline >] kasan_report /mm/kasan/report.c:303
 [] __asan_report_load1_noabort+0x3e/0x40 /mm/kasan/report.c:321
 [] ip6_datagram_recv_specific_ctl+0x13f1/0x15c0 /net/ipv6/datagram.c:687
 [] ip6_datagram_recv_ctl+0x33/0x40
 [] do_ipv6_getsockopt.isra.4+0xaec/0x2150
 [] ipv6_getsockopt+0x116/0x230
 [] tcp_getsockopt+0x82/0xd0 /net/ipv4/tcp.c:3035
 [] sock_common_getsockopt+0x95/0xd0 /net/core/sock.c:2647
 [< inline >] SYSC_getsockopt /net/socket.c:1776
 [] SyS_getsockopt+0x142/0x230 /net/socket.c:1758
 [] entry_SYSCALL_64_fastpath+0x23/0xc6
Memory state around the buggy address:
 ffff880029c84d80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ffff880029c84e00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
>ffff880029c84e80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
                                              ^
 ffff880029c84f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ffff880029c84f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff

He also provided a syzkaller reproducer.

The issue is that ip6_datagram_recv_specific_ctl() expects to find the
IP6CB data, which tcp_v6_rcv() has moved to a different place.

This patch moves tcp_v6_restore_cb() up and calls it from
tcp_v6_do_rcv() when np->pktoptions is set.

Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses")
Signed-off-by: Eric Dumazet
Reported-by: Baozeng Ding
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 54cf7197c7ab..5a27ab4eab39 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1190,6 +1190,16 @@ out:
         return NULL;
 }
 
+static void tcp_v6_restore_cb(struct sk_buff *skb)
+{
+        /* We need to move header back to the beginning if xfrm6_policy_check()
+         * and tcp_v6_fill_cb() are going to be called again.
+         * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
+         */
+        memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
+                sizeof(struct inet6_skb_parm));
+}
+
 /* The socket must have it's spinlock held when we get
  * here, unless it is a TCP_LISTEN socket.
 *
@@ -1319,6 +1329,7 @@ ipv6_pktoptions:
                         np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
                 if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
                         skb_set_owner_r(opt_skb, sk);
+                        tcp_v6_restore_cb(opt_skb);
                         opt_skb = xchg(&np->pktoptions, opt_skb);
                 } else {
                         __kfree_skb(opt_skb);
@@ -1352,15 +1363,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
         TCP_SKB_CB(skb)->sacked = 0;
 }
 
-static void tcp_v6_restore_cb(struct sk_buff *skb)
-{
-        /* We need to move header back to the beginning if xfrm6_policy_check()
-         * and tcp_v6_fill_cb() are going to be called again.
-         */
-        memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
-                sizeof(struct inet6_skb_parm));
-}
-
 static int tcp_v6_rcv(struct sk_buff *skb)
 {
         const struct tcphdr *th;
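The root cause is the TCP_SKB_CB layout introduced by 971f10eca186: TCP keeps
the IPv6 control block in a union placed inside tcp_skb_cb, while IP6CB()
reads the start of skb->cb[]. A rough, abridged sketch of the pieces involved
(fields omitted and simplified; see include/net/tcp.h and include/linux/ipv6.h
for the real definitions):

        /* Abridged sketch, not the complete structures. */
        struct tcp_skb_cb {
                __u32   seq;            /* starting sequence number */
                __u32   end_seq;        /* SEQ + FIN + SYN + datalen */
                /* ... other TCP-private fields ... */
                union {
                        struct inet_skb_parm    h4;
        #if IS_ENABLED(CONFIG_IPV6)
                        struct inet6_skb_parm   h6;
        #endif
                } header;               /* saved by tcp_v6_fill_cb() */
        };

        /* IP6CB() looks at the front of skb->cb[]: */
        #define IP6CB(skb)      ((struct inet6_skb_parm *)((skb)->cb))

Hence the memmove() in tcp_v6_restore_cb(): before an skb is parked on
np->pktoptions and later parsed by ip6_datagram_recv_specific_ctl(), header.h6
has to be copied back to the front of cb[], where IP6CB() expects it.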