From c6cffba4ffa26a8ffacd0bb9f3144e34f20da7de Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 26 Jul 2012 11:14:38 +0000
Subject: ipv4: Fix input route performance regression.

With the routing cache removal we lost the "noref" code paths on
input, and this can kill some routing workloads.

Reinstate the noref path when we hit a cached route in the FIB
nexthops.

With help from Eric Dumazet.

Reported-by: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c           |  2 +-
 net/ipv4/fib_semantics.c |  4 ++--
 net/ipv4/ip_fragment.c   |  4 ++--
 net/ipv4/ip_input.c      |  4 ++--
 net/ipv4/route.c         | 48 ++++++++++++++++++++++--------------------------
 net/ipv4/xfrm4_input.c   |  4 ++--
 6 files changed, 31 insertions(+), 35 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a0124eb7dbea..77e87aff419a 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -827,7 +827,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e55171f184f9..da0cc2e6b250 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -172,9 +172,9 @@ static void free_fib_info_rcu(struct rcu_head *head)
 		if (nexthop_nh->nh_exceptions)
 			free_nh_exceptions(nexthop_nh);
 		if (nexthop_nh->nh_rth_output)
-			dst_release(&nexthop_nh->nh_rth_output->dst);
+			dst_free(&nexthop_nh->nh_rth_output->dst);
 		if (nexthop_nh->nh_rth_input)
-			dst_release(&nexthop_nh->nh_rth_input->dst);
+			dst_free(&nexthop_nh->nh_rth_input->dst);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7ad88e5e7110..8d07c973409c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg)
 		/* skb dst is stale, drop it, and perform route lookup again */
 		skb_dst_drop(head);
 		iph = ip_hdr(head);
-		err = ip_route_input(head, iph->daddr, iph->saddr,
-				     iph->tos, head->dev);
+		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+					   iph->tos, head->dev);
 		if (err)
 			goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 93134b0eab0c..bda8cac2ae91 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -339,8 +339,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (!skb_dst(skb)) {
-		int err = ip_route_input(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev);
+		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					       iph->tos, skb->dev);
 		if (unlikely(err)) {
 			if (err == -EXDEV)
 				NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3f7bb7185c50..fc1a81ca79a7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1199,10 +1199,9 @@ restart:
 	fnhe->fnhe_stamp = jiffies;
 }
 
-static inline void rt_release_rcu(struct rcu_head *head)
+static inline void rt_free(struct rtable *rt)
 {
-	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
-	dst_release(dst);
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 }
 
 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
@@ -1216,9 +1215,15 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 
 	prev = cmpxchg(p, orig, rt);
 	if (prev == orig) {
-		dst_clone(&rt->dst);
 		if (orig)
-			call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu);
+			rt_free(orig);
+	} else {
+		/* Routes we intend to cache in the FIB nexthop have
+		 * the DST_NOCACHE bit clear.  However, if we are
+		 * unsuccessful at storing this route into the cache
+		 * we really need to set it.
+		 */
+		rt->dst.flags |= DST_NOCACHE;
 	}
 }
 
@@ -1245,7 +1250,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
-		if (!(rt->dst.flags & DST_HOST))
+		if (!(rt->dst.flags & DST_NOCACHE))
 			rt_cache_route(nh, rt);
 	}
 
@@ -1261,7 +1266,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 				   bool nopolicy, bool noxfrm, bool will_cache)
 {
 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
-			 (will_cache ? 0 : DST_HOST) | DST_NOCACHE |
+			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
 			 (nopolicy ? DST_NOPOLICY : 0) |
 			 (noxfrm ? DST_NOXFRM : 0));
 }
@@ -1366,8 +1371,7 @@ static void ip_handle_martian_source(struct net_device *dev,
 static int __mkroute_input(struct sk_buff *skb,
 			   const struct fib_result *res,
 			   struct in_device *in_dev,
-			   __be32 daddr, __be32 saddr, u32 tos,
-			   struct rtable **result)
+			   __be32 daddr, __be32 saddr, u32 tos)
 {
 	struct rtable *rth;
 	int err;
@@ -1418,7 +1422,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		if (!itag) {
 			rth = FIB_RES_NH(*res).nh_rth_input;
 			if (rt_cache_valid(rth)) {
-				dst_hold(&rth->dst);
+				skb_dst_set_noref(skb, &rth->dst);
 				goto out;
 			}
 			do_cache = true;
@@ -1445,8 +1449,8 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.output = ip_output;
 
 	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
+	skb_dst_set(skb, &rth->dst);
 out:
-	*result = rth;
 	err = 0;
  cleanup:
 	return err;
@@ -1458,21 +1462,13 @@ static int ip_mkroute_input(struct sk_buff *skb,
 			    struct in_device *in_dev,
 			    __be32 daddr, __be32 saddr, u32 tos)
 {
-	struct rtable *rth = NULL;
-	int err;
-
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1)
 		fib_select_multipath(res);
 #endif
 
 	/* create a routing cache entry */
-	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
-	if (err)
-		return err;
-
-	skb_dst_set(skb, &rth->dst);
-	return 0;
+	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 }
 
 /*
@@ -1588,8 +1584,9 @@ local_input:
 		if (!itag) {
 			rth = FIB_RES_NH(res).nh_rth_input;
 			if (rt_cache_valid(rth)) {
-				dst_hold(&rth->dst);
-				goto set_and_out;
+				skb_dst_set_noref(skb, &rth->dst);
+				err = 0;
+				goto out;
 			}
 			do_cache = true;
 		}
@@ -1620,7 +1617,6 @@ local_input:
 	}
 	if (do_cache)
 		rt_cache_route(&FIB_RES_NH(res), rth);
-set_and_out:
 	skb_dst_set(skb, &rth->dst);
 	err = 0;
 	goto out;
@@ -1658,8 +1654,8 @@ martian_source_keep_err:
 	goto out;
 }
 
-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-		   u8 tos, struct net_device *dev)
+int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			 u8 tos, struct net_device *dev)
 {
 	int res;
 
@@ -1702,7 +1698,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rcu_read_unlock();
 	return res;
 }
-EXPORT_SYMBOL(ip_route_input);
+EXPORT_SYMBOL(ip_route_input_noref);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 58d23a572509..06814b6216dc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 	if (skb_dst(skb) == NULL) {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (ip_route_input(skb, iph->daddr, iph->saddr,
-				   iph->tos, skb->dev))
+		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					 iph->tos, skb->dev))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
cgit v1.2.1


From c7109986db3c945f50ceed884a30e0fd8af3b89b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Jul 2012 12:18:11 +0000
Subject: ipv6: Early TCP socket demux

This is the IPv6 missing bits for infrastructure added in commit
41063e9dd1195 (ipv4: Early TCP socket demux.)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_input.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index bda8cac2ae91..981ff1eef28c 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -314,6 +314,7 @@ drop:
 }
 
 int sysctl_ip_early_demux __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_ip_early_demux);
 
 static int ip_rcv_finish(struct sk_buff *skb)
 {
-- 
cgit v1.2.1


From 42493570100b91ef663c4c6f0c0fdab238f9d3c2 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 26 Jul 2012 22:52:21 +0000
Subject: tcp: Add TCP_USER_TIMEOUT negative value check

TCP_USER_TIMEOUT is a TCP level socket option that takes an unsigned int. But
patch "tcp: Add TCP_USER_TIMEOUT socket option"(dca43c75) didn't check the negative
values. If a user assign -1 to it, the socket will set successfully and wait
for 4294967295 miliseconds. This patch add a negative value check to avoid
this issue.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 581ecf02c6b5..e7e6eeae49c0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2681,7 +2681,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		/* Cap the max timeout in ms TCP will retry/retrans
 		 * before giving up and aborting (ETIMEDOUT) a connection.
 		 */
-		icsk->icsk_user_timeout = msecs_to_jiffies(val);
+		if (val < 0)
+			err = -EINVAL;
+		else
+			icsk->icsk_user_timeout = msecs_to_jiffies(val);
 		break;
 	default:
 		err = -ENOPROTOOPT;
-- 
cgit v1.2.1


From 505fbcf035c245a1a42cd80184feecf61ee868dc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 27 Jul 2012 06:23:40 +0000
Subject: ipv4: fix TCP early demux

commit 92101b3b2e317 (ipv4: Prepare for change of rt->rt_iif encoding.)
invalidated TCP early demux, because rx_dst_ifindex is not properly
initialized and checked.

Also remove the use of inet_iif(skb) in favor or skb->skb_iif

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c     |  1 +
 net/ipv4/tcp_ipv4.c      | 14 ++++++--------
 net/ipv4/tcp_minisocks.c |  1 +
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e07a64ca44e..aa659e825054 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5603,6 +5603,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
 	if (skb != NULL) {
 		sk->sk_rx_dst = dst_clone(skb_dst(skb));
+		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 		security_inet_conn_established(sk, skb);
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b6b07c93924c..2fbd9921253f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1620,17 +1620,15 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		sock_rps_save_rxhash(sk, skb);
 		if (sk->sk_rx_dst) {
 			struct dst_entry *dst = sk->sk_rx_dst;
-			if (dst->ops->check(dst, 0) == NULL) {
+			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+			    dst->ops->check(dst, 0) == NULL) {
 				dst_release(dst);
 				sk->sk_rx_dst = NULL;
 			}
 		}
 		if (unlikely(sk->sk_rx_dst == NULL)) {
-			struct inet_sock *icsk = inet_sk(sk);
-			struct rtable *rt = skb_rtable(skb);
-
-			sk->sk_rx_dst = dst_clone(&rt->dst);
-			icsk->rx_dst_ifindex = inet_iif(skb);
+			sk->sk_rx_dst = dst_clone(skb_dst(skb));
+			inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 		}
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
 			rsk = sk;
@@ -1709,11 +1707,11 @@ void tcp_v4_early_demux(struct sk_buff *skb)
 		skb->destructor = sock_edemux;
 		if (sk->sk_state != TCP_TIME_WAIT) {
 			struct dst_entry *dst = sk->sk_rx_dst;
-			struct inet_sock *icsk = inet_sk(sk);
+
 			if (dst)
 				dst = dst_check(dst, 0);
 			if (dst &&
-			    icsk->rx_dst_ifindex == skb->skb_iif)
+			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
 				skb_dst_set_noref(skb, dst);
 		}
 	}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 5912ac3fd240..3f1cc2028edd 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -388,6 +388,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
 		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+		inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif;
 
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
-- 
cgit v1.2.1


From 59ea33a68a9083ac98515e4861c00e71efdc49a1 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 27 Jul 2012 10:38:50 +0000
Subject: tcp: perform DMA to userspace only if there is a task waiting for it

Back in 2006, commit 1a2449a87b ("[I/OAT]: TCP recv offload to I/OAT")
added support for receive offloading to IOAT dma engine if available.

The code in tcp_rcv_established() tries to perform early DMA copy if
applicable. It however does so without checking whether the userspace
task is actually expecting the data in the buffer.

This is not a problem under normal circumstances, but there is a corner
case where this doesn't work -- and that's when MSG_TRUNC flag to
recvmsg() is used.

If the IOAT dma engine is not used, the code properly checks whether
there is a valid ucopy.task and the socket is owned by userspace, but
misses the check in the dmaengine case.

This problem can be observed in real trivially -- for example 'tbench' is a
good reproducer, as it makes a heavy use of MSG_TRUNC. On systems utilizing
IOAT, you will soon find tbench waiting indefinitely in sk_wait_data(), as they
have been already early-copied in tcp_rcv_established() using dma engine.

This patch introduces the same check we are performing in the simple
iovec copy case to the IOAT case as well. It fixes the indefinite
recvmsg(MSG_TRUNC) hangs.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aa659e825054..a356e1fecf9a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5475,7 +5475,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			if (tp->copied_seq == tp->rcv_nxt &&
 			    len - tcp_header_len <= tp->ucopy.len) {
 #ifdef CONFIG_NET_DMA
-				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
+				if (tp->ucopy.task == current &&
+				    sock_owned_by_user(sk) &&
+				    tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
 					copied_early = 1;
 					eaten = 1;
 				}
-- 
cgit v1.2.1


From 4ea4bf7ebcbacee2f4736d261efb0693e87a34c9 Mon Sep 17 00:00:00 2001
From: Lin Ming <mlin@ss.pku.edu.cn>
Date: Sun, 29 Jul 2012 01:19:55 +0000
Subject: ipv4: fix debug info in tnode_new

It should print size of struct rt_trie_node * allocated instead of size
of struct rt_trie_node.

Signed-off-by: Lin Ming <mlin@ss.pku.edu.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 18cbc15b20d5..2a6fdc2708c6 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -473,7 +473,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 	}
 
 	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
-		 sizeof(struct rt_trie_node) << bits);
+		 sizeof(struct rt_trie_node *) << bits);
 	return tn;
 }
 
-- 
cgit v1.2.1


From 61648d91fc278fd1d500da8061d17e6920cd3500 Mon Sep 17 00:00:00 2001
From: Lin Ming <mlin@ss.pku.edu.cn>
Date: Sun, 29 Jul 2012 02:00:03 +0000
Subject: ipv4: clean up put_child

The first parameter struct trie *t is not used anymore.
Remove it.

Signed-off-by: Lin Ming <mlin@ss.pku.edu.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 51 +++++++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2a6fdc2708c6..f0cdb30921c0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -159,7 +159,6 @@ struct trie {
 #endif
 };
 
-static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 				  int wasfull);
 static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
@@ -490,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *
 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 }
 
-static inline void put_child(struct trie *t, struct tnode *tn, int i,
+static inline void put_child(struct tnode *tn, int i,
 			     struct rt_trie_node *n)
 {
 	tnode_put_child_reorg(tn, i, n, -1);
@@ -754,8 +753,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 				goto nomem;
 			}
 
-			put_child(t, tn, 2*i, (struct rt_trie_node *) left);
-			put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
+			put_child(tn, 2*i, (struct rt_trie_node *) left);
+			put_child(tn, 2*i+1, (struct rt_trie_node *) right);
 		}
 	}
 
@@ -776,9 +775,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 			if (tkey_extract_bits(node->key,
 					      oldtnode->pos + oldtnode->bits,
 					      1) == 0)
-				put_child(t, tn, 2*i, node);
+				put_child(tn, 2*i, node);
 			else
-				put_child(t, tn, 2*i+1, node);
+				put_child(tn, 2*i+1, node);
 			continue;
 		}
 
@@ -786,8 +785,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 		inode = (struct tnode *) node;
 
 		if (inode->bits == 1) {
-			put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
-			put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
+			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
+			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
 
 			tnode_free_safe(inode);
 			continue;
@@ -817,22 +816,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 		 */
 
 		left = (struct tnode *) tnode_get_child(tn, 2*i);
-		put_child(t, tn, 2*i, NULL);
+		put_child(tn, 2*i, NULL);
 
 		BUG_ON(!left);
 
 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
-		put_child(t, tn, 2*i+1, NULL);
+		put_child(tn, 2*i+1, NULL);
 
 		BUG_ON(!right);
 
 		size = tnode_child_length(left);
 		for (j = 0; j < size; j++) {
-			put_child(t, left, j, rtnl_dereference(inode->child[j]));
-			put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
+			put_child(left, j, rtnl_dereference(inode->child[j]));
+			put_child(right, j, rtnl_dereference(inode->child[j + size]));
 		}
-		put_child(t, tn, 2*i, resize(t, left));
-		put_child(t, tn, 2*i+1, resize(t, right));
+		put_child(tn, 2*i, resize(t, left));
+		put_child(tn, 2*i+1, resize(t, right));
 
 		tnode_free_safe(inode);
 	}
@@ -877,7 +876,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 			if (!newn)
 				goto nomem;
 
-			put_child(t, tn, i/2, (struct rt_trie_node *)newn);
+			put_child(tn, i/2, (struct rt_trie_node *)newn);
 		}
 
 	}
@@ -892,21 +891,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 		if (left == NULL) {
 			if (right == NULL)    /* Both are empty */
 				continue;
-			put_child(t, tn, i/2, right);
+			put_child(tn, i/2, right);
 			continue;
 		}
 
 		if (right == NULL) {
-			put_child(t, tn, i/2, left);
+			put_child(tn, i/2, left);
 			continue;
 		}
 
 		/* Two nonempty children */
 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
-		put_child(t, tn, i/2, NULL);
-		put_child(t, newBinNode, 0, left);
-		put_child(t, newBinNode, 1, right);
-		put_child(t, tn, i/2, resize(t, newBinNode));
+		put_child(tn, i/2, NULL);
+		put_child(newBinNode, 0, left);
+		put_child(newBinNode, 1, right);
+		put_child(tn, i/2, resize(t, newBinNode));
 	}
 	tnode_free_safe(oldtnode);
 	return tn;
@@ -1125,7 +1124,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		node_set_parent((struct rt_trie_node *)l, tp);
 
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-		put_child(t, tp, cindex, (struct rt_trie_node *)l);
+		put_child(tp, cindex, (struct rt_trie_node *)l);
 	} else {
 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
 		/*
@@ -1155,12 +1154,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		node_set_parent((struct rt_trie_node *)tn, tp);
 
 		missbit = tkey_extract_bits(key, newpos, 1);
-		put_child(t, tn, missbit, (struct rt_trie_node *)l);
-		put_child(t, tn, 1-missbit, n);
+		put_child(tn, missbit, (struct rt_trie_node *)l);
+		put_child(tn, 1-missbit, n);
 
 		if (tp) {
 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-			put_child(t, tp, cindex, (struct rt_trie_node *)tn);
+			put_child(tp, cindex, (struct rt_trie_node *)tn);
 		} else {
 			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
 			tp = tn;
@@ -1619,7 +1618,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
 
 	if (tp) {
 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
-		put_child(t, tp, cindex, NULL);
+		put_child(tp, cindex, NULL);
 		trie_rebalance(t, tp);
 	} else
 		RCU_INIT_POINTER(t->trie, NULL);
-- 
cgit v1.2.1


From cca32e4bf999a34ac08d959f351f2b30bcd02460 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 29 Jul 2012 21:06:13 +0000
Subject: net: TCP early demux cleanup

early_demux() handlers should be called in RCU context, and as we
use skb_dst_set_noref(skb, dst), caller must not exit from RCU context
before dst use (skb_dst(skb)) or release (skb_drop(dst))

Therefore, rcu_read_lock()/rcu_read_unlock() pairs around
->early_demux() are confusing and not needed :

Protocol handlers are already in an RCU read lock section.
(__netif_receive_skb() does the rcu_read_lock() )

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_input.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 981ff1eef28c..f1395a6fb35f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -325,14 +325,12 @@ static int ip_rcv_finish(struct sk_buff *skb)
 		const struct net_protocol *ipprot;
 		int protocol = iph->protocol;
 
-		rcu_read_lock();
 		ipprot = rcu_dereference(inet_protos[protocol]);
 		if (ipprot && ipprot->early_demux) {
 			ipprot->early_demux(skb);
 			/* must reload iph, skb->head might have changed */
 			iph = ip_hdr(skb);
 		}
-		rcu_read_unlock();
 	}
 
 	/*
-- 
cgit v1.2.1


From 404e0a8b6a55d5e1cd138c6deb1bca9abdf75d8c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 29 Jul 2012 23:20:37 +0000
Subject: net: ipv4: fix RCU races on dst refcounts

commit c6cffba4ffa2 (ipv4: Fix input route performance regression.)
added various fatal races with dst refcounts.

crashes happen on tcp workloads if routes are added/deleted at the same
time.

The dst_free() calls from free_fib_info_rcu() are clearly racy.

We need instead regular dst refcounting (dst_release()) and make
sure dst_release() is aware of RCU grace periods :

Add DST_RCU_FREE flag so that dst_release() respects an RCU grace period
before dst destruction for cached dst

Introduce a new inet_sk_rx_dst_set() helper, using atomic_inc_not_zero()
to make sure we dont increase a zero refcount (On a dst currently
waiting an rcu grace period before destruction)

rt_cache_route() must take a reference on the new cached route, and
release it if was not able to install it.

With this patch, my machines survive various benchmarks.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c |  4 ++--
 net/ipv4/route.c         | 16 ++++------------
 net/ipv4/tcp_input.c     |  3 +--
 net/ipv4/tcp_ipv4.c      | 12 ++++++------
 net/ipv4/tcp_minisocks.c |  3 +--
 5 files changed, 14 insertions(+), 24 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da0cc2e6b250..e55171f184f9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -172,9 +172,9 @@ static void free_fib_info_rcu(struct rcu_head *head)
 		if (nexthop_nh->nh_exceptions)
 			free_nh_exceptions(nexthop_nh);
 		if (nexthop_nh->nh_rth_output)
-			dst_free(&nexthop_nh->nh_rth_output->dst);
+			dst_release(&nexthop_nh->nh_rth_output->dst);
 		if (nexthop_nh->nh_rth_input)
-			dst_free(&nexthop_nh->nh_rth_input->dst);
+			dst_release(&nexthop_nh->nh_rth_input->dst);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fc1a81ca79a7..d6eabcfe8a90 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1199,11 +1199,6 @@ restart:
 	fnhe->fnhe_stamp = jiffies;
 }
 
-static inline void rt_free(struct rtable *rt)
-{
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 {
 	struct rtable *orig, *prev, **p = &nh->nh_rth_output;
@@ -1213,17 +1208,14 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 
 	orig = *p;
 
+	rt->dst.flags |= DST_RCU_FREE;
+	dst_hold(&rt->dst);
 	prev = cmpxchg(p, orig, rt);
 	if (prev == orig) {
 		if (orig)
-			rt_free(orig);
+			dst_release(&orig->dst);
 	} else {
-		/* Routes we intend to cache in the FIB nexthop have
-		 * the DST_NOCACHE bit clear.  However, if we are
-		 * unsuccessful at storing this route into the cache
-		 * we really need to set it.
-		 */
-		rt->dst.flags |= DST_NOCACHE;
+		dst_release(&rt->dst);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a356e1fecf9a..9be30b039ae3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5604,8 +5604,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 	tcp_set_state(sk, TCP_ESTABLISHED);
 
 	if (skb != NULL) {
-		sk->sk_rx_dst = dst_clone(skb_dst(skb));
-		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+		inet_sk_rx_dst_set(sk, skb);
 		security_inet_conn_established(sk, skb);
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2fbd9921253f..7f91e5ac8277 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1617,19 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 #endif
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+		struct dst_entry *dst = sk->sk_rx_dst;
+
 		sock_rps_save_rxhash(sk, skb);
-		if (sk->sk_rx_dst) {
-			struct dst_entry *dst = sk->sk_rx_dst;
+		if (dst) {
 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 			    dst->ops->check(dst, 0) == NULL) {
 				dst_release(dst);
 				sk->sk_rx_dst = NULL;
 			}
 		}
-		if (unlikely(sk->sk_rx_dst == NULL)) {
-			sk->sk_rx_dst = dst_clone(skb_dst(skb));
-			inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
-		}
+		if (unlikely(sk->sk_rx_dst == NULL))
+			inet_sk_rx_dst_set(sk, skb);
+
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
 			rsk = sk;
 			goto reset;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3f1cc2028edd..232a90c3ec86 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -387,8 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
-		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
-		inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif;
+		inet_sk_rx_dst_set(newsk, skb);
 
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
-- 
cgit v1.2.1


From 0c7462a2351b4cc502f326aad7fedd04909928be Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jul 2012 07:14:29 +0000
Subject: ipv4: remove rt_cache_rebuild_count

After IP route cache removal, rt_cache_rebuild_count is no longer
used.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5840c3255721..4b6487a68279 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -783,13 +783,6 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "rt_cache_rebuild_count",
-		.data		= &init_net.ipv4.sysctl_rt_cache_rebuild_count,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "ping_group_range",
 		.data		= &init_net.ipv4.sysctl_ping_group_range,
@@ -829,8 +822,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 		table[5].data =
 			&net->ipv4.sysctl_icmp_ratemask;
 		table[6].data =
-			&net->ipv4.sysctl_rt_cache_rebuild_count;
-		table[7].data =
 			&net->ipv4.sysctl_ping_group_range;
 
 	}
@@ -842,8 +833,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	net->ipv4.sysctl_ping_group_range[0] = 1;
 	net->ipv4.sysctl_ping_group_range[1] = 0;
 
-	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
-
 	tcp_init_mem(net);
 
 	net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
-- 
cgit v1.2.1


From 54764bb647b2e847c512acf8d443df965da35000 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 31 Jul 2012 01:08:23 +0000
Subject: ipv4: Restore old dst_free() behavior.

commit 404e0a8b6a55 (net: ipv4: fix RCU races on dst refcounts) tried
to solve a race but added a problem at device/fib dismantle time :

We really want to call dst_free() as soon as possible, even if sockets
still have dst in their cache.
dst_release() calls in free_fib_info_rcu() are not welcomed.

Root of the problem was that now we also cache output routes (in
nh_rth_output), we must use call_rcu() instead of call_rcu_bh() in
rt_free(), because output route lookups are done in process context.

Based on feedback and initial patch from David Miller (adding another
call_rcu_bh() call in fib, but it appears it was not the right fix)

I left the inet_sk_rx_dst_set() helper and added __rcu attributes
to nh_rth_output and nh_rth_input to better document what is going on in
this code.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 21 +++++++++++++++++----
 net/ipv4/route.c         | 26 +++++++++++++++++---------
 2 files changed, 34 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e55171f184f9..625cf185c489 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -161,6 +161,21 @@ static void free_nh_exceptions(struct fib_nh *nh)
 	kfree(hash);
 }
 
+static void rt_nexthop_free(struct rtable __rcu **rtp)
+{
+	struct rtable *rt = rcu_dereference_protected(*rtp, 1);
+
+	if (!rt)
+		return;
+
+	/* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
+	 * because we waited an RCU grace period before calling
+	 * free_fib_info_rcu()
+	 */
+
+	dst_free(&rt->dst);
+}
+
 /* Release a nexthop info record */
 static void free_fib_info_rcu(struct rcu_head *head)
 {
@@ -171,10 +186,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 			dev_put(nexthop_nh->nh_dev);
 		if (nexthop_nh->nh_exceptions)
 			free_nh_exceptions(nexthop_nh);
-		if (nexthop_nh->nh_rth_output)
-			dst_release(&nexthop_nh->nh_rth_output->dst);
-		if (nexthop_nh->nh_rth_input)
-			dst_release(&nexthop_nh->nh_rth_input->dst);
+		rt_nexthop_free(&nexthop_nh->nh_rth_output);
+		rt_nexthop_free(&nexthop_nh->nh_rth_input);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d6eabcfe8a90..2bd107477469 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1199,23 +1199,31 @@ restart:
 	fnhe->fnhe_stamp = jiffies;
 }
 
+static inline void rt_free(struct rtable *rt)
+{
+	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
+}
+
 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 {
-	struct rtable *orig, *prev, **p = &nh->nh_rth_output;
+	struct rtable *orig, *prev, **p = (struct rtable **)&nh->nh_rth_output;
 
 	if (rt_is_input_route(rt))
-		p = &nh->nh_rth_input;
+		p = (struct rtable **)&nh->nh_rth_input;
 
 	orig = *p;
 
-	rt->dst.flags |= DST_RCU_FREE;
-	dst_hold(&rt->dst);
 	prev = cmpxchg(p, orig, rt);
 	if (prev == orig) {
 		if (orig)
-			dst_release(&orig->dst);
+			rt_free(orig);
 	} else {
-		dst_release(&rt->dst);
+		/* Routes we intend to cache in the FIB nexthop have
+		 * the DST_NOCACHE bit clear.  However, if we are
+		 * unsuccessful at storing this route into the cache
+		 * we really need to set it.
+		 */
+		rt->dst.flags |= DST_NOCACHE;
 	}
 }
 
@@ -1412,7 +1420,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	do_cache = false;
 	if (res->fi) {
 		if (!itag) {
-			rth = FIB_RES_NH(*res).nh_rth_input;
+			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
 			if (rt_cache_valid(rth)) {
 				skb_dst_set_noref(skb, &rth->dst);
 				goto out;
@@ -1574,7 +1582,7 @@ local_input:
 	do_cache = false;
 	if (res.fi) {
 		if (!itag) {
-			rth = FIB_RES_NH(res).nh_rth_input;
+			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
 			if (rt_cache_valid(rth)) {
 				skb_dst_set_noref(skb, &rth->dst);
 				err = 0;
@@ -1742,7 +1750,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	if (fi) {
 		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
 		if (!fnhe) {
-			rth = FIB_RES_NH(*res).nh_rth_output;
+			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_output);
 			if (rt_cache_valid(rth)) {
 				dst_hold(&rth->dst);
 				return rth;
-- 
cgit v1.2.1


From d26b3a7c4b3b26319f18bb645de93eba8f4bdcd5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 31 Jul 2012 05:45:30 +0000
Subject: ipv4: percpu nh_rth_output cache

Input path is mostly run under RCU and doesnt touch dst refcnt

But output path on forwarding or UDP workloads hits
badly dst refcount, and we have lot of false sharing, for example
in ipv4_mtu() when reading rt->rt_pmtu

Using a percpu cache for nh_rth_output gives a nice performance
increase at a small cost.

24 udpflood test on my 24 cpu machine (dummy0 output device)
(each process sends 1.000.000 udp frames, 24 processes are started)

before : 5.24 s
after : 2.06 s
For reference, time on linux-3.5 : 6.60 s

Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 20 +++++++++++++++++++-
 net/ipv4/route.c         | 18 +++++++++++++-----
 2 files changed, 32 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 625cf185c489..fe2ca02a1979 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -176,6 +176,23 @@ static void rt_nexthop_free(struct rtable __rcu **rtp)
 	dst_free(&rt->dst);
 }
 
+static void rt_nexthop_free_cpus(struct rtable __rcu * __percpu *rtp)
+{
+	int cpu;
+
+	if (!rtp)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct rtable *rt;
+
+		rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
+		if (rt)
+			dst_free(&rt->dst);
+	}
+	free_percpu(rtp);
+}
+
 /* Release a nexthop info record */
 static void free_fib_info_rcu(struct rcu_head *head)
 {
@@ -186,7 +203,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
 			dev_put(nexthop_nh->nh_dev);
 		if (nexthop_nh->nh_exceptions)
 			free_nh_exceptions(nexthop_nh);
-		rt_nexthop_free(&nexthop_nh->nh_rth_output);
+		rt_nexthop_free_cpus(nexthop_nh->nh_pcpu_rth_output);
 		rt_nexthop_free(&nexthop_nh->nh_rth_input);
 	} endfor_nexthops(fi);
 
@@ -817,6 +834,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	fi->fib_nhs = nhs;
 	change_nexthops(fi) {
 		nexthop_nh->nh_parent = fi;
+		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
 	} endfor_nexthops(fi)
 
 	if (cfg->fc_mx) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2bd107477469..4f6276ce0af3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1206,11 +1206,15 @@ static inline void rt_free(struct rtable *rt)
 
 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 {
-	struct rtable *orig, *prev, **p = (struct rtable **)&nh->nh_rth_output;
+	struct rtable *orig, *prev, **p;
 
-	if (rt_is_input_route(rt))
+	if (rt_is_input_route(rt)) {
 		p = (struct rtable **)&nh->nh_rth_input;
-
+	} else {
+		if (!nh->nh_pcpu_rth_output)
+			goto nocache;
+		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
+	}
 	orig = *p;
 
 	prev = cmpxchg(p, orig, rt);
@@ -1223,6 +1227,7 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 		 * unsuccessful at storing this route into the cache
 		 * we really need to set it.
 		 */
+nocache:
 		rt->dst.flags |= DST_NOCACHE;
 	}
 }
@@ -1749,8 +1754,11 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	fnhe = NULL;
 	if (fi) {
 		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
-		if (!fnhe) {
-			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_output);
+		if (!fnhe && FIB_RES_NH(*res).nh_pcpu_rth_output) {
+			struct rtable __rcu **prth;
+
+			prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
+			rth = rcu_dereference(*prth);
 			if (rt_cache_valid(rth)) {
 				dst_hold(&rth->dst);
 				return rth;
-- 
cgit v1.2.1


From c5038a8327b980a5b279fa193163c468011de009 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 31 Jul 2012 15:02:02 -0700
Subject: ipv4: Cache routes in nexthop exception entries.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c |  39 +++++++++---------
 net/ipv4/route.c         | 103 ++++++++++++++++++++++++++---------------------
 2 files changed, 78 insertions(+), 64 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index fe2ca02a1979..da80dc14cc76 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,21 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
 	},
 };
 
+static void rt_fibinfo_free(struct rtable __rcu **rtp)
+{
+	struct rtable *rt = rcu_dereference_protected(*rtp, 1);
+
+	if (!rt)
+		return;
+
+	/* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
+	 * because we waited an RCU grace period before calling
+	 * free_fib_info_rcu()
+	 */
+
+	dst_free(&rt->dst);
+}
+
 static void free_nh_exceptions(struct fib_nh *nh)
 {
 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
@@ -153,6 +168,9 @@ static void free_nh_exceptions(struct fib_nh *nh)
 			struct fib_nh_exception *next;
 			
 			next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+
+			rt_fibinfo_free(&fnhe->fnhe_rth);
+
 			kfree(fnhe);
 
 			fnhe = next;
@@ -161,22 +179,7 @@ static void free_nh_exceptions(struct fib_nh *nh)
 	kfree(hash);
 }
 
-static void rt_nexthop_free(struct rtable __rcu **rtp)
-{
-	struct rtable *rt = rcu_dereference_protected(*rtp, 1);
-
-	if (!rt)
-		return;
-
-	/* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
-	 * because we waited an RCU grace period before calling
-	 * free_fib_info_rcu()
-	 */
-
-	dst_free(&rt->dst);
-}
-
-static void rt_nexthop_free_cpus(struct rtable __rcu * __percpu *rtp)
+static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
 {
 	int cpu;
 
@@ -203,8 +206,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 			dev_put(nexthop_nh->nh_dev);
 		if (nexthop_nh->nh_exceptions)
 			free_nh_exceptions(nexthop_nh);
-		rt_nexthop_free_cpus(nexthop_nh->nh_pcpu_rth_output);
-		rt_nexthop_free(&nexthop_nh->nh_rth_input);
+		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
+		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4f6276ce0af3..b102eeb16e34 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -587,11 +587,17 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 		build_sk_flow_key(fl4, sk);
 }
 
-static DEFINE_SEQLOCK(fnhe_seqlock);
+static inline void rt_free(struct rtable *rt)
+{
+	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_SPINLOCK(fnhe_lock);
 
 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 {
 	struct fib_nh_exception *fnhe, *oldest;
+	struct rtable *orig;
 
 	oldest = rcu_dereference(hash->chain);
 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
@@ -599,6 +605,11 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 			oldest = fnhe;
 	}
+	orig = rcu_dereference(oldest->fnhe_rth);
+	if (orig) {
+		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
+		rt_free(orig);
+	}
 	return oldest;
 }
 
@@ -620,7 +631,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 	int depth;
 	u32 hval = fnhe_hashfun(daddr);
 
-	write_seqlock_bh(&fnhe_seqlock);
+	spin_lock_bh(&fnhe_lock);
 
 	hash = nh->nh_exceptions;
 	if (!hash) {
@@ -667,7 +678,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 	fnhe->fnhe_stamp = jiffies;
 
 out_unlock:
-	write_sequnlock_bh(&fnhe_seqlock);
+	spin_unlock_bh(&fnhe_lock);
 	return;
 }
 
@@ -1167,41 +1178,40 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 			      __be32 daddr)
 {
-	__be32 fnhe_daddr, gw;
-	unsigned long expires;
-	unsigned int seq;
-	u32 pmtu;
-
-restart:
-	seq = read_seqbegin(&fnhe_seqlock);
-	fnhe_daddr = fnhe->fnhe_daddr;
-	gw = fnhe->fnhe_gw;
-	pmtu = fnhe->fnhe_pmtu;
-	expires = fnhe->fnhe_expires;
-	if (read_seqretry(&fnhe_seqlock, seq))
-		goto restart;
-
-	if (daddr != fnhe_daddr)
-		return;
+	spin_lock_bh(&fnhe_lock);
 
-	if (pmtu) {
-		unsigned long diff = expires - jiffies;
+	if (daddr == fnhe->fnhe_daddr) {
+		struct rtable *orig;
 
-		if (time_before(jiffies, expires)) {
-			rt->rt_pmtu = pmtu;
-			dst_set_expires(&rt->dst, diff);
+		if (fnhe->fnhe_pmtu) {
+			unsigned long expires = fnhe->fnhe_expires;
+			unsigned long diff = expires - jiffies;
+
+			if (time_before(jiffies, expires)) {
+				rt->rt_pmtu = fnhe->fnhe_pmtu;
+				dst_set_expires(&rt->dst, diff);
+			}
+		}
+		if (fnhe->fnhe_gw) {
+			rt->rt_flags |= RTCF_REDIRECTED;
+			rt->rt_gateway = fnhe->fnhe_gw;
 		}
-	}
-	if (gw) {
-		rt->rt_flags |= RTCF_REDIRECTED;
-		rt->rt_gateway = gw;
-	}
-	fnhe->fnhe_stamp = jiffies;
-}
 
-static inline void rt_free(struct rtable *rt)
-{
-	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
+		orig = rcu_dereference(fnhe->fnhe_rth);
+		rcu_assign_pointer(fnhe->fnhe_rth, rt);
+		if (orig)
+			rt_free(orig);
+
+		fnhe->fnhe_stamp = jiffies;
+	} else {
+		/* Routes we intend to cache in nexthop exception have
+		 * the DST_NOCACHE bit clear.  However, if we are
+		 * unsuccessful at storing this route into the cache
+		 * we really need to set it.
+		 */
+		rt->dst.flags |= DST_NOCACHE;
+	}
+	spin_unlock_bh(&fnhe_lock);
 }
 
 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
@@ -1249,13 +1259,13 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 
 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = nh->nh_gw;
-		if (unlikely(fnhe))
-			rt_bind_exception(rt, fnhe, daddr);
 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
-		if (!(rt->dst.flags & DST_NOCACHE))
+		if (unlikely(fnhe))
+			rt_bind_exception(rt, fnhe, daddr);
+		else if (!(rt->dst.flags & DST_NOCACHE))
 			rt_cache_route(nh, rt);
 	}
 
@@ -1753,22 +1763,23 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	fnhe = NULL;
 	if (fi) {
-		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
-		if (!fnhe && FIB_RES_NH(*res).nh_pcpu_rth_output) {
-			struct rtable __rcu **prth;
+		struct rtable __rcu **prth;
 
+		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
+		if (fnhe)
+			prth = &fnhe->fnhe_rth;
+		else
 			prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
-			rth = rcu_dereference(*prth);
-			if (rt_cache_valid(rth)) {
-				dst_hold(&rth->dst);
-				return rth;
-			}
+		rth = rcu_dereference(*prth);
+		if (rt_cache_valid(rth)) {
+			dst_hold(&rth->dst);
+			return rth;
 		}
 	}
 	rth = rt_dst_alloc(dev_out,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
-			   fi && !fnhe);
+			   fi);
 	if (!rth)
 		return ERR_PTR(-ENOBUFS);
 
-- 
cgit v1.2.1


From caacf05e5ad1abf0a2864863da4e33024bc68ec6 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 31 Jul 2012 15:06:50 -0700
Subject: ipv4: Properly purge netdev references on uncached routes.

When a device is unregistered, we have to purge all of the
references to it that may exist in the entire system.

If a route is uncached, we currently have no way of accomplishing
this.

So create a global list that is scanned when a network device goes
down.  This mirrors the logic in net/core/dst.c's dst_ifdown().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c |  1 +
 net/ipv4/route.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++---
 net/ipv4/xfrm4_policy.c |  1 +
 3 files changed, 66 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8732cc7920ed..c43ae3fba792 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1046,6 +1046,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 
 	if (event == NETDEV_UNREGISTER) {
 		fib_disable_ip(dev, 2, -1);
+		rt_flush_dev(dev);
 		return NOTIFY_DONE;
 	}
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b102eeb16e34..c035251beb07 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -147,6 +147,7 @@ static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb, u32 mtu);
 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
+static void		ipv4_dst_destroy(struct dst_entry *dst);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 			    int how)
@@ -170,6 +171,7 @@ static struct dst_ops ipv4_dst_ops = {
 	.default_advmss =	ipv4_default_advmss,
 	.mtu =			ipv4_mtu,
 	.cow_metrics =		ipv4_cow_metrics,
+	.destroy =		ipv4_dst_destroy,
 	.ifdown =		ipv4_dst_ifdown,
 	.negative_advice =	ipv4_negative_advice,
 	.link_failure =		ipv4_link_failure,
@@ -1175,9 +1177,11 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 	return NULL;
 }
 
-static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 			      __be32 daddr)
 {
+	bool ret = false;
+
 	spin_lock_bh(&fnhe_lock);
 
 	if (daddr == fnhe->fnhe_daddr) {
@@ -1203,6 +1207,7 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 			rt_free(orig);
 
 		fnhe->fnhe_stamp = jiffies;
+		ret = true;
 	} else {
 		/* Routes we intend to cache in nexthop exception have
 		 * the DST_NOCACHE bit clear.  However, if we are
@@ -1212,11 +1217,14 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 		rt->dst.flags |= DST_NOCACHE;
 	}
 	spin_unlock_bh(&fnhe_lock);
+
+	return ret;
 }
 
-static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 {
 	struct rtable *orig, *prev, **p;
+	bool ret = true;
 
 	if (rt_is_input_route(rt)) {
 		p = (struct rtable **)&nh->nh_rth_input;
@@ -1239,6 +1247,48 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 		 */
 nocache:
 		rt->dst.flags |= DST_NOCACHE;
+		ret = false;
+	}
+
+	return ret;
+}
+
+static DEFINE_SPINLOCK(rt_uncached_lock);
+static LIST_HEAD(rt_uncached_list);
+
+static void rt_add_uncached_list(struct rtable *rt)
+{
+	spin_lock_bh(&rt_uncached_lock);
+	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
+	spin_unlock_bh(&rt_uncached_lock);
+}
+
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+	struct rtable *rt = (struct rtable *) dst;
+
+	if (dst->flags & DST_NOCACHE) {
+		spin_lock_bh(&rt_uncached_lock);
+		list_del(&rt->rt_uncached);
+		spin_unlock_bh(&rt_uncached_lock);
+	}
+}
+
+void rt_flush_dev(struct net_device *dev)
+{
+	if (!list_empty(&rt_uncached_list)) {
+		struct net *net = dev_net(dev);
+		struct rtable *rt;
+
+		spin_lock_bh(&rt_uncached_lock);
+		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
+			if (rt->dst.dev != dev)
+				continue;
+			rt->dst.dev = net->loopback_dev;
+			dev_hold(rt->dst.dev);
+			dev_put(dev);
+		}
+		spin_unlock_bh(&rt_uncached_lock);
 	}
 }
 
@@ -1254,6 +1304,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 			   struct fib_nh_exception *fnhe,
 			   struct fib_info *fi, u16 type, u32 itag)
 {
+	bool cached = false;
+
 	if (fi) {
 		struct fib_nh *nh = &FIB_RES_NH(*res);
 
@@ -1264,10 +1316,12 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
 		if (unlikely(fnhe))
-			rt_bind_exception(rt, fnhe, daddr);
+			cached = rt_bind_exception(rt, fnhe, daddr);
 		else if (!(rt->dst.flags & DST_NOCACHE))
-			rt_cache_route(nh, rt);
+			cached = rt_cache_route(nh, rt);
 	}
+	if (unlikely(!cached))
+		rt_add_uncached_list(rt);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1334,6 +1388,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_iif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
+	INIT_LIST_HEAD(&rth->rt_uncached);
 	if (our) {
 		rth->dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
@@ -1459,6 +1514,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_iif 	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
+	INIT_LIST_HEAD(&rth->rt_uncached);
 
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
@@ -1625,6 +1681,7 @@ local_input:
 	rth->rt_iif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
+	INIT_LIST_HEAD(&rth->rt_uncached);
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
 		rth->dst.error= -err;
@@ -1792,6 +1849,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_iif	= orig_oif ? : 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = 0;
+	INIT_LIST_HEAD(&rth->rt_uncached);
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
@@ -2071,6 +2129,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_type = ort->rt_type;
 		rt->rt_gateway = ort->rt_gateway;
 
+		INIT_LIST_HEAD(&rt->rt_uncached);
+
 		dst_free(new);
 	}
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index c6281847f16a..681ea2f413e2 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -92,6 +92,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_type = rt->rt_type;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
 
 	return 0;
 }
-- 
cgit v1.2.1


From c255a458055e459f65eb7b7f51dc5dbdd0caf1d8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 31 Jul 2012 16:43:02 -0700
Subject: memcg: rename config variables

Sanity:

CONFIG_CGROUP_MEM_RES_CTLR -> CONFIG_MEMCG
CONFIG_CGROUP_MEM_RES_CTLR_SWAP -> CONFIG_MEMCG_SWAP
CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED -> CONFIG_MEMCG_SWAP_ENABLED
CONFIG_CGROUP_MEM_RES_CTLR_KMEM -> CONFIG_MEMCG_KMEM

[mhocko@suse.cz: fix missed bits]
Cc: Glauber Costa <glommer@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/ipv4/Makefile          | 2 +-
 net/ipv4/sysctl_net_ipv4.c | 4 ++--
 net/ipv4/tcp_ipv4.c        | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ae2ccf2890e4..15ca63ec604e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o
+obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5840c3255721..ed7db3f1b6f2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -184,7 +184,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
 	int ret;
 	unsigned long vec[3];
 	struct net *net = current->nsproxy->net_ns;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
 	struct mem_cgroup *memcg;
 #endif
 
@@ -203,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
 	if (ret)
 		return ret;
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(current);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2fbd9921253f..4bc8f6769b57 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2633,7 +2633,7 @@ struct proto tcp_prot = {
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
 #endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
 	.init_cgroup		= tcp_init_cgroup,
 	.destroy_cgroup		= tcp_destroy_cgroup,
 	.proto_cgroup		= tcp_proto_cgroup,
-- 
cgit v1.2.1


From 99a1dec70d5acbd8c6b3928cdebb4a2d1da676c8 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 31 Jul 2012 16:44:14 -0700
Subject: net: introduce sk_gfp_atomic() to allow addition of GFP flags
 depending on the individual socket

Introduce sk_gfp_atomic(), this function allows to inject sock specific
flags to each sock related allocation.  It is only used on allocation
paths that may be required for writing pages back to network storage.

[davem@davemloft.net: Use sk_gfp_atomic only when necessary]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/ipv4/tcp_output.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 33cd065cfbd8..3f1bcff0b10b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2045,7 +2045,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 	if (unlikely(sk->sk_state == TCP_CLOSE))
 		return;
 
-	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
+	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+			   sk_gfp_atomic(sk, GFP_ATOMIC)))
 		tcp_check_probe_timer(sk);
 }
 
@@ -2666,7 +2667,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
 		s_data_desired = cvp->s_data_desired;
-	skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
+	skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
+			sk_gfp_atomic(sk, GFP_ATOMIC));
 	if (unlikely(!skb)) {
 		dst_release(dst);
 		return NULL;
@@ -3064,7 +3066,7 @@ void tcp_send_ack(struct sock *sk)
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
 	 */
-	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
 	if (buff == NULL) {
 		inet_csk_schedule_ack(sk);
 		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -3079,7 +3081,7 @@ void tcp_send_ack(struct sock *sk)
 
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
-	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
+	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
 }
 
 /* This routine sends a packet with an out of date sequence
@@ -3099,7 +3101,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 	struct sk_buff *skb;
 
 	/* We don't queue it, tcp_transmit_skb() sets ownership. */
-	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
 	if (skb == NULL)
 		return -1;
 
-- 
cgit v1.2.1


From c76562b6709fee5eff8a6a779be41c0bce661fd7 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 31 Jul 2012 16:44:41 -0700
Subject: netvm: prevent a stream-specific deadlock

This patch series is based on top of "Swap-over-NBD without deadlocking
v15" as it depends on the same reservation of PF_MEMALLOC reserves logic.

When a user or administrator requires swap for their application, they
create a swap partition and file, format it with mkswap and activate it
with swapon.  In diskless systems this is not an option so if swap if
required then swapping over the network is considered.  The two likely
scenarios are when blade servers are used as part of a cluster where the
form factor or maintenance costs do not allow the use of disks and thin
clients.

The Linux Terminal Server Project recommends the use of the Network Block
Device (NBD) for swap but this is not always an option.  There is no
guarantee that the network attached storage (NAS) device is running Linux
or supports NBD.  However, it is likely that it supports NFS so there are
users that want support for swapping over NFS despite any performance
concern.  Some distributions currently carry patches that support swapping
over NFS but it would be preferable to support it in the mainline kernel.

Patch 1 avoids a stream-specific deadlock that potentially affects TCP.

Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC
	reserves.

Patch 3 adds three helpers for filesystems to handle swap cache pages.
	For example, page_file_mapping() returns page->mapping for
	file-backed pages and the address_space of the underlying
	swap file for swap cache pages.

Patch 4 adds two address_space_operations to allow a filesystem
	to pin all metadata relevant to a swapfile in memory. Upon
	successful activation, the swapfile is marked SWP_FILE and
	the address space operation ->direct_IO is used for writing
	and ->readpage for reading in swap pages.

Patch 5 notes that patch 3 is bolting
	filesystem-specific-swapfile-support onto the side and that
	the default handlers have different information to what
	is available to the filesystem. This patch refactors the
	code so that there are generic handlers for each of the new
	address_space operations.

Patch 6 adds an API to allow a vector of kernel addresses to be
	translated to struct pages and pinned for IO.

Patch 7 adds support for using highmem pages for swap by kmapping
	the pages before calling the direct_IO handler.

Patch 8 updates NFS to use the helpers from patch 3 where necessary.

Patch 9 avoids setting PF_private on PG_swapcache pages within NFS.

Patch 10 implements the new swapfile-related address_space operations
	for NFS and teaches the direct IO handler how to manage
	kernel addresses.

Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO
	where appropriate.

Patch 12 fixes a NULL pointer dereference that occurs when using
	swap-over-NFS.

With the patches applied, it is possible to mount a swapfile that is on an
NFS filesystem.  Swap performance is not great with a swap stress test
taking roughly twice as long to complete than if the swap device was
backed by NBD.

This patch: netvm: prevent a stream-specific deadlock

It could happen that all !SOCK_MEMALLOC sockets have buffered so much data
that we're over the global rmem limit.  This will prevent SOCK_MEMALLOC
buffers from receiving data, which will prevent userspace from running,
which is needed to reduce the buffered data.

Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit.  Once
this change it applied, it is important that sockets that set
SOCK_MEMALLOC do not clear the flag until the socket is being torn down.
If this happens, a warning is generated and the tokens reclaimed to avoid
accounting errors until the bug is fixed.

[davem@davemloft.net: Warning about clearing SOCK_MEMALLOC]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/ipv4/tcp_input.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a356e1fecf9a..00b91b4b8665 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4351,19 +4351,20 @@ static void tcp_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk);
 static int tcp_prune_queue(struct sock *sk);
 
-static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+				 unsigned int size)
 {
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    !sk_rmem_schedule(sk, size)) {
+	    !sk_rmem_schedule(sk, skb, size)) {
 
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
 
-		if (!sk_rmem_schedule(sk, size)) {
+		if (!sk_rmem_schedule(sk, skb, size)) {
 			if (!tcp_prune_ofo_queue(sk))
 				return -1;
 
-			if (!sk_rmem_schedule(sk, size))
+			if (!sk_rmem_schedule(sk, skb, size))
 				return -1;
 		}
 	}
@@ -4418,7 +4419,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 	TCP_ECN_check_ce(tp, skb);
 
-	if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) {
+	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
 		__kfree_skb(skb);
 		return;
@@ -4552,17 +4553,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
 
 int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct tcphdr *th;
 	bool fragstolen;
 
-	if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
-		goto err;
-
 	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
 	if (!skb)
 		goto err;
 
+	if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+		goto err_free;
+
 	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
 	skb_reset_transport_header(skb);
 	memset(th, 0, sizeof(*th));
@@ -4633,7 +4634,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		if (eaten <= 0) {
 queue_and_out:
 			if (eaten < 0 &&
-			    tcp_try_rmem_schedule(sk, skb->truesize))
+			    tcp_try_rmem_schedule(sk, skb, skb->truesize))
 				goto drop;
 
 			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
-- 
cgit v1.2.1


From 1485348d2424e1131ea42efc033cbd9366462b01 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 30 Jul 2012 16:11:42 +0000
Subject: tcp: Apply device TSO segment limit earlier

Cache the device gso_max_segs in sock::sk_gso_max_segs and use it to
limit the size of TSO skbs.  This avoids the need to fall back to
software GSO for local TCP senders.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c        |  4 +++-
 net/ipv4/tcp_cong.c   |  3 ++-
 net/ipv4/tcp_output.c | 21 ++++++++++++---------
 3 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e7e6eeae49c0..2109ff4a1daf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -811,7 +811,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 			   old_size_goal + mss_now > xmit_size_goal)) {
 			xmit_size_goal = old_size_goal;
 		} else {
-			tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
+			tp->xmit_size_goal_segs =
+				min_t(u16, xmit_size_goal / mss_now,
+				      sk->sk_gso_max_segs);
 			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
 		}
 	}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4d4db16e336e..1432cdb0644c 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -291,7 +291,8 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
 	left = tp->snd_cwnd - in_flight;
 	if (sk_can_gso(sk) &&
 	    left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
-	    left * tp->mss_cache < sk->sk_gso_max_size)
+	    left * tp->mss_cache < sk->sk_gso_max_size &&
+	    left < sk->sk_gso_max_segs)
 		return true;
 	return left <= tcp_max_tso_deferred_mss(tp);
 }
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3f1bcff0b10b..a7b3ec9b6c3e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1522,21 +1522,21 @@ static void tcp_cwnd_validate(struct sock *sk)
  * when we would be allowed to send the split-due-to-Nagle skb fully.
  */
 static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
-					unsigned int mss_now, unsigned int cwnd)
+					unsigned int mss_now, unsigned int max_segs)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 needed, window, cwnd_len;
+	u32 needed, window, max_len;
 
 	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
-	cwnd_len = mss_now * cwnd;
+	max_len = mss_now * max_segs;
 
-	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
-		return cwnd_len;
+	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
+		return max_len;
 
 	needed = min(skb->len, window);
 
-	if (cwnd_len <= needed)
-		return cwnd_len;
+	if (max_len <= needed)
+		return max_len;
 
 	return needed - needed % mss_now;
 }
@@ -1765,7 +1765,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	limit = min(send_win, cong_win);
 
 	/* If a full-sized TSO skb can be sent, do it. */
-	if (limit >= sk->sk_gso_max_size)
+	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
+			   sk->sk_gso_max_segs * tp->mss_cache))
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
@@ -1999,7 +2000,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
-						    cwnd_quota);
+						    min_t(unsigned int,
+							  cwnd_quota,
+							  sk->sk_gso_max_segs));
 
 		if (skb->len > limit &&
 		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
-- 
cgit v1.2.1


From e33cdac014d50dd9753e1399ae8b0b5cd98d7aa0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 1 Aug 2012 23:23:40 +0000
Subject: ipv4: route.c cleanup

Remove unused includes after IP cache removal

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c035251beb07..e4ba974f143c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -70,7 +70,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/bootmem.h>
 #include <linux/string.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
@@ -80,7 +79,6 @@
 #include <linux/netdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/init.h>
-#include <linux/workqueue.h>
 #include <linux/skbuff.h>
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
@@ -88,11 +86,9 @@
 #include <linux/mroute.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/random.h>
-#include <linux/jhash.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
 #include <linux/slab.h>
-#include <linux/prefetch.h>
 #include <net/dst.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
-- 
cgit v1.2.1


From 40384999d142552b81aeb596c2ae663dd3ff60ce Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 3 Aug 2012 21:06:50 +0000
Subject: ipv4: change inet_addr_hash()

Use net_hash_mix(net) instead of hash_ptr(net, 8), and use
hash_32() instead of using a serie of XOR

Define IN4_ADDR_HSIZE_SHIFT for clarity

__ip_dev_find() can perform the net_eq() call only if ifa_local
matches the key, to avoid unneeded dereferences.

remove inline attributes

# size net/ipv4/devinet.o.before net/ipv4/devinet.o
   text	   data	    bss	    dec	    hex	filename
  17471	   2545	   2048	  22064	   5630	net/ipv4/devinet.o.before
  17335	   2545	   2048	  21928	   55a8	net/ipv4/devinet.o

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 44bf82e3aef7..adf273f8ad2e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -94,25 +94,22 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
-/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
- * value.  So if you change this define, make appropriate changes to
- * inet_addr_hash as well.
- */
-#define IN4_ADDR_HSIZE	256
+#define IN4_ADDR_HSIZE_SHIFT	8
+#define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT)
+
 static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
 static DEFINE_SPINLOCK(inet_addr_hash_lock);
 
-static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+static u32 inet_addr_hash(struct net *net, __be32 addr)
 {
-	u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+	u32 val = (__force u32) addr ^ net_hash_mix(net);
 
-	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
-		(IN4_ADDR_HSIZE - 1));
+	return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
 }
 
 static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
 {
-	unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
+	u32 hash = inet_addr_hash(net, ifa->ifa_local);
 
 	spin_lock(&inet_addr_hash_lock);
 	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
@@ -136,18 +133,18 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
  */
 struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 {
-	unsigned int hash = inet_addr_hash(net, addr);
+	u32 hash = inet_addr_hash(net, addr);
 	struct net_device *result = NULL;
 	struct in_ifaddr *ifa;
 	struct hlist_node *node;
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
-		struct net_device *dev = ifa->ifa_dev->dev;
-
-		if (!net_eq(dev_net(dev), net))
-			continue;
 		if (ifa->ifa_local == addr) {
+			struct net_device *dev = ifa->ifa_dev->dev;
+
+			if (!net_eq(dev_net(dev), net))
+				continue;
 			result = dev;
 			break;
 		}
@@ -182,10 +179,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 static void devinet_sysctl_register(struct in_device *idev);
 static void devinet_sysctl_unregister(struct in_device *idev);
 #else
-static inline void devinet_sysctl_register(struct in_device *idev)
+static void devinet_sysctl_register(struct in_device *idev)
 {
 }
-static inline void devinet_sysctl_unregister(struct in_device *idev)
+static void devinet_sysctl_unregister(struct in_device *idev)
 {
 }
 #endif
@@ -205,7 +202,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head)
 	kfree(ifa);
 }
 
-static inline void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct in_ifaddr *ifa)
 {
 	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 }
@@ -659,7 +656,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
  *	Determine a default network mask, based on the IP address.
  */
 
-static inline int inet_abc_len(__be32 addr)
+static int inet_abc_len(__be32 addr)
 {
 	int rc = -1;	/* Something else, probably a multicast. */
 
@@ -1124,7 +1121,7 @@ skip:
 	}
 }
 
-static inline bool inetdev_valid_mtu(unsigned int mtu)
+static bool inetdev_valid_mtu(unsigned int mtu)
 {
 	return mtu >= 68;
 }
@@ -1239,7 +1236,7 @@ static struct notifier_block ip_netdev_notifier = {
 	.notifier_call = inetdev_event,
 };
 
-static inline size_t inet_nlmsg_size(void)
+static size_t inet_nlmsg_size(void)
 {
 	return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
 	       + nla_total_size(4) /* IFA_ADDRESS */
-- 
cgit v1.2.1


From 9eb43e765368f835d92c93844ebce30da7efeb84 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 3 Aug 2012 21:27:25 +0000
Subject: ipv4: Introduce IN_DEV_NET_ROUTE_LOCALNET

performance profiles show a high cost in the IN_DEV_ROUTE_LOCALNET()
call done in ip_route_input_slow(), because of multiple dereferences,
even if cache lines are clean and available in cpu caches.

Since we already have the 'net' pointer, introduce
IN_DEV_NET_ROUTE_LOCALNET() macro avoiding two dereferences
(dev_net(in_dev->dev))

Also change the tests to use IN_DEV_NET_ROUTE_LOCALNET() only if saddr
or/and daddr are loopback addresse.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e4ba974f143c..21ad369014c0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1587,11 +1587,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (ipv4_is_zeronet(daddr))
 		goto martian_destination;
 
-	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
-		if (ipv4_is_loopback(daddr))
+	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
+	 * and call it once if daddr or/and saddr are loopback addresses
+	 */
+	if (ipv4_is_loopback(daddr)) {
+		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
 			goto martian_destination;
-
-		if (ipv4_is_loopback(saddr))
+	} else if (ipv4_is_loopback(saddr)) {
+		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
 			goto martian_source;
 	}
 
-- 
cgit v1.2.1


From 8e7dfbc8d1ea9ca9058aa641a8fe795ebca320e2 Mon Sep 17 00:00:00 2001
From: Silviu-Mihai Popescu <silviupopescu1990@gmail.com>
Date: Sat, 4 Aug 2012 09:31:29 +0000
Subject: tcp_output: fix sparse warning for tcp_wfree

Fix sparse warning:
	* symbol 'tcp_wfree' was not declared. Should it be static?

Signed-off-by: Silviu-Mihai Popescu <silviupopescu1990@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a7b3ec9b6c3e..20dfd892c86f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -940,7 +940,7 @@ void __init tcp_tasklet_init(void)
  * We cant xmit new skbs from this context, as we might already
  * hold qdisc lock.
  */
-void tcp_wfree(struct sk_buff *skb)
+static void tcp_wfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 	struct tcp_sock *tp = tcp_sk(sk);
-- 
cgit v1.2.1


From 9871f1ad677d95ffeca80e2c21b70af9bfc9cc91 Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov <segoon@openwall.com>
Date: Mon, 6 Aug 2012 03:55:29 +0000
Subject: ip: fix error handling in ip_finish_output2()

__neigh_create() returns either a pointer to struct neighbour or PTR_ERR().
But the caller expects it to return either a pointer or NULL.  Replace
the NULL check with IS_ERR() check.

The bug was introduced in a263b3093641fb1ec377582c90986a7fd0625184
("ipv4: Make neigh lookups directly in output packet path.").

Signed-off-by: Vasily Kulikov <segoon@openwall.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ba39a52d18c1..76dde25fb9a0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -197,7 +197,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
-	if (neigh) {
+	if (!IS_ERR(neigh)) {
 		int res = dst_neigh_output(dst, neigh, skb);
 
 		rcu_read_unlock_bh();
-- 
cgit v1.2.1


From 5d299f3d3c8a2fbc732b1bf03af36333ccec3130 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Aug 2012 05:09:33 +0000
Subject: net: ipv6: fix TCP early demux

IPv6 needs a cookie in dst_check() call.

We need to add rx_dst_cookie and provide a family independent
sk_rx_dst_set(sk, skb) method to properly support IPv6 TCP early demux.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c     |  4 +++-
 net/ipv4/tcp_ipv4.c      | 13 ++++++++++---
 net/ipv4/tcp_minisocks.c |  2 +-
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2fd2bc9e3c64..85308b90df80 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5392,6 +5392,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	if (unlikely(sk->sk_rx_dst == NULL))
+		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
 	/*
 	 *	Header prediction.
 	 *	The code loosely follows the one in the famous
@@ -5605,7 +5607,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 	tcp_set_state(sk, TCP_ESTABLISHED);
 
 	if (skb != NULL) {
-		inet_sk_rx_dst_set(sk, skb);
+		icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
 		security_inet_conn_established(sk, skb);
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 42b2a6a73092..272241f16fcb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1627,9 +1627,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 				sk->sk_rx_dst = NULL;
 			}
 		}
-		if (unlikely(sk->sk_rx_dst == NULL))
-			inet_sk_rx_dst_set(sk, skb);
-
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
 			rsk = sk;
 			goto reset;
@@ -1872,10 +1869,20 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = {
 	.twsk_destructor= tcp_twsk_destructor,
 };
 
+static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+
+	dst_hold(dst);
+	sk->sk_rx_dst = dst;
+	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+}
+
 const struct inet_connection_sock_af_ops ipv4_specific = {
 	.queue_xmit	   = ip_queue_xmit,
 	.send_check	   = tcp_v4_send_check,
 	.rebuild_header	   = inet_sk_rebuild_header,
+	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
 	.conn_request	   = tcp_v4_conn_request,
 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
 	.net_header_len	   = sizeof(struct iphdr),
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 232a90c3ec86..d9c9dcef2de3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -387,7 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
-		inet_sk_rx_dst_set(newsk, skb);
+		newicsk->icsk_af_ops->sk_rx_dst_set(newsk, skb);
 
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
-- 
cgit v1.2.1


From a9e050f4e7f9d36afe0dcc0bddba864ee442715e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 5 Aug 2012 22:34:50 +0000
Subject: net: tcp: GRO should be ECN friendly

While doing TCP ECN tests, I discovered GRO was reordering packets if it
receives one packet with CE set, while previous packets in same NAPI run
have ECT(0) for the same flow :

09:25:25.857620 IP (tos 0x2,ECT(0), ttl 64, id 27893, offset 0, flags
[DF], proto TCP (6), length 4396)
    172.30.42.19.54550 > 172.30.42.13.44139: Flags [.], seq
233801:238145, ack 1, win 115, options [nop,nop,TS val 3397779 ecr
1990627], length 4344

09:25:25.857626 IP (tos 0x3,CE, ttl 64, id 27892, offset 0, flags [DF],
proto TCP (6), length 1500)
    172.30.42.19.54550 > 172.30.42.13.44139: Flags [.], seq
232353:233801, ack 1, win 115, options [nop,nop,TS val 3397779 ecr
1990627], length 1448

09:25:25.857638 IP (tos 0x0, ttl 64, id 34581, offset 0, flags [DF],
proto TCP (6), length 64)
    172.30.42.13.44139 > 172.30.42.19.54550: Flags [.], cksum 0xac8f
(incorrect -> 0xca69), ack 232353, win 1271, options [nop,nop,TS val
1990627 ecr 3397779,nop,nop,sack 1 {233801:238145}], length 0

We have two problems here :

1) GRO reorders packets

  If NIC gave packet1, then packet2, which happen to be from "different
flows"  GRO feeds stack with packet2, then packet1. I have yet to
understand how to solve this problem.

2) GRO is not ECN friendly

Delivering packets out of order makes TCP stack not as fast as it could
be.

In this patch I suggest we make the tos test not part of the 'same_flow'
determination, but part of the 'should flush' logic

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index fe4582ca969a..6681ccf5c3ee 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1364,7 +1364,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	if (*(u8 *)iph != 0x45)
 		goto out_unlock;
 
-	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+	if (unlikely(ip_fast_csum((u8 *)iph, 5)))
 		goto out_unlock;
 
 	id = ntohl(*(__be32 *)&iph->id);
@@ -1380,7 +1380,6 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 		iph2 = ip_hdr(p);
 
 		if ((iph->protocol ^ iph2->protocol) |
-		    (iph->tos ^ iph2->tos) |
 		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
 		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
 			NAPI_GRO_CB(p)->same_flow = 0;
@@ -1390,6 +1389,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 		/* All fields must match except length and checksum. */
 		NAPI_GRO_CB(p)->flush |=
 			(iph->ttl ^ iph2->ttl) |
+			(iph->tos ^ iph2->tos) |
 			((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
 
 		NAPI_GRO_CB(p)->flush |= flush;
-- 
cgit v1.2.1


From aae06bf5f90403554f8d4ff83810a8281aef7f03 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Aug 2012 11:04:43 +0000
Subject: tcp: ecn: dont delay ACKS after CE

While playing with CoDel and ECN marking, I discovered a
non optimal behavior of receiver of CE (Congestion Encountered)
segments.

In pathological cases, sender has reduced its cwnd to low values,
and receiver delays its ACK (by 40 ms).

While RFC 3168 6.1.3 (The TCP Receiver) doesn't explicitly recommend
to send immediate ACKS, we believe its better to not delay ACKS, because
a CE segment should give same signal than a dropped segment, and its
quite important to reduce RTT to give ECE/CWR signals as fast as
possible.

Note we already call tcp_enter_quickack_mode() from TCP_ECN_check_ce()
if we receive a retransmit, for the same reason.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2fd2bc9e3c64..fa2c2c2cac2d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,7 +237,11 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
 			tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
-		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+			/* Better not delay acks, sender can have a very low cwnd */
+			tcp_enter_quickack_mode((struct sock *)tp);
+			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		}
 		/* fallinto */
 	default:
 		tp->ecn_flags |= TCP_ECN_SEEN;
-- 
cgit v1.2.1


From 79cda75a107da0d49732b5cb642b456264dd7e0e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 7 Aug 2012 10:45:47 +0000
Subject: fib: use __fls() on non null argument

__fls(x) is a bit faster than fls(x), granted we know x is non null.

As Ben Hutchings pointed out, fls(x) = __fls(x) + 1

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index f0cdb30921c0..f84a0e90d675 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1550,7 +1550,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		 * state.directly.
 		 */
 		if (pref_mismatch) {
-			int mp = KEYLENGTH - fls(pref_mismatch);
+			/* fls(x) = __fls(x) + 1 */
+			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
 
 			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
 				goto backtrace;
-- 
cgit v1.2.1


From 0c03eca3d995e73d691edea8c787e25929ec156d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 7 Aug 2012 00:47:11 +0000
Subject: net: fib: fix incorrect call_rcu_bh()

After IP route cache removal, I believe rcu_bh() has very little use and
we should remove this RCU variant, since it adds some cycles in fast
path.

Anyway, the call_rcu_bh() use in fib_true is obviously wrong, since
some users only assert rcu_read_lock().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index f0cdb30921c0..57bd978483e1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -367,7 +367,7 @@ static void __leaf_free_rcu(struct rcu_head *head)
 
 static inline void free_leaf(struct leaf *l)
 {
-	call_rcu_bh(&l->rcu, __leaf_free_rcu);
+	call_rcu(&l->rcu, __leaf_free_rcu);
 }
 
 static inline void free_leaf_info(struct leaf_info *leaf)
-- 
cgit v1.2.1


From 36471012e2ae28ca3178f84d4687a2d88a36593e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 9 Aug 2012 11:19:13 +0200
Subject: tcp: must free metrics at net dismantle

We currently leak all tcp metrics at struct net dismantle time.

tcp_net_metrics_exit() frees the hash table, we must first
iterate it to free all metrics.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 2288a6399e1e..0abe67bb4d3a 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -731,6 +731,18 @@ static int __net_init tcp_net_metrics_init(struct net *net)
 
 static void __net_exit tcp_net_metrics_exit(struct net *net)
 {
+	unsigned int i;
+
+	for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
+		struct tcp_metrics_block *tm, *next;
+
+		tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
+		while (tm) {
+			next = rcu_dereference_protected(tm->tcpm_next, 1);
+			kfree(tm);
+			tm = next;
+		}
+	}
 	kfree(net->ipv4.tcp_metrics_hash);
 }
 
-- 
cgit v1.2.1


From a399a8053164ec8bcb06fed52be9941a26ecde11 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Aug 2012 21:13:53 +0000
Subject: time: jiffies_delta_to_clock_t() helper to the rescue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Various /proc/net files sometimes report crazy timer values, expressed
in clock_t units.

This happens when an expired timer delta (expires - jiffies) is passed
to jiffies_to_clock_t().

This function has an overflow in :

return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);

commit cbbc719fccdb8cb (time: Change jiffies_to_clock_t() argument type
to unsigned long) only got around the problem.

As we cant output negative values in /proc/net/tcp without breaking
various tools, I suggest adding a jiffies_delta_to_clock_t() wrapper
that caps the negative delta to a 0 value.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Maciej Żenczykowski <maze@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: hank <pyu@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c     |  7 +++++--
 net/ipv4/tcp_ipv4.c | 13 +++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6699f23e6f55..0b5580c69f2d 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2435,6 +2435,8 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 		struct ip_mc_list *im = (struct ip_mc_list *)v;
 		struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
 		char   *querier;
+		long delta;
+
 #ifdef CONFIG_IP_MULTICAST
 		querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
 			  IGMP_V2_SEEN(state->in_dev) ? "V2" :
@@ -2448,11 +2450,12 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 				   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
 		}
 
+		delta = im->timer.expires - jiffies;
 		seq_printf(seq,
 			   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
 			   im->multiaddr, im->users,
-			   im->tm_running, im->tm_running ?
-			   jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+			   im->tm_running,
+			   im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
 			   im->reporter);
 	}
 	return 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 42b2a6a73092..c660d2c19a2b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2385,7 +2385,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
 			 struct seq_file *f, int i, int uid, int *len)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
-	int ttd = req->expires - jiffies;
+	long delta = req->expires - jiffies;
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
@@ -2397,7 +2397,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
 		TCP_SYN_RECV,
 		0, 0, /* could print option size, but that is af dependent. */
 		1,    /* timers active (only the expire timer) */
-		jiffies_to_clock_t(ttd),
+		jiffies_delta_to_clock_t(delta),
 		req->retrans,
 		uid,
 		0,  /* non standard timer */
@@ -2448,7 +2448,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		tp->write_seq - tp->snd_una,
 		rx_queue,
 		timer_active,
-		jiffies_to_clock_t(timer_expires - jiffies),
+		jiffies_delta_to_clock_t(timer_expires - jiffies),
 		icsk->icsk_retransmits,
 		sock_i_uid(sk),
 		icsk->icsk_probes_out,
@@ -2467,10 +2467,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
 {
 	__be32 dest, src;
 	__u16 destp, srcp;
-	int ttd = tw->tw_ttd - jiffies;
-
-	if (ttd < 0)
-		ttd = 0;
+	long delta = tw->tw_ttd - jiffies;
 
 	dest  = tw->tw_daddr;
 	src   = tw->tw_rcv_saddr;
@@ -2480,7 +2477,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
-		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
+		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
 		atomic_read(&tw->tw_refcnt), tw, len);
 }
 
-- 
cgit v1.2.1


From 1fb9489bf190ce2b3fc03891f3de4b2d30600e28 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 8 Aug 2012 21:53:36 +0000
Subject: net: Loopback ifindex is constant now

As pointed out, there are places, that access net->loopback_dev->ifindex
and after ifindex generation is made per-net this value becomes constant
equals 1. So go ahead and introduce the LOOPBACK_IFINDEX constant and use
it where appropriate.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c           | 2 +-
 net/ipv4/ipmr.c                   | 2 +-
 net/ipv4/netfilter/ipt_rpfilter.c | 2 +-
 net/ipv4/route.c                  | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c43ae3fba792..7f073a38c87d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -218,7 +218,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	scope = RT_SCOPE_UNIVERSE;
 	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 		fl4.flowi4_oif = 0;
-		fl4.flowi4_iif = net->loopback_dev->ifindex;
+		fl4.flowi4_iif = LOOPBACK_IFINDEX;
 		fl4.daddr = ip_hdr(skb)->saddr;
 		fl4.saddr = 0;
 		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8eec8f4a0536..3a57570c8ee5 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1798,7 +1798,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
 		.flowi4_oif = (rt_is_output_route(rt) ?
 			       skb->dev->ifindex : 0),
 		.flowi4_iif = (rt_is_output_route(rt) ?
-			       net->loopback_dev->ifindex :
+			       LOOPBACK_IFINDEX :
 			       skb->dev->ifindex),
 		.flowi4_mark = skb->mark,
 	};
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 31371be8174b..c30130062cd6 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
 			return ipv4_is_local_multicast(iph->daddr) ^ invert;
 		flow.flowi4_iif = 0;
 	} else {
-		flow.flowi4_iif = dev_net(par->in)->loopback_dev->ifindex;
+		flow.flowi4_iif = LOOPBACK_IFINDEX;
 	}
 
 	flow.daddr = iph->saddr;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 21ad369014c0..c58137391a3d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1619,7 +1619,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	if (res.type == RTN_LOCAL) {
 		err = fib_validate_source(skb, saddr, daddr, tos,
-					  net->loopback_dev->ifindex,
+					  LOOPBACK_IFINDEX,
 					  dev, in_dev, &itag);
 		if (err < 0)
 			goto martian_source_keep_err;
@@ -1895,7 +1895,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 
 	orig_oif = fl4->flowi4_oif;
 
-	fl4->flowi4_iif = net->loopback_dev->ifindex;
+	fl4->flowi4_iif = LOOPBACK_IFINDEX;
 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
@@ -1984,7 +1984,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 		if (!fl4->daddr)
 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
 		dev_out = net->loopback_dev;
-		fl4->flowi4_oif = net->loopback_dev->ifindex;
+		fl4->flowi4_oif = LOOPBACK_IFINDEX;
 		res.type = RTN_LOCAL;
 		flags |= RTCF_LOCAL;
 		goto make_route;
-- 
cgit v1.2.1


From 3a7c384ffd57ef5fbd95f48edaa2ca4eb3d9f2ee Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 9 Aug 2012 13:56:06 +0000
Subject: ipv4: tcp: unicast_sock should not land outside of TCP stack

commit be9f4a44e7d41cee (ipv4: tcp: remove per net tcp_sock) added a
selinux regression, reported and bisected by John Stultz

selinux_ip_postroute_compat() expect to find a valid sk->sk_security
pointer, but this field is NULL for unicast_sock

It turns out that unicast_sock are really temporary stuff to be able
to reuse  part of IP stack (ip_append_data()/ip_push_pending_frames())

Fact is that frames sent by ip_send_unicast_reply() should be orphaned
to not fool LSM.

Note IPv6 never had this problem, as tcp_v6_send_response() doesnt use a
fake socket at all. I'll probably implement tcp_v4_send_response() to
remove these unicast_sock in linux-3.7

Reported-by: John Stultz <johnstul@us.ibm.com>
Bisected-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Eric Paris <eparis@parisplace.org>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 76dde25fb9a0..ec410e08b4b9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1536,6 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
 			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
 								arg->csum));
 		nskb->ip_summed = CHECKSUM_NONE;
+		skb_orphan(nskb);
 		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
 		ip_push_pending_frames(sk, &fl4);
 	}
-- 
cgit v1.2.1


From 63d02d157ec4124990258d66517b6c11fd6df0cf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 9 Aug 2012 14:11:00 +0000
Subject: net: tcp: ipv6_mapped needs sk_rx_dst_set method

commit 5d299f3d3c8a2fb (net: ipv6: fix TCP early demux) added a
regression for ipv6_mapped case.

[   67.422369] SELinux: initialized (dev autofs, type autofs), uses
genfs_contexts
[   67.449678] SELinux: initialized (dev autofs, type autofs), uses
genfs_contexts
[   92.631060] BUG: unable to handle kernel NULL pointer dereference at
(null)
[   92.631435] IP: [<          (null)>]           (null)
[   92.631645] PGD 0
[   92.631846] Oops: 0010 [#1] SMP
[   92.632095] Modules linked in: autofs4 sunrpc ipv6 dm_mirror
dm_region_hash dm_log dm_multipath dm_mod video sbs sbshc battery ac lp
parport sg snd_hda_intel snd_hda_codec snd_seq_oss snd_seq_midi_event
snd_seq snd_seq_device pcspkr snd_pcm_oss snd_mixer_oss snd_pcm
snd_timer serio_raw button floppy snd i2c_i801 i2c_core soundcore
snd_page_alloc shpchp ide_cd_mod cdrom microcode ehci_hcd ohci_hcd
uhci_hcd
[   92.634294] CPU 0
[   92.634294] Pid: 4469, comm: sendmail Not tainted 3.6.0-rc1 #3
[   92.634294] RIP: 0010:[<0000000000000000>]  [<          (null)>]
(null)
[   92.634294] RSP: 0018:ffff880245fc7cb0  EFLAGS: 00010282
[   92.634294] RAX: ffffffffa01985f0 RBX: ffff88024827ad00 RCX:
0000000000000000
[   92.634294] RDX: 0000000000000218 RSI: ffff880254735380 RDI:
ffff88024827ad00
[   92.634294] RBP: ffff880245fc7cc8 R08: 0000000000000001 R09:
0000000000000000
[   92.634294] R10: 0000000000000000 R11: ffff880245fc7bf8 R12:
ffff880254735380
[   92.634294] R13: ffff880254735380 R14: 0000000000000000 R15:
7fffffffffff0218
[   92.634294] FS:  00007f4516ccd6f0(0000) GS:ffff880256600000(0000)
knlGS:0000000000000000
[   92.634294] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   92.634294] CR2: 0000000000000000 CR3: 0000000245ed1000 CR4:
00000000000007f0
[   92.634294] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[   92.634294] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[   92.634294] Process sendmail (pid: 4469, threadinfo ffff880245fc6000,
task ffff880254b8cac0)
[   92.634294] Stack:
[   92.634294]  ffffffff813837a7 ffff88024827ad00 ffff880254b6b0e8
ffff880245fc7d68
[   92.634294]  ffffffff81385083 00000000001d2680 ffff8802547353a8
ffff880245fc7d18
[   92.634294]  ffffffff8105903a ffff88024827ad60 0000000000000002
00000000000000ff
[   92.634294] Call Trace:
[   92.634294]  [<ffffffff813837a7>] ? tcp_finish_connect+0x2c/0xfa
[   92.634294]  [<ffffffff81385083>] tcp_rcv_state_process+0x2b6/0x9c6
[   92.634294]  [<ffffffff8105903a>] ? sched_clock_cpu+0xc3/0xd1
[   92.634294]  [<ffffffff81059073>] ? local_clock+0x2b/0x3c
[   92.634294]  [<ffffffff8138caf3>] tcp_v4_do_rcv+0x63a/0x670
[   92.634294]  [<ffffffff8133278e>] release_sock+0x128/0x1bd
[   92.634294]  [<ffffffff8139f060>] __inet_stream_connect+0x1b1/0x352
[   92.634294]  [<ffffffff813325f5>] ? lock_sock_nested+0x74/0x7f
[   92.634294]  [<ffffffff8104b333>] ? wake_up_bit+0x25/0x25
[   92.634294]  [<ffffffff813325f5>] ? lock_sock_nested+0x74/0x7f
[   92.634294]  [<ffffffff8139f223>] ? inet_stream_connect+0x22/0x4b
[   92.634294]  [<ffffffff8139f234>] inet_stream_connect+0x33/0x4b
[   92.634294]  [<ffffffff8132e8cf>] sys_connect+0x78/0x9e
[   92.634294]  [<ffffffff813fd407>] ? sysret_check+0x1b/0x56
[   92.634294]  [<ffffffff81088503>] ? __audit_syscall_entry+0x195/0x1c8
[   92.634294]  [<ffffffff811cc26e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[   92.634294]  [<ffffffff813fd3e2>] system_call_fastpath+0x16/0x1b
[   92.634294] Code:  Bad RIP value.
[   92.634294] RIP  [<          (null)>]           (null)
[   92.634294]  RSP <ffff880245fc7cb0>
[   92.634294] CR2: 0000000000000000
[   92.648982] ---[ end trace 24e2bed94314c8d9 ]---
[   92.649146] Kernel panic - not syncing: Fatal exception in interrupt

Fix this using inet_sk_rx_dst_set(), and export this function in case
IPv6 is modular.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 272241f16fcb..767823764016 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1869,7 +1869,7 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = {
 	.twsk_destructor= tcp_twsk_destructor,
 };
 
-static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 
@@ -1877,6 +1877,7 @@ static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 	sk->sk_rx_dst = dst;
 	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 }
+EXPORT_SYMBOL(inet_sk_rx_dst_set);
 
 const struct inet_connection_sock_af_ops ipv4_specific = {
 	.queue_xmit	   = ip_queue_xmit,
-- 
cgit v1.2.1


From 02b69cbdc2fb2e1bfbfd9ac0c246d7be1b08d3cd Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 9 Aug 2012 10:08:46 +0000
Subject: netfilter: nf_ct_sip: fix IPv6 address parsing

Within SIP messages IPv6 addresses are enclosed in square brackets in most
cases, with the exception of the "received=" header parameter. Currently
the helper fails to parse enclosed addresses.

This patch:

- changes the SIP address parsing function to enforce square brackets
  when required, and accept them when not required but present, as
  recommended by RFC 5118.

- adds a new SDP address parsing function that never accepts square
  brackets since SDP doesn't use them.

With these changes, the SIP helper correctly parses all test messages
from RFC 5118 (Session Initiation Protocol (SIP) Torture Test Messages
for Internet Protocol Version 6 (IPv6)).

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_sip.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index ea4a23813d26..eef8f29e8bf8 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -173,7 +173,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 		 * the reply. */
 		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
 					       "maddr=", &poff, &plen,
-					       &addr) > 0 &&
+					       &addr, true) > 0 &&
 		    addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
 		    addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
 			buflen = sprintf(buffer, "%pI4",
@@ -187,7 +187,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 		 * from which the server received the request. */
 		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
 					       "received=", &poff, &plen,
-					       &addr) > 0 &&
+					       &addr, false) > 0 &&
 		    addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
 		    addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
 			buflen = sprintf(buffer, "%pI4",
-- 
cgit v1.2.1


From f22eb25cf5b1157b29ef88c793b71972efc47143 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 9 Aug 2012 10:08:47 +0000
Subject: netfilter: nf_nat_sip: fix via header translation with multiple
 parameters

Via-headers are parsed beginning at the first character after the Via-address.
When the address is translated first and its length decreases, the offset to
start parsing at is incorrect and header parameters might be missed.

Update the offset after translating the Via-address to fix this.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_sip.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index eef8f29e8bf8..4ad9cf173992 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -148,7 +148,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
 				    hdr, NULL, &matchoff, &matchlen,
 				    &addr, &port) > 0) {
-		unsigned int matchend, poff, plen, buflen, n;
+		unsigned int olen, matchend, poff, plen, buflen, n;
 		char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
 
 		/* We're only interested in headers related to this
@@ -163,11 +163,12 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 				goto next;
 		}
 
+		olen = *datalen;
 		if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
 			      &addr, port))
 			return NF_DROP;
 
-		matchend = matchoff + matchlen;
+		matchend = matchoff + matchlen + *datalen - olen;
 
 		/* The maddr= parameter (RFC 2361) specifies where to send
 		 * the reply. */
-- 
cgit v1.2.1


From b5ec8eeac46a99004c26791f70b15d001e970acf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 10 Aug 2012 02:22:47 +0000
Subject: ipv4: fix ip_send_skb()

ip_send_skb() can send orphaned skb, so we must pass the net pointer to
avoid possible NULL dereference in error path.

Bug added by commit 3a7c384ffd57 (ipv4: tcp: unicast_sock should not
land outside of TCP stack)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 5 ++---
 net/ipv4/udp.c       | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ec410e08b4b9..147ccc3e93db 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1366,9 +1366,8 @@ out:
 	return skb;
 }
 
-int ip_send_skb(struct sk_buff *skb)
+int ip_send_skb(struct net *net, struct sk_buff *skb)
 {
-	struct net *net = sock_net(skb->sk);
 	int err;
 
 	err = ip_local_out(skb);
@@ -1391,7 +1390,7 @@ int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
 		return 0;
 
 	/* Netfilter gets whole the not fragmented skb. */
-	return ip_send_skb(skb);
+	return ip_send_skb(sock_net(sk), skb);
 }
 
 /*
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b4c3582a991f..6f6d1aca3c3d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -758,7 +758,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
 		uh->check = CSUM_MANGLED_0;
 
 send:
-	err = ip_send_skb(skb);
+	err = ip_send_skb(sock_net(sk), skb);
 	if (err) {
 		if (err == -ENOBUFS && !inet->recverr) {
 			UDP_INC_STATS_USER(sock_net(sk),
-- 
cgit v1.2.1


From 19e303d67dc2e68a7f14b0baf7949195d7327145 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sat, 28 Jul 2012 14:45:50 +0000
Subject: netfilter: PTR_RET can be used

This quiets the coccinelle warnings:

net/bridge/netfilter/ebtable_filter.c:107:1-3: WARNING: PTR_RET can be used
net/bridge/netfilter/ebtable_nat.c:107:1-3: WARNING: PTR_RET can be used
net/ipv6/netfilter/ip6table_filter.c:65:1-3: WARNING: PTR_RET can be used
net/ipv6/netfilter/ip6table_mangle.c:100:1-3: WARNING: PTR_RET can be used
net/ipv6/netfilter/ip6table_raw.c:44:1-3: WARNING: PTR_RET can be used
net/ipv6/netfilter/ip6table_security.c:62:1-3: WARNING: PTR_RET can be used
net/ipv4/netfilter/iptable_filter.c:72:1-3: WARNING: PTR_RET can be used
net/ipv4/netfilter/iptable_mangle.c:107:1-3: WARNING: PTR_RET can be used
net/ipv4/netfilter/iptable_raw.c:51:1-3: WARNING: PTR_RET can be used
net/ipv4/netfilter/iptable_security.c:70:1-3: WARNING: PTR_RET can be used

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/iptable_filter.c   | 4 +---
 net/ipv4/netfilter/iptable_mangle.c   | 4 +---
 net/ipv4/netfilter/iptable_raw.c      | 4 +---
 net/ipv4/netfilter/iptable_security.c | 5 +----
 4 files changed, 4 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 851acec852d2..d20cc374025c 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -69,9 +69,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
 	net->ipv4.iptable_filter =
 		ipt_register_table(net, &packet_filter, repl);
 	kfree(repl);
-	if (IS_ERR(net->ipv4.iptable_filter))
-		return PTR_ERR(net->ipv4.iptable_filter);
-	return 0;
+	return PTR_RET(net->ipv4.iptable_filter);
 }
 
 static void __net_exit iptable_filter_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index aef5d1fbe77d..f38b94288cf5 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -104,9 +104,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
 	net->ipv4.iptable_mangle =
 		ipt_register_table(net, &packet_mangler, repl);
 	kfree(repl);
-	if (IS_ERR(net->ipv4.iptable_mangle))
-		return PTR_ERR(net->ipv4.iptable_mangle);
-	return 0;
+	return PTR_RET(net->ipv4.iptable_mangle);
 }
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 07fb710cd722..b21e2191e2f5 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -48,9 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
 	net->ipv4.iptable_raw =
 		ipt_register_table(net, &packet_raw, repl);
 	kfree(repl);
-	if (IS_ERR(net->ipv4.iptable_raw))
-		return PTR_ERR(net->ipv4.iptable_raw);
-	return 0;
+	return PTR_RET(net->ipv4.iptable_raw);
 }
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index be45bdc4c602..b283d8e2601a 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -66,10 +66,7 @@ static int __net_init iptable_security_net_init(struct net *net)
 	net->ipv4.iptable_security =
 		ipt_register_table(net, &security_table, repl);
 	kfree(repl);
-	if (IS_ERR(net->ipv4.iptable_security))
-		return PTR_ERR(net->ipv4.iptable_security);
-
-	return 0;
+	return PTR_RET(net->ipv4.iptable_security);
 }
 
 static void __net_exit iptable_security_net_exit(struct net *net)
-- 
cgit v1.2.1


From 7bd86cc282a458b66c41e3f6676de6656c99b8db Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Sun, 12 Aug 2012 20:09:59 +0000
Subject: ipv4: Cache local output routes

Commit caacf05e5ad1abf causes big drop of UDP loop back performance.
The cause of the regression is that we do not cache the local output
routes. Each time we send a datagram from unconnected UDP socket,
the kernel allocates a dst_entry and adds it to the rt_uncached_list.
It creates lock contention on the rt_uncached_lock.

Reported-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e4ba974f143c..fd9ecb52c66b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2028,7 +2028,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 		}
 		dev_out = net->loopback_dev;
 		fl4->flowi4_oif = dev_out->ifindex;
-		res.fi = NULL;
 		flags |= RTCF_LOCAL;
 		goto make_route;
 	}
-- 
cgit v1.2.1


From ad5b310228da567e35a2ea5dcb2fd62e3a36654e Mon Sep 17 00:00:00 2001
From: Igor Maravic <igorm@etf.rs>
Date: Mon, 13 Aug 2012 10:26:08 +0200
Subject: net: ipv4: fib_trie: Don't unnecessarily search for already found fib
 leaf

We've already found leaf, don't search for it again. Same is for fib leaf info.

Signed-off-by: Igor Maravic <igorm@etf.rs>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index f84a0e90d675..4587d344046d 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1656,7 +1656,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	if (!l)
 		return -ESRCH;
 
-	fa_head = get_fa_head(l, plen);
+	li = find_leaf_info(l, plen);
+
+	if (!li)
+		return -ESRCH;
+
+	fa_head = &li->falh;
 	fa = fib_find_alias(fa_head, tos, 0);
 
 	if (!fa)
@@ -1692,9 +1697,6 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 
-	l = fib_find_node(t, key);
-	li = find_leaf_info(l, plen);
-
 	list_del_rcu(&fa->fa_list);
 
 	if (!plen)
-- 
cgit v1.2.1


From a7cb5a49bf64ba64864ae16a6be028f8b0d3cc06 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 24 May 2012 01:10:10 -0600
Subject: userns: Print out socket uids in a user namespace aware fashion.

Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Sridhar Samudrala <sri@us.ibm.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 net/ipv4/ping.c     | 4 +++-
 net/ipv4/raw.c      | 4 +++-
 net/ipv4/tcp_ipv4.c | 6 +++---
 net/ipv4/udp.c      | 4 +++-
 4 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 6232d476f37e..bee5eeb676f8 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -845,7 +845,9 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f,
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
-		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		0, 0L, 0,
+		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
 		atomic_read(&sp->sk_drops), len);
 }
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ff0f071969ea..f2425785d40a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -992,7 +992,9 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 		i, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
-		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		0, 0L, 0,
+		from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 42b2a6a73092..642be8a4c6a3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2382,7 +2382,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
 EXPORT_SYMBOL(tcp_proc_unregister);
 
 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
-			 struct seq_file *f, int i, int uid, int *len)
+			 struct seq_file *f, int i, kuid_t uid, int *len)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	int ttd = req->expires - jiffies;
@@ -2399,7 +2399,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
 		1,    /* timers active (only the expire timer) */
 		jiffies_to_clock_t(ttd),
 		req->retrans,
-		uid,
+		from_kuid_munged(seq_user_ns(f), uid),
 		0,  /* non standard timer */
 		0, /* open_requests have no inode */
 		atomic_read(&sk->sk_refcnt),
@@ -2450,7 +2450,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		timer_active,
 		jiffies_to_clock_t(timer_expires - jiffies),
 		icsk->icsk_retransmits,
-		sock_i_uid(sk),
+		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
 		icsk->icsk_probes_out,
 		sock_i_ino(sk),
 		atomic_read(&sk->sk_refcnt), sk,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b4c3582a991f..53b89817c008 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2110,7 +2110,9 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
-		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		0, 0L, 0,
+		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
 		atomic_read(&sp->sk_drops), len);
 }
-- 
cgit v1.2.1


From 7064d16e162adf8199f0288b694e6af823ed5431 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 24 May 2012 10:34:21 -0600
Subject: userns: Use kgids for sysctl_ping_group_range

- Store sysctl_ping_group_range as a paire of kgid_t values
  instead of a pair of gid_t values.
- Move the kgid conversion work from ping_init_sock into ipv4_ping_group_range
- For invalid cases reset to the default disabled state.

With the kgid_t conversion made part of the original value sanitation
from userspace understand how the code will react becomes clearer
and it becomes possible to set the sysctl ping group range from
something other than the initial user namespace.

Cc: Vasiliy Kulikov <segoon@openwall.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 net/ipv4/ping.c            | 18 ++++++------------
 net/ipv4/sysctl_net_ipv4.c | 42 +++++++++++++++++++++++++++---------------
 2 files changed, 33 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index bee5eeb676f8..8f3d05424a3e 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -185,10 +185,10 @@ exit:
 	return sk;
 }
 
-static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
-					  gid_t *high)
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+					  kgid_t *high)
 {
-	gid_t *data = net->ipv4.sysctl_ping_group_range;
+	kgid_t *data = net->ipv4.sysctl_ping_group_range;
 	unsigned int seq;
 
 	do {
@@ -203,19 +203,13 @@ static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
 static int ping_init_sock(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
-	gid_t group = current_egid();
-	gid_t range[2];
+	kgid_t group = current_egid();
 	struct group_info *group_info = get_current_groups();
 	int i, j, count = group_info->ngroups;
 	kgid_t low, high;
 
-	inet_get_ping_group_range_net(net, range, range+1);
-	low = make_kgid(&init_user_ns, range[0]);
-	high = make_kgid(&init_user_ns, range[1]);
-	if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low))
-		return -EACCES;
-
-	if (range[0] <= group && group <= range[1])
+	inet_get_ping_group_range_net(net, &low, &high);
+	if (gid_lte(low, group) && gid_lte(group, high))
 		return 0;
 
 	for (i = 0; i < group_info->nblocks; i++) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1b5ce96707a3..3e78c79b5586 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -76,9 +76,9 @@ static int ipv4_local_port_range(ctl_table *table, int write,
 }
 
 
-static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
+static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
 {
-	gid_t *data = table->data;
+	kgid_t *data = table->data;
 	unsigned int seq;
 	do {
 		seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -89,12 +89,12 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low,
 }
 
 /* Update system visible IP port range */
-static void set_ping_group_range(struct ctl_table *table, gid_t range[2])
+static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
 {
-	gid_t *data = table->data;
+	kgid_t *data = table->data;
 	write_seqlock(&sysctl_local_ports.lock);
-	data[0] = range[0];
-	data[1] = range[1];
+	data[0] = low;
+	data[1] = high;
 	write_sequnlock(&sysctl_local_ports.lock);
 }
 
@@ -103,21 +103,33 @@ static int ipv4_ping_group_range(ctl_table *table, int write,
 				 void __user *buffer,
 				 size_t *lenp, loff_t *ppos)
 {
+	struct user_namespace *user_ns = current_user_ns();
 	int ret;
-	gid_t range[2];
+	gid_t urange[2];
+	kgid_t low, high;
 	ctl_table tmp = {
-		.data = &range,
-		.maxlen = sizeof(range),
+		.data = &urange,
+		.maxlen = sizeof(urange),
 		.mode = table->mode,
 		.extra1 = &ip_ping_group_range_min,
 		.extra2 = &ip_ping_group_range_max,
 	};
 
-	inet_get_ping_group_range_table(table, range, range + 1);
+	inet_get_ping_group_range_table(table, &low, &high);
+	urange[0] = from_kgid_munged(user_ns, low);
+	urange[1] = from_kgid_munged(user_ns, high);
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
-	if (write && ret == 0)
-		set_ping_group_range(table, range);
+	if (write && ret == 0) {
+		low = make_kgid(user_ns, urange[0]);
+		high = make_kgid(user_ns, urange[1]);
+		if (!gid_valid(low) || !gid_valid(high) ||
+		    (urange[1] < urange[0]) || gid_lt(high, low)) {
+			low = make_kgid(&init_user_ns, 1);
+			high = make_kgid(&init_user_ns, 0);
+		}
+		set_ping_group_range(table, low, high);
+	}
 
 	return ret;
 }
@@ -786,7 +798,7 @@ static struct ctl_table ipv4_net_table[] = {
 	{
 		.procname	= "ping_group_range",
 		.data		= &init_net.ipv4.sysctl_ping_group_range,
-		.maxlen		= sizeof(init_net.ipv4.sysctl_ping_group_range),
+		.maxlen		= sizeof(gid_t)*2,
 		.mode		= 0644,
 		.proc_handler	= ipv4_ping_group_range,
 	},
@@ -830,8 +842,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	 * Sane defaults - nobody may create ping sockets.
 	 * Boot scripts should set this to distro-specific group.
 	 */
-	net->ipv4.sysctl_ping_group_range[0] = 1;
-	net->ipv4.sysctl_ping_group_range[1] = 0;
+	net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
+	net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
 
 	tcp_init_mem(net);
 
-- 
cgit v1.2.1


From d06ca9564350184a19b5aae9ac150f1b1306de29 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 24 May 2012 17:58:08 -0600
Subject: userns: Teach inet_diag to work with user namespaces

Compute the user namespace of the socket that we are replying to
and translate the kuids of reported sockets into that user namespace.

Cc: Andrew Vagin <avagin@openvz.org>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 net/ipv4/inet_diag.c | 21 +++++++++++++++------
 net/ipv4/udp_diag.c  |  5 ++++-
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 570e61f9611f..8bc005b1435f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -69,6 +69,7 @@ static inline void inet_diag_unlock_handler(
 
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
+			      struct user_namespace *user_ns,		      	
 			      u32 pid, u32 seq, u16 nlmsg_flags,
 			      const struct nlmsghdr *unlh)
 {
@@ -124,7 +125,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 #endif
 
-	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = sock_i_ino(sk);
 
 	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -199,11 +200,12 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
 
 static int inet_csk_diag_fill(struct sock *sk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
+			      struct user_namespace *user_ns,
 			      u32 pid, u32 seq, u16 nlmsg_flags,
 			      const struct nlmsghdr *unlh)
 {
 	return inet_sk_diag_fill(sk, inet_csk(sk),
-			skb, req, pid, seq, nlmsg_flags, unlh);
+			skb, req, user_ns, pid, seq, nlmsg_flags, unlh);
 }
 
 static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
@@ -256,14 +258,16 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
-			struct inet_diag_req_v2 *r, u32 pid, u32 seq, u16 nlmsg_flags,
+			struct inet_diag_req_v2 *r,
+			struct user_namespace *user_ns,
+			u32 pid, u32 seq, u16 nlmsg_flags,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
 					   skb, r, pid, seq, nlmsg_flags,
 					   unlh);
-	return inet_csk_diag_fill(sk, skb, r, pid, seq, nlmsg_flags, unlh);
+	return inet_csk_diag_fill(sk, skb, r, user_ns, pid, seq, nlmsg_flags, unlh);
 }
 
 int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -311,6 +315,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 	}
 
 	err = sk_diag_fill(sk, rep, req,
+			   sk_user_ns(NETLINK_CB(in_skb).ssk),
 			   NETLINK_CB(in_skb).pid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
@@ -551,6 +556,7 @@ static int inet_csk_diag_dump(struct sock *sk,
 		return 0;
 
 	return inet_csk_diag_fill(sk, skb, r,
+				  sk_user_ns(NETLINK_CB(cb->skb).ssk),
 				  NETLINK_CB(cb->skb).pid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -591,7 +597,9 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 }
 
 static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
-			      struct request_sock *req, u32 pid, u32 seq,
+			      struct request_sock *req,
+			      struct user_namespace *user_ns,
+			      u32 pid, u32 seq,
 			      const struct nlmsghdr *unlh)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -625,7 +633,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	r->idiag_expires = jiffies_to_msecs(tmo);
 	r->idiag_rqueue = 0;
 	r->idiag_wqueue = 0;
-	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = 0;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (r->idiag_family == AF_INET6) {
@@ -702,6 +710,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 			}
 
 			err = inet_diag_fill_req(skb, sk, req,
+					       sk_user_ns(NETLINK_CB(cb->skb).ssk),
 					       NETLINK_CB(cb->skb).pid,
 					       cb->nlh->nlmsg_seq, cb->nlh);
 			if (err < 0) {
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 16d0960062be..d2f336ea82ca 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -24,7 +24,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 	if (!inet_diag_bc_sk(bc, sk))
 		return 0;
 
-	return inet_sk_diag_fill(sk, NULL, skb, req, NETLINK_CB(cb->skb).pid,
+	return inet_sk_diag_fill(sk, NULL, skb, req,
+			sk_user_ns(NETLINK_CB(cb->skb).ssk),
+			NETLINK_CB(cb->skb).pid,
 			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
@@ -69,6 +71,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 		goto out;
 
 	err = inet_sk_diag_fill(sk, NULL, rep, req,
+			   sk_user_ns(NETLINK_CB(in_skb).ssk),
 			   NETLINK_CB(in_skb).pid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
-- 
cgit v1.2.1


From 1b3a6926fc3430116c97c333f7cf901b86779008 Mon Sep 17 00:00:00 2001
From: Razvan Ghitulete <rghitulete@ixiacom.com>
Date: Tue, 14 Aug 2012 16:30:20 +0300
Subject: net: remove wrong initialization for snd_wl1

The field tp->snd_wl1 is twice initialized, the second time
seems to be wrong as it may overwrite any update in tcp_ack.

Signed-off-by: Razvan Ghitulete <rghitulete@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fa2c2c2cac2d..c423317eb719 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5744,7 +5744,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		TCP_ECN_rcv_synack(tp, th);
 
-		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
 
 		/* Ok.. it's good. Set up sequence numbers and
@@ -5757,7 +5757,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 * never scaled.
 		 */
 		tp->snd_wnd = ntohs(th->window);
-		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 
 		if (!tp->rx_opt.wscale_ok) {
 			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
-- 
cgit v1.2.1


From fae6ef87faeb8853896920c68ee703d715799d28 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sun, 19 Aug 2012 03:30:38 +0000
Subject: net: tcp: move sk_rx_dst_set call after tcp_create_openreq_child()

This commit removes the sk_rx_dst_set calls from
tcp_create_openreq_child(), because at that point the icsk_af_ops
field of ipv6_mapped TCP sockets has not been set to its proper final
value.

Instead, to make sure we get the right sk_rx_dst_set variant
appropriate for the address family of the new connection, we have
tcp_v{4,6}_syn_recv_sock() directly call the appropriate function
shortly after the call to tcp_create_openreq_child() returns.

This also moves inet6_sk_rx_dst_set() to avoid a forward declaration
with the new approach.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reported-by: Artem Savkov <artem.savkov@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c      | 1 +
 net/ipv4/tcp_minisocks.c | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 767823764016..5bf2040b25b1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1462,6 +1462,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		goto exit_nonewsk;
 
 	newsk->sk_gso_type = SKB_GSO_TCPV4;
+	inet_sk_rx_dst_set(newsk, skb);
 
 	newtp		      = tcp_sk(newsk);
 	newinet		      = inet_sk(newsk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d9c9dcef2de3..6ff7f10dce9d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -387,8 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
-		newicsk->icsk_af_ops->sk_rx_dst_set(newsk, skb);
-
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
 		 * copy any s_data_payload stored at the original socket.
-- 
cgit v1.2.1


From 203b42f7317494ae5e5efc7be6fb7f29c927f102 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Aug 2012 13:18:23 -0700
Subject: workqueue: make deferrable delayed_work initializer names consistent

Initalizers for deferrable delayed_work are confused.

* __DEFERRED_WORK_INITIALIZER()
* DECLARE_DEFERRED_WORK()
* INIT_DELAYED_WORK_DEFERRABLE()

Rename them to

* __DEFERRABLE_WORK_INITIALIZER()
* DECLARE_DEFERRABLE_WORK()
* INIT_DEFERRABLE_WORK()

This patch doesn't cause any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 net/ipv4/inetpeer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index e1e0a4e8fd34..7b55c8648d4b 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -194,7 +194,7 @@ void __init inet_initpeers(void)
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
 			NULL);
 
-	INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker);
+	INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
 }
 
 static int addr_compare(const struct inetpeer_addr *a,
-- 
cgit v1.2.1


From 144d56e91044181ec0ef67aeca91e9a8b5718348 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 20 Aug 2012 00:22:46 +0000
Subject: tcp: fix possible socket refcount problem

Commit 6f458dfb40 (tcp: improve latencies of timer triggered events)
added bug leading to following trace :

[ 2866.131281] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000
[ 2866.131726]
[ 2866.132188] =========================
[ 2866.132281] [ BUG: held lock freed! ]
[ 2866.132281] 3.6.0-rc1+ #622 Not tainted
[ 2866.132281] -------------------------
[ 2866.132281] kworker/0:1/652 is freeing memory ffff880019ec0000-ffff880019ec0a1f, with a lock still held there!
[ 2866.132281]  (sk_lock-AF_INET-RPC){+.+...}, at: [<ffffffff81903619>] tcp_sendmsg+0x29/0xcc6
[ 2866.132281] 4 locks held by kworker/0:1/652:
[ 2866.132281]  #0:  (rpciod){.+.+.+}, at: [<ffffffff81083567>] process_one_work+0x1de/0x47f
[ 2866.132281]  #1:  ((&task->u.tk_work)){+.+.+.}, at: [<ffffffff81083567>] process_one_work+0x1de/0x47f
[ 2866.132281]  #2:  (sk_lock-AF_INET-RPC){+.+...}, at: [<ffffffff81903619>] tcp_sendmsg+0x29/0xcc6
[ 2866.132281]  #3:  (&icsk->icsk_retransmit_timer){+.-...}, at: [<ffffffff81078017>] run_timer_softirq+0x1ad/0x35f
[ 2866.132281]
[ 2866.132281] stack backtrace:
[ 2866.132281] Pid: 652, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #622
[ 2866.132281] Call Trace:
[ 2866.132281]  <IRQ>  [<ffffffff810bc527>] debug_check_no_locks_freed+0x112/0x159
[ 2866.132281]  [<ffffffff818a0839>] ? __sk_free+0xfd/0x114
[ 2866.132281]  [<ffffffff811549fa>] kmem_cache_free+0x6b/0x13a
[ 2866.132281]  [<ffffffff818a0839>] __sk_free+0xfd/0x114
[ 2866.132281]  [<ffffffff818a08c0>] sk_free+0x1c/0x1e
[ 2866.132281]  [<ffffffff81911e1c>] tcp_write_timer+0x51/0x56
[ 2866.132281]  [<ffffffff81078082>] run_timer_softirq+0x218/0x35f
[ 2866.132281]  [<ffffffff81078017>] ? run_timer_softirq+0x1ad/0x35f
[ 2866.132281]  [<ffffffff810f5831>] ? rb_commit+0x58/0x85
[ 2866.132281]  [<ffffffff81911dcb>] ? tcp_write_timer_handler+0x148/0x148
[ 2866.132281]  [<ffffffff81070bd6>] __do_softirq+0xcb/0x1f9
[ 2866.132281]  [<ffffffff81a0a00c>] ? _raw_spin_unlock+0x29/0x2e
[ 2866.132281]  [<ffffffff81a1227c>] call_softirq+0x1c/0x30
[ 2866.132281]  [<ffffffff81039f38>] do_softirq+0x4a/0xa6
[ 2866.132281]  [<ffffffff81070f2b>] irq_exit+0x51/0xad
[ 2866.132281]  [<ffffffff81a129cd>] do_IRQ+0x9d/0xb4
[ 2866.132281]  [<ffffffff81a0a3ef>] common_interrupt+0x6f/0x6f
[ 2866.132281]  <EOI>  [<ffffffff8109d006>] ? sched_clock_cpu+0x58/0xd1
[ 2866.132281]  [<ffffffff81a0a172>] ? _raw_spin_unlock_irqrestore+0x4c/0x56
[ 2866.132281]  [<ffffffff81078692>] mod_timer+0x178/0x1a9
[ 2866.132281]  [<ffffffff818a00aa>] sk_reset_timer+0x19/0x26
[ 2866.132281]  [<ffffffff8190b2cc>] tcp_rearm_rto+0x99/0xa4
[ 2866.132281]  [<ffffffff8190dfba>] tcp_event_new_data_sent+0x6e/0x70
[ 2866.132281]  [<ffffffff8190f7ea>] tcp_write_xmit+0x7de/0x8e4
[ 2866.132281]  [<ffffffff818a565d>] ? __alloc_skb+0xa0/0x1a1
[ 2866.132281]  [<ffffffff8190f952>] __tcp_push_pending_frames+0x2e/0x8a
[ 2866.132281]  [<ffffffff81904122>] tcp_sendmsg+0xb32/0xcc6
[ 2866.132281]  [<ffffffff819229c2>] inet_sendmsg+0xaa/0xd5
[ 2866.132281]  [<ffffffff81922918>] ? inet_autobind+0x5f/0x5f
[ 2866.132281]  [<ffffffff810ee7f1>] ? trace_clock_local+0x9/0xb
[ 2866.132281]  [<ffffffff8189adab>] sock_sendmsg+0xa3/0xc4
[ 2866.132281]  [<ffffffff810f5de6>] ? rb_reserve_next_event+0x26f/0x2d5
[ 2866.132281]  [<ffffffff8103e6a9>] ? native_sched_clock+0x29/0x6f
[ 2866.132281]  [<ffffffff8103e6f8>] ? sched_clock+0x9/0xd
[ 2866.132281]  [<ffffffff810ee7f1>] ? trace_clock_local+0x9/0xb
[ 2866.132281]  [<ffffffff8189ae03>] kernel_sendmsg+0x37/0x43
[ 2866.132281]  [<ffffffff8199ce49>] xs_send_kvec+0x77/0x80
[ 2866.132281]  [<ffffffff8199cec1>] xs_sendpages+0x6f/0x1a0
[ 2866.132281]  [<ffffffff8107826d>] ? try_to_del_timer_sync+0x55/0x61
[ 2866.132281]  [<ffffffff8199d0d2>] xs_tcp_send_request+0x55/0xf1
[ 2866.132281]  [<ffffffff8199bb90>] xprt_transmit+0x89/0x1db
[ 2866.132281]  [<ffffffff81999bcd>] ? call_connect+0x3c/0x3c
[ 2866.132281]  [<ffffffff81999d92>] call_transmit+0x1c5/0x20e
[ 2866.132281]  [<ffffffff819a0d55>] __rpc_execute+0x6f/0x225
[ 2866.132281]  [<ffffffff81999bcd>] ? call_connect+0x3c/0x3c
[ 2866.132281]  [<ffffffff819a0f33>] rpc_async_schedule+0x28/0x34
[ 2866.132281]  [<ffffffff810835d6>] process_one_work+0x24d/0x47f
[ 2866.132281]  [<ffffffff81083567>] ? process_one_work+0x1de/0x47f
[ 2866.132281]  [<ffffffff819a0f0b>] ? __rpc_execute+0x225/0x225
[ 2866.132281]  [<ffffffff81083a6d>] worker_thread+0x236/0x317
[ 2866.132281]  [<ffffffff81083837>] ? process_scheduled_works+0x2f/0x2f
[ 2866.132281]  [<ffffffff8108b7b8>] kthread+0x9a/0xa2
[ 2866.132281]  [<ffffffff81a12184>] kernel_thread_helper+0x4/0x10
[ 2866.132281]  [<ffffffff81a0a4b0>] ? retint_restore_args+0x13/0x13
[ 2866.132281]  [<ffffffff8108b71e>] ? __init_kthread_worker+0x5a/0x5a
[ 2866.132281]  [<ffffffff81a12180>] ? gs_change+0x13/0x13
[ 2866.308506] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000
[ 2866.309689] =============================================================================
[ 2866.310254] BUG TCP (Not tainted): Object already free
[ 2866.310254] -----------------------------------------------------------------------------
[ 2866.310254]

The bug comes from the fact that timer set in sk_reset_timer() can run
before we actually do the sock_hold(). socket refcount reaches zero and
we free the socket too soon.

timer handler is not allowed to reduce socket refcnt if socket is owned
by the user, or we need to change sk_reset_timer() implementation.

We should take a reference on the socket in case TCP_DELACK_TIMER_DEFERRED
or TCP_DELACK_TIMER_DEFERRED bit are set in tsq_flags

Also fix a typo in tcp_delack_timer(), where TCP_WRITE_TIMER_DEFERRED
was used instead of TCP_DELACK_TIMER_DEFERRED.

For consistency, use same socket refcount change for TCP_MTU_REDUCED_DEFERRED,
even if not fired from a timer.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c   |  8 +++++---
 net/ipv4/tcp_output.c | 14 +++++++++-----
 net/ipv4/tcp_timer.c  |  6 ++++--
 3 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5bf2040b25b1..00a748d14062 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -417,10 +417,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 			tp->mtu_info = info;
-			if (!sock_owned_by_user(sk))
+			if (!sock_owned_by_user(sk)) {
 				tcp_v4_mtu_reduced(sk);
-			else
-				set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
+			} else {
+				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+					sock_hold(sk);
+			}
 			goto out;
 		}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20dfd892c86f..d04632673a9e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -910,14 +910,18 @@ void tcp_release_cb(struct sock *sk)
 	if (flags & (1UL << TCP_TSQ_DEFERRED))
 		tcp_tsq_handler(sk);
 
-	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED))
+	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
 		tcp_write_timer_handler(sk);
-
-	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
+		__sock_put(sk);
+	}
+	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
 		tcp_delack_timer_handler(sk);
-
-	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED))
+		__sock_put(sk);
+	}
+	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
 		sk->sk_prot->mtu_reduced(sk);
+		__sock_put(sk);
+	}
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6df36ad55a38..b774a03bd1dc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -252,7 +252,8 @@ static void tcp_delack_timer(unsigned long data)
 		inet_csk(sk)->icsk_ack.blocked = 1;
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
 		/* deleguate our work to tcp_release_cb() */
-		set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+			sock_hold(sk);
 	}
 	bh_unlock_sock(sk);
 	sock_put(sk);
@@ -481,7 +482,8 @@ static void tcp_write_timer(unsigned long data)
 		tcp_write_timer_handler(sk);
 	} else {
 		/* deleguate our work to tcp_release_cb() */
-		set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+			sock_hold(sk);
 	}
 	bh_unlock_sock(sk);
 	sock_put(sk);
-- 
cgit v1.2.1


From 1a7b27c97ce675b42eeb7bfaf6e15c34f35c8f95 Mon Sep 17 00:00:00 2001
From: Christoph Paasch <christoph.paasch@uclouvain.be>
Date: Mon, 20 Aug 2012 02:52:09 +0000
Subject: ipv4: Use newinet->inet_opt in inet_csk_route_child_sock()

Since 0e734419923bd ("ipv4: Use inet_csk_route_child_sock() in DCCP and
TCP."), inet_csk_route_child_sock() is called instead of
inet_csk_route_req().

However, after creating the child-sock in tcp/dccp_v4_syn_recv_sock(),
ireq->opt is set to NULL, before calling inet_csk_route_child_sock().
Thus, inside inet_csk_route_child_sock() opt is always NULL and the
SRR-options are not respected anymore.
Packets sent by the server won't have the correct destination-IP.

This patch fixes it by accessing newinet->inet_opt instead of ireq->opt
inside inet_csk_route_child_sock().

Reported-by: Luca Boccassi <luca.boccassi@gmail.com>
Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index db0cf17c00f7..7f75f21d7b83 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -404,12 +404,15 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct inet_sock *newinet = inet_sk(newsk);
-	struct ip_options_rcu *opt = ireq->opt;
+	struct ip_options_rcu *opt;
 	struct net *net = sock_net(sk);
 	struct flowi4 *fl4;
 	struct rtable *rt;
 
 	fl4 = &newinet->cork.fl.u.ip4;
+
+	rcu_read_lock();
+	opt = rcu_dereference(newinet->inet_opt);
 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
@@ -421,11 +424,13 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 		goto no_route;
 	if (opt && opt->opt.is_strictroute && rt->rt_gateway)
 		goto route_err;
+	rcu_read_unlock();
 	return &rt->dst;
 
 route_err:
 	ip_rt_put(rt);
 no_route:
+	rcu_read_unlock();
 	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
 	return NULL;
 }
-- 
cgit v1.2.1


From a9915a1b52df52ad87f3b33422da95cf25372f09 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 20 Aug 2012 07:26:45 +0000
Subject: ipv4: fix ip header ident selection in __ip_make_skb()

Christian Casteyde reported a kmemcheck 32-bit read from uninitialized
memory in __ip_select_ident().

It turns out that __ip_make_skb() called ip_select_ident() before
properly initializing iph->daddr.

This is a bug uncovered by commit 1d861aa4b3fb (inet: Minimize use of
cached route inetpeer.)

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=46131

Reported-by: Christian Casteyde <casteyde.christian@free.fr>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 147ccc3e93db..c196d749daf2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1338,10 +1338,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	iph->ihl = 5;
 	iph->tos = inet->tos;
 	iph->frag_off = df;
-	ip_select_ident(iph, &rt->dst, sk);
 	iph->ttl = ttl;
 	iph->protocol = sk->sk_protocol;
 	ip_copy_addrs(iph, fl4);
+	ip_select_ident(iph, &rt->dst, sk);
 
 	if (opt) {
 		iph->ihl += opt->optlen>>2;
-- 
cgit v1.2.1


From 90efbed18a30d78145419cdbd44f9ec152efeb16 Mon Sep 17 00:00:00 2001
From: Jean Sacren <sakiwit@gmail.com>
Date: Sun, 19 Aug 2012 15:11:32 +0000
Subject: netfilter: remove unnecessary goto statement for error recovery

Usually it's a good practice to use goto statement for error recovery
when initializing the module. This approach could be an overkill if:

 1) there is only one fail case;
 2) success and failure use the same return statement.

For a cleaner approach, remove the unnecessary goto statement and
directly implement error recovery.

Signed-off-by: Jean Sacren <sakiwit@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/iptable_filter.c | 6 +-----
 net/ipv4/netfilter/iptable_mangle.c | 6 +-----
 net/ipv4/netfilter/iptable_raw.c    | 6 +-----
 3 files changed, 3 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index d20cc374025c..6b3da5cf54e9 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -94,14 +94,10 @@ static int __init iptable_filter_init(void)
 	filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
 	if (IS_ERR(filter_ops)) {
 		ret = PTR_ERR(filter_ops);
-		goto cleanup_table;
+		unregister_pernet_subsys(&iptable_filter_net_ops);
 	}
 
 	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&iptable_filter_net_ops);
-	return ret;
 }
 
 static void __exit iptable_filter_fini(void)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index f38b94288cf5..85d88f206447 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -129,14 +129,10 @@ static int __init iptable_mangle_init(void)
 	mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
 	if (IS_ERR(mangle_ops)) {
 		ret = PTR_ERR(mangle_ops);
-		goto cleanup_table;
+		unregister_pernet_subsys(&iptable_mangle_net_ops);
 	}
 
 	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&iptable_mangle_net_ops);
-	return ret;
 }
 
 static void __exit iptable_mangle_fini(void)
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index b21e2191e2f5..03d9696d3c6e 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -73,14 +73,10 @@ static int __init iptable_raw_init(void)
 	rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
 	if (IS_ERR(rawtable_ops)) {
 		ret = PTR_ERR(rawtable_ops);
-		goto cleanup_table;
+		unregister_pernet_subsys(&iptable_raw_net_ops);
 	}
 
 	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&iptable_raw_net_ops);
-	return ret;
 }
 
 static void __exit iptable_raw_fini(void)
-- 
cgit v1.2.1


From 9b04f350057863d1fad1ba071e09362a1da3503e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 21 Aug 2012 20:48:29 +0000
Subject: ipv4: properly update pmtu

Sylvain Munault reported following info :

 - TCP connection get "stuck" with data in send queue when doing
   "large" transfers ( like typing 'ps ax' on a ssh connection )
 - Only happens on path where the PMTU is lower than the MTU of
   the interface
 - Is not present right after boot, it only appears 10-20min after
   boot or so. (and that's inside the _same_ TCP connection, it works
   fine at first and then in the same ssh session, it'll get stuck)
 - Definitely seems related to fragments somehow since I see a router
   sending ICMP message saying fragmentation is needed.
 - Exact same setup works fine with kernel 3.5.1

Problem happens when the 10 minutes (ip_rt_mtu_expires) expiration
period is over.

ip_rt_update_pmtu() calls dst_set_expires() to rearm a new expiration,
but dst_set_expires() does nothing because dst.expires is already set.

It seems we want to set the expires field to a new value, regardless
of prior one.

With help from Julian Anastasov.

Reported-by: Sylvain Munaut <s.munaut@whatever-company.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
CC: Julian Anastasov <ja@ssi.bg>
Tested-by: Sylvain Munaut <s.munaut@whatever-company.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fd9ecb52c66b..8c8c748ebb28 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -956,7 +956,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 		dst->obsolete = DST_OBSOLETE_KILL;
 	} else {
 		rt->rt_pmtu = mtu;
-		dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+		rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
 	}
 }
 
-- 
cgit v1.2.1


From 0115e8e30d6fcdd4b8faa30d3ffd90859a591f51 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 22 Aug 2012 17:19:46 +0000
Subject: net: remove delay at device dismantle

I noticed extra one second delay in device dismantle, tracked down to
a call to dst_dev_event() while some call_rcu() are still in RCU queues.

These call_rcu() were posted by rt_free(struct rtable *rt) calls.

We then wait a little (but one second) in netdev_wait_allrefs() before
kicking again NETDEV_UNREGISTER.

As the call_rcu() are now completed, dst_dev_event() can do the needed
device swap on busy dst.

To solve this problem, add a new NETDEV_UNREGISTER_FINAL, called
after a rcu_barrier(), but outside of RTNL lock.

Use NETDEV_UNREGISTER_FINAL with care !

Change dst_dev_event() handler to react to NETDEV_UNREGISTER_FINAL

Also remove NETDEV_UNREGISTER_BATCH, as its not used anymore after
IP cache removal.

With help from Gao feng

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c      | 6 +++++-
 net/ipv4/fib_frontend.c | 8 ++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index adf273f8ad2e..6a5e6e4b142c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1147,8 +1147,12 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 			 void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct in_device *in_dev;
 
+	if (event == NETDEV_UNREGISTER_FINAL)
+		goto out;
+
+	in_dev = __in_dev_get_rtnl(dev);
 	ASSERT_RTNL();
 
 	if (!in_dev) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7f073a38c87d..fd7d9ae64f16 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1041,7 +1041,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct in_device *in_dev;
 	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_UNREGISTER) {
@@ -1050,9 +1050,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		return NOTIFY_DONE;
 	}
 
-	if (!in_dev)
+	if (event == NETDEV_UNREGISTER_FINAL)
 		return NOTIFY_DONE;
 
+	in_dev = __in_dev_get_rtnl(dev);
+
 	switch (event) {
 	case NETDEV_UP:
 		for_ifa(in_dev) {
@@ -1071,8 +1073,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 	case NETDEV_CHANGE:
 		rt_cache_flush(dev_net(dev), 0);
 		break;
-	case NETDEV_UNREGISTER_BATCH:
-		break;
 	}
 	return NOTIFY_DONE;
 }
-- 
cgit v1.2.1


From 748e2d9396a18c3fd3d07d47c1b41320acf1fbf4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 22 Aug 2012 21:50:59 +0000
Subject: net: reinstate rtnl in call_netdevice_notifiers()

Eric Biederman pointed out that not holding RTNL while calling
call_netdevice_notifiers() was racy.

This patch is a direct transcription his feedback
against commit 0115e8e30d6fc (net: remove delay at device dismantle)

Thanks Eric !

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Gao feng <gaofeng@cn.fujitsu.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c      | 6 +-----
 net/ipv4/fib_frontend.c | 7 ++-----
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 6a5e6e4b142c..adf273f8ad2e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1147,12 +1147,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 			 void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev;
-
-	if (event == NETDEV_UNREGISTER_FINAL)
-		goto out;
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 
-	in_dev = __in_dev_get_rtnl(dev);
 	ASSERT_RTNL();
 
 	if (!in_dev) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index fd7d9ae64f16..acdee325d972 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1050,9 +1050,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		return NOTIFY_DONE;
 	}
 
-	if (event == NETDEV_UNREGISTER_FINAL)
-		return NOTIFY_DONE;
-
 	in_dev = __in_dev_get_rtnl(dev);
 
 	switch (event) {
@@ -1064,14 +1061,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev), -1);
+		rt_cache_flush(net, -1);
 		break;
 	case NETDEV_DOWN:
 		fib_disable_ip(dev, 0, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
-		rt_cache_flush(dev_net(dev), 0);
+		rt_cache_flush(net, 0);
 		break;
 	}
 	return NOTIFY_DONE;
-- 
cgit v1.2.1


From 78df76a065ae3b5dbcb9a29912adc02f697de498 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 24 Aug 2012 05:40:47 +0000
Subject: ipv4: take rt_uncached_lock only if needed

Multicast traffic allocates dst with DST_NOCACHE, but dst is
not inserted into rt_uncached_list.

This slowdown multicast workloads on SMP because rt_uncached_lock is
contended.

Change the test before taking the lock to actually check the dst
was inserted into rt_uncached_list.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8c8c748ebb28..24fd4c596643 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1263,7 +1263,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
-	if (dst->flags & DST_NOCACHE) {
+	if (!list_empty(&rt->rt_uncached)) {
 		spin_lock_bh(&rt_uncached_lock);
 		list_del(&rt->rt_uncached);
 		spin_unlock_bh(&rt_uncached_lock);
-- 
cgit v1.2.1


From 7c4a56fec379ac0d7754e0d4da6a7361f1a4fe64 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 23 Aug 2012 07:05:17 +0000
Subject: tcp: fix cwnd reduction for non-sack recovery

The cwnd reduction in fast recovery is based on the number of packets
newly delivered per ACK. For non-sack connections every DUPACK
signifies a packet has been delivered, but the sender mistakenly
skips counting them for cwnd reduction.

The fix is to compute newly_acked_sacked after DUPACKs are accounted
in sacked_out for non-sack connections.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 85308b90df80..6e38c6c23caa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2926,13 +2926,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
  * tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
-				  int newly_acked_sacked, bool is_dupack,
+				  int prior_sacked, bool is_dupack,
 				  int flag)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
 				    (tcp_fackets_out(tp) > tp->reordering));
+	int newly_acked_sacked = 0;
 	int fast_rexmit = 0;
 
 	if (WARN_ON(!tp->packets_out && tp->sacked_out))
@@ -2992,6 +2993,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 				tcp_add_reno_sack(sk);
 		} else
 			do_lost = tcp_try_undo_partial(sk, pkts_acked);
+		newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
 		break;
 	case TCP_CA_Loss:
 		if (flag & FLAG_DATA_ACKED)
@@ -3013,6 +3015,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 			if (is_dupack)
 				tcp_add_reno_sack(sk);
 		}
+		newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
 
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
@@ -3590,7 +3593,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets;
 	int prior_sacked = tp->sacked_out;
 	int pkts_acked = 0;
-	int newly_acked_sacked = 0;
 	bool frto_cwnd = false;
 
 	/* If the ack is older than previous acks
@@ -3666,8 +3668,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
 
 	pkts_acked = prior_packets - tp->packets_out;
-	newly_acked_sacked = (prior_packets - prior_sacked) -
-			     (tp->packets_out - tp->sacked_out);
 
 	if (tp->frto_counter)
 		frto_cwnd = tcp_process_frto(sk, flag);
@@ -3681,7 +3681,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		    tcp_may_raise_cwnd(sk, flag))
 			tcp_cong_avoid(sk, ack, prior_in_flight);
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+		tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 				      is_dupack, flag);
 	} else {
 		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
@@ -3698,7 +3698,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK)
-		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+		tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 				      is_dupack, flag);
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
@@ -3718,8 +3718,7 @@ old_ack:
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
-		newly_acked_sacked = tp->sacked_out - prior_sacked;
-		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+		tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 				      is_dupack, flag);
 	}
 
-- 
cgit v1.2.1


From 5f2d04f1f9b52604fca6ee08a77972c0df67e082 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 26 Aug 2012 19:13:55 +0200
Subject: ipv4: fix path MTU discovery with connection tracking

IPv4 conntrack defragments incoming packet at the PRE_ROUTING hook and
(in case of forwarded packets) refragments them at POST_ROUTING
independent of the IP_DF flag. Refragmentation uses the dst_mtu() of
the local route without caring about the original fragment sizes,
thereby breaking PMTUD.

This patch fixes this by keeping track of the largest received fragment
with IP_DF set and generates an ICMP fragmentation required error during
refragmentation if that size exceeds the MTU.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 8 +++++++-
 net/ipv4/ip_output.c   | 4 +++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409c..fa6a12c51066 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -523,6 +523,10 @@ found:
 	if (offset == 0)
 		qp->q.last_in |= INET_FRAG_FIRST_IN;
 
+	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+	    skb->len + ihl > qp->q.max_size)
+		qp->q.max_size = skb->len + ihl;
+
 	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    qp->q.meat == qp->q.len)
 		return ip_frag_reasm(qp, prev, dev);
@@ -646,9 +650,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	head->next = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
+	IPCB(head)->frag_max_size = qp->q.max_size;
 
 	iph = ip_hdr(head);
-	iph->frag_off = 0;
+	/* max_size != 0 implies at least one fragment had IP_DF set */
+	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
 	iph->tot_len = htons(len);
 	iph->tos |= ecn;
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c196d749daf2..a5beab1dc958 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -467,7 +467,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 	iph = ip_hdr(skb);
 
-	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
+	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
+		     (IPCB(skb)->frag_max_size &&
+		      IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 			  htonl(ip_skb_dst_mtu(skb)));
-- 
cgit v1.2.1


From 051966c0c644a1c96092d4206e00704ade813c9a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 26 Aug 2012 19:14:04 +0200
Subject: netfilter: nf_nat: add protoff argument to packet mangling functions

For mangling IPv6 packets the protocol header offset needs to be known
by the NAT packet mangling functions. Add a so far unused protoff argument
and convert the conntrack and NAT helpers to use it in preparation of
IPv6 NAT.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  6 +-
 net/ipv4/netfilter/nf_nat_amanda.c             |  3 +-
 net/ipv4/netfilter/nf_nat_ftp.c                |  3 +-
 net/ipv4/netfilter/nf_nat_h323.c               | 48 +++++++------
 net/ipv4/netfilter/nf_nat_helper.c             |  9 ++-
 net/ipv4/netfilter/nf_nat_irc.c                |  3 +-
 net/ipv4/netfilter/nf_nat_pptp.c               |  6 +-
 net/ipv4/netfilter/nf_nat_sip.c                | 96 +++++++++++++++-----------
 8 files changed, 100 insertions(+), 74 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index e7ff2dcab6ce..4ada3295d9a7 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -31,7 +31,8 @@
 
 int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
 			      struct nf_conn *ct,
-			      enum ip_conntrack_info ctinfo);
+			      enum ip_conntrack_info ctinfo,
+			      unsigned int protoff);
 EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook);
 
 static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
@@ -149,7 +150,8 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
 		typeof(nf_nat_seq_adjust_hook) seq_adjust;
 
 		seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
-		if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) {
+		if (!seq_adjust ||
+		    !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
 			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
 			return NF_DROP;
 		}
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 3c04d24e2976..75464b62f5f2 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -26,6 +26,7 @@ MODULE_ALIAS("ip_nat_amanda");
 
 static unsigned int help(struct sk_buff *skb,
 			 enum ip_conntrack_info ctinfo,
+			 unsigned int protoff,
 			 unsigned int matchoff,
 			 unsigned int matchlen,
 			 struct nf_conntrack_expect *exp)
@@ -61,7 +62,7 @@ static unsigned int help(struct sk_buff *skb,
 
 	sprintf(buffer, "%u", port);
 	ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
-				       matchoff, matchlen,
+				       protoff, matchoff, matchlen,
 				       buffer, strlen(buffer));
 	if (ret != NF_ACCEPT)
 		nf_ct_unexpect_related(exp);
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index e462a957d080..5589f3af4a8e 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -55,6 +55,7 @@ static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
 static unsigned int nf_nat_ftp(struct sk_buff *skb,
 			       enum ip_conntrack_info ctinfo,
 			       enum nf_ct_ftp_type type,
+			       unsigned int protoff,
 			       unsigned int matchoff,
 			       unsigned int matchlen,
 			       struct nf_conntrack_expect *exp)
@@ -100,7 +101,7 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
 
 	pr_debug("calling nf_nat_mangle_tcp_packet\n");
 
-	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
+	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
 				      matchlen, buffer, buflen))
 		goto out;
 
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index c6784a18c1c4..d2c228db38b5 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -21,7 +21,7 @@
 #include <linux/netfilter/nf_conntrack_h323.h>
 
 /****************************************************************************/
-static int set_addr(struct sk_buff *skb,
+static int set_addr(struct sk_buff *skb, unsigned int protoff,
 		    unsigned char **data, int dataoff,
 		    unsigned int addroff, __be32 ip, __be16 port)
 {
@@ -40,7 +40,7 @@ static int set_addr(struct sk_buff *skb,
 
 	if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
 		if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
-					      addroff, sizeof(buf),
+					      protoff, addroff, sizeof(buf),
 					      (char *) &buf, sizeof(buf))) {
 			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
 			return -1;
@@ -54,7 +54,7 @@ static int set_addr(struct sk_buff *skb,
 		*data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
 	} else {
 		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
-					      addroff, sizeof(buf),
+					      protoff, addroff, sizeof(buf),
 					      (char *) &buf, sizeof(buf))) {
 			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
 			return -1;
@@ -69,22 +69,22 @@ static int set_addr(struct sk_buff *skb,
 }
 
 /****************************************************************************/
-static int set_h225_addr(struct sk_buff *skb,
+static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
 			 unsigned char **data, int dataoff,
 			 TransportAddress *taddr,
 			 union nf_inet_addr *addr, __be16 port)
 {
-	return set_addr(skb, data, dataoff, taddr->ipAddress.ip,
+	return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
 			addr->ip, port);
 }
 
 /****************************************************************************/
-static int set_h245_addr(struct sk_buff *skb,
+static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
 			 unsigned char **data, int dataoff,
 			 H245_TransportAddress *taddr,
 			 union nf_inet_addr *addr, __be16 port)
 {
-	return set_addr(skb, data, dataoff,
+	return set_addr(skb, protoff, data, dataoff,
 			taddr->unicastAddress.iPAddress.network,
 			addr->ip, port);
 }
@@ -92,7 +92,7 @@ static int set_h245_addr(struct sk_buff *skb,
 /****************************************************************************/
 static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
 			enum ip_conntrack_info ctinfo,
-			unsigned char **data,
+			unsigned int protoff, unsigned char **data,
 			TransportAddress *taddr, int count)
 {
 	const struct nf_ct_h323_master *info = nfct_help_data(ct);
@@ -118,7 +118,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
 					 &addr.ip, port,
 					 &ct->tuplehash[!dir].tuple.dst.u3.ip,
 					 info->sig_port[!dir]);
-				return set_h225_addr(skb, data, 0, &taddr[i],
+				return set_h225_addr(skb, protoff, data, 0,
+						     &taddr[i],
 						     &ct->tuplehash[!dir].
 						     tuple.dst.u3,
 						     info->sig_port[!dir]);
@@ -129,7 +130,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
 					 &addr.ip, port,
 					 &ct->tuplehash[!dir].tuple.src.u3.ip,
 					 info->sig_port[!dir]);
-				return set_h225_addr(skb, data, 0, &taddr[i],
+				return set_h225_addr(skb, protoff, data, 0,
+						     &taddr[i],
 						     &ct->tuplehash[!dir].
 						     tuple.src.u3,
 						     info->sig_port[!dir]);
@@ -143,7 +145,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
 /****************************************************************************/
 static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
 			enum ip_conntrack_info ctinfo,
-			unsigned char **data,
+			unsigned int protoff, unsigned char **data,
 			TransportAddress *taddr, int count)
 {
 	int dir = CTINFO2DIR(ctinfo);
@@ -159,7 +161,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
 				 &addr.ip, ntohs(port),
 				 &ct->tuplehash[!dir].tuple.dst.u3.ip,
 				 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
-			return set_h225_addr(skb, data, 0, &taddr[i],
+			return set_h225_addr(skb, protoff, data, 0, &taddr[i],
 					     &ct->tuplehash[!dir].tuple.dst.u3,
 					     ct->tuplehash[!dir].tuple.
 								dst.u.udp.port);
@@ -172,7 +174,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
 /****************************************************************************/
 static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 			enum ip_conntrack_info ctinfo,
-			unsigned char **data, int dataoff,
+			unsigned int protoff, unsigned char **data, int dataoff,
 			H245_TransportAddress *taddr,
 			__be16 port, __be16 rtp_port,
 			struct nf_conntrack_expect *rtp_exp,
@@ -244,7 +246,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 	}
 
 	/* Modify signal */
-	if (set_h245_addr(skb, data, dataoff, taddr,
+	if (set_h245_addr(skb, protoff, data, dataoff, taddr,
 			  &ct->tuplehash[!dir].tuple.dst.u3,
 			  htons((port & htons(1)) ? nated_port + 1 :
 						    nated_port)) == 0) {
@@ -275,7 +277,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 /****************************************************************************/
 static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
 		    enum ip_conntrack_info ctinfo,
-		    unsigned char **data, int dataoff,
+		    unsigned int protoff, unsigned char **data, int dataoff,
 		    H245_TransportAddress *taddr, __be16 port,
 		    struct nf_conntrack_expect *exp)
 {
@@ -307,7 +309,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
 	}
 
 	/* Modify signal */
-	if (set_h245_addr(skb, data, dataoff, taddr,
+	if (set_h245_addr(skb, protoff, data, dataoff, taddr,
 			  &ct->tuplehash[!dir].tuple.dst.u3,
 			  htons(nated_port)) < 0) {
 		nf_ct_unexpect_related(exp);
@@ -326,7 +328,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
 /****************************************************************************/
 static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 		    enum ip_conntrack_info ctinfo,
-		    unsigned char **data, int dataoff,
+		    unsigned int protoff, unsigned char **data, int dataoff,
 		    TransportAddress *taddr, __be16 port,
 		    struct nf_conntrack_expect *exp)
 {
@@ -363,7 +365,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 	}
 
 	/* Modify signal */
-	if (set_h225_addr(skb, data, dataoff, taddr,
+	if (set_h225_addr(skb, protoff, data, dataoff, taddr,
 			  &ct->tuplehash[!dir].tuple.dst.u3,
 			  htons(nated_port)) == 0) {
 		/* Save ports */
@@ -416,7 +418,8 @@ static void ip_nat_q931_expect(struct nf_conn *new,
 /****************************************************************************/
 static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 		    enum ip_conntrack_info ctinfo,
-		    unsigned char **data, TransportAddress *taddr, int idx,
+		    unsigned int protoff, unsigned char **data,
+		    TransportAddress *taddr, int idx,
 		    __be16 port, struct nf_conntrack_expect *exp)
 {
 	struct nf_ct_h323_master *info = nfct_help_data(ct);
@@ -453,7 +456,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 	}
 
 	/* Modify signal */
-	if (set_h225_addr(skb, data, 0, &taddr[idx],
+	if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
 			  &ct->tuplehash[!dir].tuple.dst.u3,
 			  htons(nated_port)) == 0) {
 		/* Save ports */
@@ -464,7 +467,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 		if (idx > 0 &&
 		    get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
 		    (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
-			set_h225_addr(skb, data, 0, &taddr[0],
+			set_h225_addr(skb, protoff, data, 0, &taddr[0],
 				      &ct->tuplehash[!dir].tuple.dst.u3,
 				      info->sig_port[!dir]);
 		}
@@ -507,6 +510,7 @@ static void ip_nat_callforwarding_expect(struct nf_conn *new,
 /****************************************************************************/
 static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
 			      enum ip_conntrack_info ctinfo,
+			      unsigned int protoff,
 			      unsigned char **data, int dataoff,
 			      TransportAddress *taddr, __be16 port,
 			      struct nf_conntrack_expect *exp)
@@ -541,7 +545,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
 	}
 
 	/* Modify signal */
-	if (!set_h225_addr(skb, data, dataoff, taddr,
+	if (!set_h225_addr(skb, protoff, data, dataoff, taddr,
 			   &ct->tuplehash[!dir].tuple.dst.u3,
 			   htons(nated_port)) == 0) {
 		nf_ct_unexpect_related(exp);
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 2e59ad0b90ca..2fefec5e757c 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -206,6 +206,7 @@ static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data
 int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 			       struct nf_conn *ct,
 			       enum ip_conntrack_info ctinfo,
+			       unsigned int protoff,
 			       unsigned int match_offset,
 			       unsigned int match_len,
 			       const char *rep_buffer,
@@ -257,6 +258,7 @@ int
 nf_nat_mangle_udp_packet(struct sk_buff *skb,
 			 struct nf_conn *ct,
 			 enum ip_conntrack_info ctinfo,
+			 unsigned int protoff,
 			 unsigned int match_offset,
 			 unsigned int match_len,
 			 const char *rep_buffer,
@@ -387,7 +389,8 @@ nf_nat_sack_adjust(struct sk_buff *skb,
 int
 nf_nat_seq_adjust(struct sk_buff *skb,
 		  struct nf_conn *ct,
-		  enum ip_conntrack_info ctinfo)
+		  enum ip_conntrack_info ctinfo,
+		  unsigned int protoff)
 {
 	struct tcphdr *tcph;
 	int dir;
@@ -401,10 +404,10 @@ nf_nat_seq_adjust(struct sk_buff *skb,
 	this_way = &nat->seq[dir];
 	other_way = &nat->seq[!dir];
 
-	if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
+	if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
 		return 0;
 
-	tcph = (void *)skb->data + ip_hdrlen(skb);
+	tcph = (void *)skb->data + protoff;
 	if (after(ntohl(tcph->seq), this_way->correction_pos))
 		seqoff = this_way->offset_after;
 	else
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index 979ae165f4ef..5b0c20a1f08d 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -29,6 +29,7 @@ MODULE_ALIAS("ip_nat_irc");
 
 static unsigned int help(struct sk_buff *skb,
 			 enum ip_conntrack_info ctinfo,
+			 unsigned int protoff,
 			 unsigned int matchoff,
 			 unsigned int matchlen,
 			 struct nf_conntrack_expect *exp)
@@ -66,7 +67,7 @@ static unsigned int help(struct sk_buff *skb,
 		 buffer, &ip, port);
 
 	ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo,
-				       matchoff, matchlen, buffer,
+				       protoff, matchoff, matchlen, buffer,
 				       strlen(buffer));
 	if (ret != NF_ACCEPT)
 		nf_ct_unexpect_related(exp);
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 388140881ebe..31ef890d894b 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -113,6 +113,7 @@ static int
 pptp_outbound_pkt(struct sk_buff *skb,
 		  struct nf_conn *ct,
 		  enum ip_conntrack_info ctinfo,
+		  unsigned int protoff,
 		  struct PptpControlHeader *ctlh,
 		  union pptp_ctrl_union *pptpReq)
 
@@ -175,7 +176,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
 		 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
 
 	/* mangle packet */
-	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
 				     cid_off + sizeof(struct pptp_pkt_hdr) +
 				     sizeof(struct PptpControlHeader),
 				     sizeof(new_callid), (char *)&new_callid,
@@ -216,6 +217,7 @@ static int
 pptp_inbound_pkt(struct sk_buff *skb,
 		 struct nf_conn *ct,
 		 enum ip_conntrack_info ctinfo,
+		 unsigned int protoff,
 		 struct PptpControlHeader *ctlh,
 		 union pptp_ctrl_union *pptpReq)
 {
@@ -268,7 +270,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
 	pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
 		 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
 
-	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
 				     pcid_off + sizeof(struct pptp_pkt_hdr) +
 				     sizeof(struct PptpControlHeader),
 				     sizeof(new_pcid), (char *)&new_pcid,
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 4ad9cf173992..df626af8413c 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -30,7 +30,8 @@ MODULE_DESCRIPTION("SIP NAT helper");
 MODULE_ALIAS("ip_nat_sip");
 
 
-static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
+static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff,
+				  unsigned int dataoff,
 				  const char **dptr, unsigned int *datalen,
 				  unsigned int matchoff, unsigned int matchlen,
 				  const char *buffer, unsigned int buflen)
@@ -46,7 +47,7 @@ static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
 		matchoff += dataoff - baseoff;
 
 		if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
-						matchoff, matchlen,
+						protoff, matchoff, matchlen,
 						buffer, buflen, false))
 			return 0;
 	} else {
@@ -54,7 +55,7 @@ static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
 		matchoff += dataoff - baseoff;
 
 		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
-					      matchoff, matchlen,
+					      protoff, matchoff, matchlen,
 					      buffer, buflen))
 			return 0;
 	}
@@ -65,7 +66,8 @@ static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
 	return 1;
 }
 
-static int map_addr(struct sk_buff *skb, unsigned int dataoff,
+static int map_addr(struct sk_buff *skb, unsigned int protoff,
+		    unsigned int dataoff,
 		    const char **dptr, unsigned int *datalen,
 		    unsigned int matchoff, unsigned int matchlen,
 		    union nf_inet_addr *addr, __be16 port)
@@ -94,11 +96,12 @@ static int map_addr(struct sk_buff *skb, unsigned int dataoff,
 
 	buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
 
-	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
-			     buffer, buflen);
+	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+			     matchoff, matchlen, buffer, buflen);
 }
 
-static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
+static int map_sip_addr(struct sk_buff *skb, unsigned int protoff,
+			unsigned int dataoff,
 			const char **dptr, unsigned int *datalen,
 			enum sip_header_types type)
 {
@@ -111,11 +114,12 @@ static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
 	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
 				    &matchoff, &matchlen, &addr, &port) <= 0)
 		return 1;
-	return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
-			&addr, port);
+	return map_addr(skb, protoff, dataoff, dptr, datalen,
+			matchoff, matchlen, &addr, port);
 }
 
-static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
+static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int protoff,
+			       unsigned int dataoff,
 			       const char **dptr, unsigned int *datalen)
 {
 	enum ip_conntrack_info ctinfo;
@@ -132,8 +136,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 		if (ct_sip_parse_request(ct, *dptr, *datalen,
 					 &matchoff, &matchlen,
 					 &addr, &port) > 0 &&
-		    !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
-			      &addr, port))
+		    !map_addr(skb, protoff, dataoff, dptr, datalen,
+			      matchoff, matchlen, &addr, port))
 			return NF_DROP;
 		request = 1;
 	} else
@@ -164,8 +168,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 		}
 
 		olen = *datalen;
-		if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
-			      &addr, port))
+		if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+			      matchoff, matchlen, &addr, port))
 			return NF_DROP;
 
 		matchend = matchoff + matchlen + *datalen - olen;
@@ -179,7 +183,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 		    addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
 			buflen = sprintf(buffer, "%pI4",
 					&ct->tuplehash[!dir].tuple.dst.u3.ip);
-			if (!mangle_packet(skb, dataoff, dptr, datalen,
+			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 					   poff, plen, buffer, buflen))
 				return NF_DROP;
 		}
@@ -193,7 +197,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 		    addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
 			buflen = sprintf(buffer, "%pI4",
 					&ct->tuplehash[!dir].tuple.src.u3.ip);
-			if (!mangle_packet(skb, dataoff, dptr, datalen,
+			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 					   poff, plen, buffer, buflen))
 				return NF_DROP;
 		}
@@ -207,7 +211,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 		    htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
 			__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
 			buflen = sprintf(buffer, "%u", ntohs(p));
-			if (!mangle_packet(skb, dataoff, dptr, datalen,
+			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 					   poff, plen, buffer, buflen))
 				return NF_DROP;
 		}
@@ -221,13 +225,14 @@ next:
 				       SIP_HDR_CONTACT, &in_header,
 				       &matchoff, &matchlen,
 				       &addr, &port) > 0) {
-		if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
+		if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+			      matchoff, matchlen,
 			      &addr, port))
 			return NF_DROP;
 	}
 
-	if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
-	    !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
+	if (!map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_FROM) ||
+	    !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO))
 		return NF_DROP;
 
 	return NF_ACCEPT;
@@ -272,7 +277,8 @@ static void ip_nat_sip_expected(struct nf_conn *ct,
 	}
 }
 
-static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
+static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
+				      unsigned int dataoff,
 				      const char **dptr, unsigned int *datalen,
 				      struct nf_conntrack_expect *exp,
 				      unsigned int matchoff,
@@ -326,7 +332,7 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
 	if (exp->tuple.dst.u3.ip != exp->saved_ip ||
 	    exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
 		buflen = sprintf(buffer, "%pI4:%u", &newip, port);
-		if (!mangle_packet(skb, dataoff, dptr, datalen,
+		if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 				   matchoff, matchlen, buffer, buflen))
 			goto err;
 	}
@@ -337,7 +343,8 @@ err:
 	return NF_DROP;
 }
 
-static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
+static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
+			      unsigned int dataoff,
 			      const char **dptr, unsigned int *datalen)
 {
 	enum ip_conntrack_info ctinfo;
@@ -359,11 +366,12 @@ static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
 		return 0;
 
 	buflen = sprintf(buffer, "%u", c_len);
-	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
-			     buffer, buflen);
+	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+			     matchoff, matchlen, buffer, buflen);
 }
 
-static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
+static int mangle_sdp_packet(struct sk_buff *skb, unsigned int protoff,
+			     unsigned int dataoff,
 			     const char **dptr, unsigned int *datalen,
 			     unsigned int sdpoff,
 			     enum sdp_header_types type,
@@ -377,11 +385,12 @@ static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
 	if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
 				  &matchoff, &matchlen) <= 0)
 		return -ENOENT;
-	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
-			     buffer, buflen) ? 0 : -EINVAL;
+	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+			     matchoff, matchlen, buffer, buflen) ? 0 : -EINVAL;
 }
 
-static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
+static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff,
+				    unsigned int dataoff,
 				    const char **dptr, unsigned int *datalen,
 				    unsigned int sdpoff,
 				    enum sdp_header_types type,
@@ -392,14 +401,15 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
 	unsigned int buflen;
 
 	buflen = sprintf(buffer, "%pI4", &addr->ip);
-	if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
-			      buffer, buflen))
+	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen,
+			      sdpoff, type, term, buffer, buflen))
 		return 0;
 
-	return mangle_content_len(skb, dataoff, dptr, datalen);
+	return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
 }
 
-static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
+static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int protoff,
+				    unsigned int dataoff,
 				    const char **dptr, unsigned int *datalen,
 				    unsigned int matchoff,
 				    unsigned int matchlen,
@@ -409,14 +419,15 @@ static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
 	unsigned int buflen;
 
 	buflen = sprintf(buffer, "%u", port);
-	if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
-			   buffer, buflen))
+	if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+			   matchoff, matchlen, buffer, buflen))
 		return 0;
 
-	return mangle_content_len(skb, dataoff, dptr, datalen);
+	return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
 }
 
-static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
+static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int protoff,
+				       unsigned int dataoff,
 				       const char **dptr, unsigned int *datalen,
 				       unsigned int sdpoff,
 				       const union nf_inet_addr *addr)
@@ -426,12 +437,12 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff
 
 	/* Mangle session description owner and contact addresses */
 	buflen = sprintf(buffer, "%pI4", &addr->ip);
-	if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
+	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
 			       SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
 			       buffer, buflen))
 		return 0;
 
-	switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
+	switch (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
 				  SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
 				  buffer, buflen)) {
 	case 0:
@@ -448,12 +459,13 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff
 		return 0;
 	}
 
-	return mangle_content_len(skb, dataoff, dptr, datalen);
+	return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
 }
 
 /* So, this packet has hit the connection tracking matching code.
    Mangle it, and change the expectation to match the new version. */
-static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
+static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int protoff,
+				     unsigned int dataoff,
 				     const char **dptr, unsigned int *datalen,
 				     struct nf_conntrack_expect *rtp_exp,
 				     struct nf_conntrack_expect *rtcp_exp,
@@ -514,7 +526,7 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
 
 	/* Update media port. */
 	if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
-	    !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
+	    !ip_nat_sdp_port(skb, protoff, dataoff, dptr, datalen,
 			     mediaoff, medialen, port))
 		goto err2;
 
-- 
cgit v1.2.1


From c7232c9979cba684c50b64c513c4a83c9aa70563 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 26 Aug 2012 19:14:06 +0200
Subject: netfilter: add protocol independent NAT core

Convert the IPv4 NAT implementation to a protocol independent core and
address family specific modules.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter.c                           |  37 --
 net/ipv4/netfilter/Kconfig                     |  64 +--
 net/ipv4/netfilter/Makefile                    |  11 +-
 net/ipv4/netfilter/ipt_MASQUERADE.c            |  15 +-
 net/ipv4/netfilter/ipt_NETMAP.c                |  15 +-
 net/ipv4/netfilter/ipt_REDIRECT.c              |  15 +-
 net/ipv4/netfilter/iptable_nat.c               | 320 +++++++++++
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |   6 -
 net/ipv4/netfilter/nf_nat_amanda.c             |   1 -
 net/ipv4/netfilter/nf_nat_core.c               | 763 -------------------------
 net/ipv4/netfilter/nf_nat_ftp.c                |   1 -
 net/ipv4/netfilter/nf_nat_h323.c               |  23 +-
 net/ipv4/netfilter/nf_nat_helper.c             | 461 ---------------
 net/ipv4/netfilter/nf_nat_irc.c                |   1 -
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c       | 281 +++++++++
 net/ipv4/netfilter/nf_nat_pptp.c               |  15 +-
 net/ipv4/netfilter/nf_nat_proto_common.c       | 114 ----
 net/ipv4/netfilter/nf_nat_proto_dccp.c         | 106 ----
 net/ipv4/netfilter/nf_nat_proto_gre.c          |  30 +-
 net/ipv4/netfilter/nf_nat_proto_icmp.c         |  24 +-
 net/ipv4/netfilter/nf_nat_proto_sctp.c         |  96 ----
 net/ipv4/netfilter/nf_nat_proto_tcp.c          |  91 ---
 net/ipv4/netfilter/nf_nat_proto_udp.c          |  82 ---
 net/ipv4/netfilter/nf_nat_proto_udplite.c      |  98 ----
 net/ipv4/netfilter/nf_nat_proto_unknown.c      |  52 --
 net/ipv4/netfilter/nf_nat_rule.c               | 214 -------
 net/ipv4/netfilter/nf_nat_sip.c                |  19 +-
 net/ipv4/netfilter/nf_nat_standalone.c         | 326 -----------
 net/ipv4/netfilter/nf_nat_tftp.c               |   1 -
 29 files changed, 709 insertions(+), 2573 deletions(-)
 create mode 100644 net/ipv4/netfilter/iptable_nat.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_core.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_helper.c
 create mode 100644 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_proto_common.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_proto_dccp.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_proto_sctp.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_proto_tcp.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_proto_udp.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_proto_udplite.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_proto_unknown.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_rule.c
 delete mode 100644 net/ipv4/netfilter/nf_nat_standalone.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index ed1b36783192..f1643c0c3587 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -72,43 +72,6 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
 }
 EXPORT_SYMBOL(ip_route_me_harder);
 
-#ifdef CONFIG_XFRM
-int ip_xfrm_me_harder(struct sk_buff *skb)
-{
-	struct flowi fl;
-	unsigned int hh_len;
-	struct dst_entry *dst;
-
-	if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
-		return 0;
-	if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
-		return -1;
-
-	dst = skb_dst(skb);
-	if (dst->xfrm)
-		dst = ((struct xfrm_dst *)dst)->route;
-	dst_hold(dst);
-
-	dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
-	if (IS_ERR(dst))
-		return -1;
-
-	skb_dst_drop(skb);
-	skb_dst_set(skb, dst);
-
-	/* Change in oif may mean change in hh_len. */
-	hh_len = skb_dst(skb)->dev->hard_header_len;
-	if (skb_headroom(skb) < hh_len &&
-	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
-		return -1;
-	return 0;
-}
-EXPORT_SYMBOL(ip_xfrm_me_harder);
-#endif
-
-void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
-EXPORT_SYMBOL(ip_nat_decode_session);
-
 /*
  * Extra routing may needed on local out, as the QUEUE target never
  * returns control to the table.
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index fcc543cd987a..b26629681bdb 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -143,25 +143,22 @@ config IP_NF_TARGET_ULOG
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 # NAT + specific targets: nf_conntrack
-config NF_NAT
-	tristate "Full NAT"
+config NF_NAT_IPV4
+	tristate "IPv4 NAT"
 	depends on NF_CONNTRACK_IPV4
 	default m if NETFILTER_ADVANCED=n
+	select NF_NAT
 	help
-	  The Full NAT option allows masquerading, port forwarding and other
+	  The IPv4 NAT option allows masquerading, port forwarding and other
 	  forms of full Network Address Port Translation.  It is controlled by
 	  the `nat' table in iptables: see the man page for iptables(8).
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config NF_NAT_NEEDED
-	bool
-	depends on NF_NAT
-	default y
+if NF_NAT_IPV4
 
 config IP_NF_TARGET_MASQUERADE
 	tristate "MASQUERADE target support"
-	depends on NF_NAT
 	default m if NETFILTER_ADVANCED=n
 	help
 	  Masquerading is a special case of NAT: all outgoing connections are
@@ -174,7 +171,6 @@ config IP_NF_TARGET_MASQUERADE
 
 config IP_NF_TARGET_NETMAP
 	tristate "NETMAP target support"
-	depends on NF_NAT
 	depends on NETFILTER_ADVANCED
 	help
 	  NETMAP is an implementation of static 1:1 NAT mapping of network
@@ -185,7 +181,6 @@ config IP_NF_TARGET_NETMAP
 
 config IP_NF_TARGET_REDIRECT
 	tristate "REDIRECT target support"
-	depends on NF_NAT
 	depends on NETFILTER_ADVANCED
 	help
 	  REDIRECT is a special case of NAT: all incoming connections are
@@ -195,9 +190,11 @@ config IP_NF_TARGET_REDIRECT
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+endif
+
 config NF_NAT_SNMP_BASIC
 	tristate "Basic SNMP-ALG support"
-	depends on NF_CONNTRACK_SNMP && NF_NAT
+	depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4
 	depends on NETFILTER_ADVANCED
 	default NF_NAT && NF_CONNTRACK_SNMP
 	---help---
@@ -219,61 +216,46 @@ config NF_NAT_SNMP_BASIC
 #           <expr> '&&' <expr>                   (6)
 #
 # (6) Returns the result of min(/expr/, /expr/).
-config NF_NAT_PROTO_DCCP
-	tristate
-	depends on NF_NAT && NF_CT_PROTO_DCCP
-	default NF_NAT && NF_CT_PROTO_DCCP
 
 config NF_NAT_PROTO_GRE
 	tristate
-	depends on NF_NAT && NF_CT_PROTO_GRE
-
-config NF_NAT_PROTO_UDPLITE
-	tristate
-	depends on NF_NAT && NF_CT_PROTO_UDPLITE
-	default NF_NAT && NF_CT_PROTO_UDPLITE
-
-config NF_NAT_PROTO_SCTP
-	tristate
-	default NF_NAT && NF_CT_PROTO_SCTP
-	depends on NF_NAT && NF_CT_PROTO_SCTP
-	select LIBCRC32C
+	depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
 
 config NF_NAT_FTP
 	tristate
-	depends on NF_CONNTRACK && NF_NAT
-	default NF_NAT && NF_CONNTRACK_FTP
+	depends on NF_CONNTRACK && NF_NAT_IPV4
+	default NF_NAT_IPV4 && NF_CONNTRACK_FTP
 
 config NF_NAT_IRC
 	tristate
-	depends on NF_CONNTRACK && NF_NAT
-	default NF_NAT && NF_CONNTRACK_IRC
+	depends on NF_CONNTRACK && NF_NAT_IPV4
+	default NF_NAT_IPV4 && NF_CONNTRACK_IRC
 
 config NF_NAT_TFTP
 	tristate
-	depends on NF_CONNTRACK && NF_NAT
-	default NF_NAT && NF_CONNTRACK_TFTP
+	depends on NF_CONNTRACK && NF_NAT_IPV4
+	default NF_NAT_IPV4 && NF_CONNTRACK_TFTP
 
 config NF_NAT_AMANDA
 	tristate
-	depends on NF_CONNTRACK && NF_NAT
-	default NF_NAT && NF_CONNTRACK_AMANDA
+	depends on NF_CONNTRACK && NF_NAT_IPV4
+	default NF_NAT_IPV4 && NF_CONNTRACK_AMANDA
 
 config NF_NAT_PPTP
 	tristate
-	depends on NF_CONNTRACK && NF_NAT
-	default NF_NAT && NF_CONNTRACK_PPTP
+	depends on NF_CONNTRACK && NF_NAT_IPV4
+	default NF_NAT_IPV4 && NF_CONNTRACK_PPTP
 	select NF_NAT_PROTO_GRE
 
 config NF_NAT_H323
 	tristate
-	depends on NF_CONNTRACK && NF_NAT
-	default NF_NAT && NF_CONNTRACK_H323
+	depends on NF_CONNTRACK && NF_NAT_IPV4
+	default NF_NAT_IPV4 && NF_CONNTRACK_H323
 
 config NF_NAT_SIP
 	tristate
-	depends on NF_CONNTRACK && NF_NAT
-	default NF_NAT && NF_CONNTRACK_SIP
+	depends on NF_CONNTRACK && NF_NAT_IPV4
+	default NF_NAT_IPV4 && NF_CONNTRACK_SIP
 
 # mangle + specific targets
 config IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index c20674dc9452..0ea3acc510e2 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,13 +10,11 @@ nf_conntrack_ipv4-objs	+= nf_conntrack_l3proto_ipv4_compat.o
 endif
 endif
 
-nf_nat-y		:= nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
-iptable_nat-y	:= nf_nat_rule.o nf_nat_standalone.o
-
 # connection tracking
 obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
 
-obj-$(CONFIG_NF_NAT) += nf_nat.o
+nf_nat_ipv4-y		:= nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
 
 # defrag
 obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
@@ -32,10 +30,7 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
 obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 
 # NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
 obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
-obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
-obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
@@ -43,7 +38,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
 # the three instances of ip_tables
 obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
 obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o
 obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
 obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
 
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index cbb6a1a6f6f7..1c3aa28b51ae 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -19,9 +19,9 @@
 #include <net/ip.h>
 #include <net/checksum.h>
 #include <net/route.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -49,7 +49,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	struct nf_conn *ct;
 	struct nf_conn_nat *nat;
 	enum ip_conntrack_info ctinfo;
-	struct nf_nat_ipv4_range newrange;
+	struct nf_nat_range newrange;
 	const struct nf_nat_ipv4_multi_range_compat *mr;
 	const struct rtable *rt;
 	__be32 newsrc, nh;
@@ -80,10 +80,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	nat->masq_index = par->out->ifindex;
 
 	/* Transfer from original range. */
-	newrange = ((struct nf_nat_ipv4_range)
-		{ mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
-		  newsrc, newsrc,
-		  mr->range[0].min, mr->range[0].max });
+	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+	newrange.flags       = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
+	newrange.min_addr.ip = newsrc;
+	newrange.max_addr.ip = newsrc;
+	newrange.min_proto   = mr->range[0].min;
+	newrange.max_proto   = mr->range[0].max;
 
 	/* Hand modified range to generic setup. */
 	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index b5bfbbabf70d..85028dc0425d 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -16,7 +16,7 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_nat.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
@@ -44,7 +44,7 @@ netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	enum ip_conntrack_info ctinfo;
 	__be32 new_ip, netmask;
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_ipv4_range newrange;
+	struct nf_nat_range newrange;
 
 	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
 		     par->hooknum == NF_INET_POST_ROUTING ||
@@ -61,10 +61,13 @@ netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		new_ip = ip_hdr(skb)->saddr & ~netmask;
 	new_ip |= mr->range[0].min_ip & netmask;
 
-	newrange = ((struct nf_nat_ipv4_range)
-		{ mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
-		  new_ip, new_ip,
-		  mr->range[0].min, mr->range[0].max });
+	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+	newrange.flags	     = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
+	newrange.min_addr.ip = new_ip;
+	newrange.max_addr.ip = new_ip;
+	newrange.min_proto   = mr->range[0].min;
+	newrange.max_proto   = mr->range[0].max;
 
 	/* Hand modified range to generic setup. */
 	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 7c0103a5203e..11407d7d2472 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -19,7 +19,7 @@
 #include <net/checksum.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_nat.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -48,7 +48,7 @@ redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	enum ip_conntrack_info ctinfo;
 	__be32 newdst;
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_ipv4_range newrange;
+	struct nf_nat_range newrange;
 
 	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
 		     par->hooknum == NF_INET_LOCAL_OUT);
@@ -76,10 +76,13 @@ redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	}
 
 	/* Transfer from original range. */
-	newrange = ((struct nf_nat_ipv4_range)
-		{ mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
-		  newdst, newdst,
-		  mr->range[0].min, mr->range[0].max });
+	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+	newrange.flags	     = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
+	newrange.min_addr.ip = newdst;
+	newrange.max_addr.ip = newdst;
+	newrange.min_proto   = mr->range[0].min;
+	newrange.max_proto   = mr->range[0].max;
 
 	/* Hand modified range to generic setup. */
 	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
new file mode 100644
index 000000000000..9e0ffaf1d942
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -0,0 +1,320 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+
+static const struct xt_table nf_nat_ipv4_table = {
+	.name		= "nat",
+	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+};
+
+static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+	/* Force range to this IP; let proto decide mapping for
+	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+	 */
+	struct nf_nat_range range;
+
+	range.flags = 0;
+	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+		 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
+	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
+}
+
+static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
+				     const struct net_device *in,
+				     const struct net_device *out,
+				     struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+	unsigned int ret;
+
+	ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
+	if (ret == NF_ACCEPT) {
+		if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
+			ret = alloc_null_binding(ct, hooknum);
+	}
+	return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_fn(unsigned int hooknum,
+	       struct sk_buff *skb,
+	       const struct net_device *in,
+	       const struct net_device *out,
+	       int (*okfn)(struct sk_buff *))
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn_nat *nat;
+	/* maniptype == SRC for postrouting. */
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+	/* We never see fragments: conntrack defrags on pre-routing
+	 * and local-out, and nf_nat_out protects post-routing.
+	 */
+	NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
+
+	ct = nf_ct_get(skb, &ctinfo);
+	/* Can't track?  It's not due to stress, or conntrack would
+	 * have dropped it.  Hence it's the user's responsibilty to
+	 * packet filter it out, or implement conntrack/NAT for that
+	 * protocol. 8) --RR
+	 */
+	if (!ct)
+		return NF_ACCEPT;
+
+	/* Don't try to NAT if this packet is not conntracked */
+	if (nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	nat = nfct_nat(ct);
+	if (!nat) {
+		/* NAT module was loaded late. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL) {
+			pr_debug("failed to add NAT extension\n");
+			return NF_ACCEPT;
+		}
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   hooknum))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+	case IP_CT_NEW:
+		/* Seen it before?  This can happen for loopback, retrans,
+		 * or local packets.
+		 */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			unsigned int ret;
+
+			ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+			if (ret != NF_ACCEPT)
+				return ret;
+		} else
+			pr_debug("Already setup manip %s for ct %p\n",
+				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+				 ct);
+		break;
+
+	default:
+		/* ESTABLISHED */
+		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
+			     ctinfo == IP_CT_ESTABLISHED_REPLY);
+	}
+
+	return nf_nat_packet(ct, ctinfo, hooknum, skb);
+}
+
+static unsigned int
+nf_nat_ipv4_in(unsigned int hooknum,
+	       struct sk_buff *skb,
+	       const struct net_device *in,
+	       const struct net_device *out,
+	       int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	__be32 daddr = ip_hdr(skb)->daddr;
+
+	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    daddr != ip_hdr(skb)->daddr)
+		skb_dst_drop(skb);
+
+	return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_out(unsigned int hooknum,
+		struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		int (*okfn)(struct sk_buff *))
+{
+#ifdef CONFIG_XFRM
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+#endif
+	unsigned int ret;
+
+	/* root is playing with raw sockets. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+		     ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+		    (ct->tuplehash[dir].tuple.src.u.all !=
+		     ct->tuplehash[!dir].tuple.dst.u.all))
+			if (nf_xfrm_me_harder(skb, AF_INET) < 0)
+				ret = NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_local_fn(unsigned int hooknum,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int ret;
+
+	/* root is playing with raw sockets. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+		    ct->tuplehash[!dir].tuple.src.u3.ip) {
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+			 ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (nf_xfrm_me_harder(skb, AF_INET) < 0)
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
+	/* Before packet filtering, change destination */
+	{
+		.hook		= nf_nat_ipv4_in,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	/* After packet filtering, change source */
+	{
+		.hook		= nf_nat_ipv4_out,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+	/* Before packet filtering, change destination */
+	{
+		.hook		= nf_nat_ipv4_local_fn,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	/* After packet filtering, change source */
+	{
+		.hook		= nf_nat_ipv4_fn,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+};
+
+static int __net_init iptable_nat_net_init(struct net *net)
+{
+	struct ipt_replace *repl;
+
+	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
+	if (repl == NULL)
+		return -ENOMEM;
+	net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+	kfree(repl);
+	if (IS_ERR(net->ipv4.nat_table))
+		return PTR_ERR(net->ipv4.nat_table);
+	return 0;
+}
+
+static void __net_exit iptable_nat_net_exit(struct net *net)
+{
+	ipt_unregister_table(net, net->ipv4.nat_table);
+}
+
+static struct pernet_operations iptable_nat_net_ops = {
+	.init	= iptable_nat_net_init,
+	.exit	= iptable_nat_net_exit,
+};
+
+static int __init iptable_nat_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&iptable_nat_net_ops);
+	if (err < 0)
+		goto err1;
+
+	err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+	if (err < 0)
+		goto err2;
+	return 0;
+
+err2:
+	unregister_pernet_subsys(&iptable_nat_net_ops);
+err1:
+	return err;
+}
+
+static void __exit iptable_nat_exit(void)
+{
+	nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+	unregister_pernet_subsys(&iptable_nat_net_ops);
+}
+
+module_init(iptable_nat_init);
+module_exit(iptable_nat_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 4ada3295d9a7..fcdd0c2406e6 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -29,12 +29,6 @@
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
 #include <net/netfilter/nf_log.h>
 
-int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
-			      struct nf_conn *ct,
-			      enum ip_conntrack_info ctinfo,
-			      unsigned int protoff);
-EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook);
-
 static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
 			      struct nf_conntrack_tuple *tuple)
 {
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 75464b62f5f2..42d337881171 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -16,7 +16,6 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <linux/netfilter/nf_conntrack_amanda.h>
 
 MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
deleted file mode 100644
index 44b082fd48ab..000000000000
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ /dev/null
@@ -1,763 +0,0 @@
-/* NAT for netfilter; shared with compatibility layer. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/skbuff.h>
-#include <linux/gfp.h>
-#include <net/checksum.h>
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/tcp.h>  /* For tcp_prot in getorigdst */
-#include <linux/icmp.h>
-#include <linux/udp.h>
-#include <linux/jhash.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_protocol.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-
-static DEFINE_SPINLOCK(nf_nat_lock);
-
-static struct nf_conntrack_l3proto *l3proto __read_mostly;
-
-#define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
-						__read_mostly;
-
-static inline const struct nf_nat_protocol *
-__nf_nat_proto_find(u_int8_t protonum)
-{
-	return rcu_dereference(nf_nat_protos[protonum]);
-}
-
-/* We keep an extra hash for each conntrack, for fast searching. */
-static inline unsigned int
-hash_by_src(const struct net *net, u16 zone,
-	    const struct nf_conntrack_tuple *tuple)
-{
-	unsigned int hash;
-
-	/* Original src, to ensure we map it consistently if poss. */
-	hash = jhash_3words((__force u32)tuple->src.u3.ip,
-			    (__force u32)tuple->src.u.all ^ zone,
-			    tuple->dst.protonum, nf_conntrack_hash_rnd);
-	return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
-}
-
-/* Is this tuple already taken? (not by us) */
-int
-nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
-		  const struct nf_conn *ignored_conntrack)
-{
-	/* Conntrack tracking doesn't keep track of outgoing tuples; only
-	   incoming ones.  NAT means they don't have a fixed mapping,
-	   so we invert the tuple and look for the incoming reply.
-
-	   We could keep a separate hash if this proves too slow. */
-	struct nf_conntrack_tuple reply;
-
-	nf_ct_invert_tuplepr(&reply, tuple);
-	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
-}
-EXPORT_SYMBOL(nf_nat_used_tuple);
-
-/* If we source map this tuple so reply looks like reply_tuple, will
- * that meet the constraints of range. */
-static int
-in_range(const struct nf_conntrack_tuple *tuple,
-	 const struct nf_nat_ipv4_range *range)
-{
-	const struct nf_nat_protocol *proto;
-	int ret = 0;
-
-	/* If we are supposed to map IPs, then we must be in the
-	   range specified, otherwise let this drag us onto a new src IP. */
-	if (range->flags & NF_NAT_RANGE_MAP_IPS) {
-		if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) ||
-		    ntohl(tuple->src.u3.ip) > ntohl(range->max_ip))
-			return 0;
-	}
-
-	rcu_read_lock();
-	proto = __nf_nat_proto_find(tuple->dst.protonum);
-	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
-	    proto->in_range(tuple, NF_NAT_MANIP_SRC,
-			    &range->min, &range->max))
-		ret = 1;
-	rcu_read_unlock();
-
-	return ret;
-}
-
-static inline int
-same_src(const struct nf_conn *ct,
-	 const struct nf_conntrack_tuple *tuple)
-{
-	const struct nf_conntrack_tuple *t;
-
-	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-	return (t->dst.protonum == tuple->dst.protonum &&
-		t->src.u3.ip == tuple->src.u3.ip &&
-		t->src.u.all == tuple->src.u.all);
-}
-
-/* Only called for SRC manip */
-static int
-find_appropriate_src(struct net *net, u16 zone,
-		     const struct nf_conntrack_tuple *tuple,
-		     struct nf_conntrack_tuple *result,
-		     const struct nf_nat_ipv4_range *range)
-{
-	unsigned int h = hash_by_src(net, zone, tuple);
-	const struct nf_conn_nat *nat;
-	const struct nf_conn *ct;
-	const struct hlist_node *n;
-
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
-		ct = nat->ct;
-		if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
-			/* Copy source part from reply tuple. */
-			nf_ct_invert_tuplepr(result,
-				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-			result->dst = tuple->dst;
-
-			if (in_range(result, range)) {
-				rcu_read_unlock();
-				return 1;
-			}
-		}
-	}
-	rcu_read_unlock();
-	return 0;
-}
-
-/* For [FUTURE] fragmentation handling, we want the least-used
-   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
-   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
-   1-65535, we don't do pro-rata allocation based on ports; we choose
-   the ip with the lowest src-ip/dst-ip/proto usage.
-*/
-static void
-find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
-		    const struct nf_nat_ipv4_range *range,
-		    const struct nf_conn *ct,
-		    enum nf_nat_manip_type maniptype)
-{
-	__be32 *var_ipp;
-	/* Host order */
-	u_int32_t minip, maxip, j;
-
-	/* No IP mapping?  Do nothing. */
-	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
-		return;
-
-	if (maniptype == NF_NAT_MANIP_SRC)
-		var_ipp = &tuple->src.u3.ip;
-	else
-		var_ipp = &tuple->dst.u3.ip;
-
-	/* Fast path: only one choice. */
-	if (range->min_ip == range->max_ip) {
-		*var_ipp = range->min_ip;
-		return;
-	}
-
-	/* Hashing source and destination IPs gives a fairly even
-	 * spread in practice (if there are a small number of IPs
-	 * involved, there usually aren't that many connections
-	 * anyway).  The consistency means that servers see the same
-	 * client coming from the same IP (some Internet Banking sites
-	 * like this), even across reboots. */
-	minip = ntohl(range->min_ip);
-	maxip = ntohl(range->max_ip);
-	j = jhash_2words((__force u32)tuple->src.u3.ip,
-			 range->flags & NF_NAT_RANGE_PERSISTENT ?
-				0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
-	j = ((u64)j * (maxip - minip + 1)) >> 32;
-	*var_ipp = htonl(minip + j);
-}
-
-/* Manipulate the tuple into the range given.  For NF_INET_POST_ROUTING,
- * we change the source to map into the range.  For NF_INET_PRE_ROUTING
- * and NF_INET_LOCAL_OUT, we change the destination to map into the
- * range.  It might not be possible to get a unique tuple, but we try.
- * At worst (or if we race), we will end up with a final duplicate in
- * __ip_conntrack_confirm and drop the packet. */
-static void
-get_unique_tuple(struct nf_conntrack_tuple *tuple,
-		 const struct nf_conntrack_tuple *orig_tuple,
-		 const struct nf_nat_ipv4_range *range,
-		 struct nf_conn *ct,
-		 enum nf_nat_manip_type maniptype)
-{
-	struct net *net = nf_ct_net(ct);
-	const struct nf_nat_protocol *proto;
-	u16 zone = nf_ct_zone(ct);
-
-	/* 1) If this srcip/proto/src-proto-part is currently mapped,
-	   and that same mapping gives a unique tuple within the given
-	   range, use that.
-
-	   This is only required for source (ie. NAT/masq) mappings.
-	   So far, we don't do local source mappings, so multiple
-	   manips not an issue.  */
-	if (maniptype == NF_NAT_MANIP_SRC &&
-	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
-		/* try the original tuple first */
-		if (in_range(orig_tuple, range)) {
-			if (!nf_nat_used_tuple(orig_tuple, ct)) {
-				*tuple = *orig_tuple;
-				return;
-			}
-		} else if (find_appropriate_src(net, zone, orig_tuple, tuple,
-			   range)) {
-			pr_debug("get_unique_tuple: Found current src map\n");
-			if (!nf_nat_used_tuple(tuple, ct))
-				return;
-		}
-	}
-
-	/* 2) Select the least-used IP/proto combination in the given
-	   range. */
-	*tuple = *orig_tuple;
-	find_best_ips_proto(zone, tuple, range, ct, maniptype);
-
-	/* 3) The per-protocol part of the manip is made to map into
-	   the range to make a unique tuple. */
-
-	rcu_read_lock();
-	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
-
-	/* Only bother mapping if it's not already in range and unique */
-	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
-		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
-			if (proto->in_range(tuple, maniptype, &range->min,
-					    &range->max) &&
-			    (range->min.all == range->max.all ||
-			     !nf_nat_used_tuple(tuple, ct)))
-				goto out;
-		} else if (!nf_nat_used_tuple(tuple, ct)) {
-			goto out;
-		}
-	}
-
-	/* Last change: get protocol to try to obtain unique tuple. */
-	proto->unique_tuple(tuple, range, maniptype, ct);
-out:
-	rcu_read_unlock();
-}
-
-unsigned int
-nf_nat_setup_info(struct nf_conn *ct,
-		  const struct nf_nat_ipv4_range *range,
-		  enum nf_nat_manip_type maniptype)
-{
-	struct net *net = nf_ct_net(ct);
-	struct nf_conntrack_tuple curr_tuple, new_tuple;
-	struct nf_conn_nat *nat;
-
-	/* nat helper or nfctnetlink also setup binding */
-	nat = nfct_nat(ct);
-	if (!nat) {
-		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
-		if (nat == NULL) {
-			pr_debug("failed to add NAT extension\n");
-			return NF_ACCEPT;
-		}
-	}
-
-	NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
-		     maniptype == NF_NAT_MANIP_DST);
-	BUG_ON(nf_nat_initialized(ct, maniptype));
-
-	/* What we've got will look like inverse of reply. Normally
-	   this is what is in the conntrack, except for prior
-	   manipulations (future optimization: if num_manips == 0,
-	   orig_tp =
-	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
-	nf_ct_invert_tuplepr(&curr_tuple,
-			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
-	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
-
-	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
-		struct nf_conntrack_tuple reply;
-
-		/* Alter conntrack table so will recognize replies. */
-		nf_ct_invert_tuplepr(&reply, &new_tuple);
-		nf_conntrack_alter_reply(ct, &reply);
-
-		/* Non-atomic: we own this at the moment. */
-		if (maniptype == NF_NAT_MANIP_SRC)
-			ct->status |= IPS_SRC_NAT;
-		else
-			ct->status |= IPS_DST_NAT;
-	}
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		unsigned int srchash;
-
-		srchash = hash_by_src(net, nf_ct_zone(ct),
-				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-		spin_lock_bh(&nf_nat_lock);
-		/* nf_conntrack_alter_reply might re-allocate extension area */
-		nat = nfct_nat(ct);
-		nat->ct = ct;
-		hlist_add_head_rcu(&nat->bysource,
-				   &net->ipv4.nat_bysource[srchash]);
-		spin_unlock_bh(&nf_nat_lock);
-	}
-
-	/* It's done. */
-	if (maniptype == NF_NAT_MANIP_DST)
-		ct->status |= IPS_DST_NAT_DONE;
-	else
-		ct->status |= IPS_SRC_NAT_DONE;
-
-	return NF_ACCEPT;
-}
-EXPORT_SYMBOL(nf_nat_setup_info);
-
-/* Returns true if succeeded. */
-static bool
-manip_pkt(u_int16_t proto,
-	  struct sk_buff *skb,
-	  unsigned int iphdroff,
-	  const struct nf_conntrack_tuple *target,
-	  enum nf_nat_manip_type maniptype)
-{
-	struct iphdr *iph;
-	const struct nf_nat_protocol *p;
-
-	if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
-		return false;
-
-	iph = (void *)skb->data + iphdroff;
-
-	/* Manipulate protcol part. */
-
-	/* rcu_read_lock()ed by nf_hook_slow */
-	p = __nf_nat_proto_find(proto);
-	if (!p->manip_pkt(skb, iphdroff, target, maniptype))
-		return false;
-
-	iph = (void *)skb->data + iphdroff;
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
-		iph->saddr = target->src.u3.ip;
-	} else {
-		csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
-		iph->daddr = target->dst.u3.ip;
-	}
-	return true;
-}
-
-/* Do packet manipulations according to nf_nat_setup_info. */
-unsigned int nf_nat_packet(struct nf_conn *ct,
-			   enum ip_conntrack_info ctinfo,
-			   unsigned int hooknum,
-			   struct sk_buff *skb)
-{
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	unsigned long statusbit;
-	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
-
-	if (mtype == NF_NAT_MANIP_SRC)
-		statusbit = IPS_SRC_NAT;
-	else
-		statusbit = IPS_DST_NAT;
-
-	/* Invert if this is reply dir. */
-	if (dir == IP_CT_DIR_REPLY)
-		statusbit ^= IPS_NAT_MASK;
-
-	/* Non-atomic: these bits don't change. */
-	if (ct->status & statusbit) {
-		struct nf_conntrack_tuple target;
-
-		/* We are aiming to look like inverse of other direction. */
-		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-
-		if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
-			return NF_DROP;
-	}
-	return NF_ACCEPT;
-}
-EXPORT_SYMBOL_GPL(nf_nat_packet);
-
-/* Dir is direction ICMP is coming from (opposite to packet it contains) */
-int nf_nat_icmp_reply_translation(struct nf_conn *ct,
-				  enum ip_conntrack_info ctinfo,
-				  unsigned int hooknum,
-				  struct sk_buff *skb)
-{
-	struct {
-		struct icmphdr icmp;
-		struct iphdr ip;
-	} *inside;
-	struct nf_conntrack_tuple target;
-	int hdrlen = ip_hdrlen(skb);
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	unsigned long statusbit;
-	enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
-
-	if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
-		return 0;
-
-	inside = (void *)skb->data + hdrlen;
-
-	/* We're actually going to mangle it beyond trivial checksum
-	   adjustment, so make sure the current checksum is correct. */
-	if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
-		return 0;
-
-	/* Must be RELATED */
-	NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
-		     skb->nfctinfo == IP_CT_RELATED_REPLY);
-
-	/* Redirects on non-null nats must be dropped, else they'll
-	   start talking to each other without our translation, and be
-	   confused... --RR */
-	if (inside->icmp.type == ICMP_REDIRECT) {
-		/* If NAT isn't finished, assume it and drop. */
-		if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
-			return 0;
-
-		if (ct->status & IPS_NAT_MASK)
-			return 0;
-	}
-
-	if (manip == NF_NAT_MANIP_SRC)
-		statusbit = IPS_SRC_NAT;
-	else
-		statusbit = IPS_DST_NAT;
-
-	/* Invert if this is reply dir. */
-	if (dir == IP_CT_DIR_REPLY)
-		statusbit ^= IPS_NAT_MASK;
-
-	if (!(ct->status & statusbit))
-		return 1;
-
-	pr_debug("icmp_reply_translation: translating error %p manip %u "
-		 "dir %s\n", skb, manip,
-		 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
-
-	/* Change inner back to look like incoming packet.  We do the
-	   opposite manip on this hook to normal, because it might not
-	   pass all hooks (locally-generated ICMP).  Consider incoming
-	   packet: PREROUTING (DST manip), routing produces ICMP, goes
-	   through POSTROUTING (which must correct the DST manip). */
-	if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
-		       &ct->tuplehash[!dir].tuple, !manip))
-		return 0;
-
-	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		/* Reloading "inside" here since manip_pkt inner. */
-		inside = (void *)skb->data + hdrlen;
-		inside->icmp.checksum = 0;
-		inside->icmp.checksum =
-			csum_fold(skb_checksum(skb, hdrlen,
-					       skb->len - hdrlen, 0));
-	}
-
-	/* Change outer to look the reply to an incoming packet
-	 * (proto 0 means don't invert per-proto part). */
-	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-	if (!manip_pkt(0, skb, 0, &target, manip))
-		return 0;
-
-	return 1;
-}
-EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
-
-/* Protocol registration. */
-int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
-{
-	int ret = 0;
-
-	spin_lock_bh(&nf_nat_lock);
-	if (rcu_dereference_protected(
-			nf_nat_protos[proto->protonum],
-			lockdep_is_held(&nf_nat_lock)
-			) != &nf_nat_unknown_protocol) {
-		ret = -EBUSY;
-		goto out;
-	}
-	RCU_INIT_POINTER(nf_nat_protos[proto->protonum], proto);
- out:
-	spin_unlock_bh(&nf_nat_lock);
-	return ret;
-}
-EXPORT_SYMBOL(nf_nat_protocol_register);
-
-/* No one stores the protocol anywhere; simply delete it. */
-void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
-{
-	spin_lock_bh(&nf_nat_lock);
-	RCU_INIT_POINTER(nf_nat_protos[proto->protonum],
-			   &nf_nat_unknown_protocol);
-	spin_unlock_bh(&nf_nat_lock);
-	synchronize_rcu();
-}
-EXPORT_SYMBOL(nf_nat_protocol_unregister);
-
-/* No one using conntrack by the time this called. */
-static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
-{
-	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
-
-	if (nat == NULL || nat->ct == NULL)
-		return;
-
-	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
-
-	spin_lock_bh(&nf_nat_lock);
-	hlist_del_rcu(&nat->bysource);
-	spin_unlock_bh(&nf_nat_lock);
-}
-
-static void nf_nat_move_storage(void *new, void *old)
-{
-	struct nf_conn_nat *new_nat = new;
-	struct nf_conn_nat *old_nat = old;
-	struct nf_conn *ct = old_nat->ct;
-
-	if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
-		return;
-
-	spin_lock_bh(&nf_nat_lock);
-	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
-	spin_unlock_bh(&nf_nat_lock);
-}
-
-static struct nf_ct_ext_type nat_extend __read_mostly = {
-	.len		= sizeof(struct nf_conn_nat),
-	.align		= __alignof__(struct nf_conn_nat),
-	.destroy	= nf_nat_cleanup_conntrack,
-	.move		= nf_nat_move_storage,
-	.id		= NF_CT_EXT_NAT,
-	.flags		= NF_CT_EXT_F_PREALLOC,
-};
-
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
-	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
-	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
-};
-
-static int nfnetlink_parse_nat_proto(struct nlattr *attr,
-				     const struct nf_conn *ct,
-				     struct nf_nat_ipv4_range *range)
-{
-	struct nlattr *tb[CTA_PROTONAT_MAX+1];
-	const struct nf_nat_protocol *npt;
-	int err;
-
-	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
-	if (err < 0)
-		return err;
-
-	rcu_read_lock();
-	npt = __nf_nat_proto_find(nf_ct_protonum(ct));
-	if (npt->nlattr_to_range)
-		err = npt->nlattr_to_range(tb, range);
-	rcu_read_unlock();
-	return err;
-}
-
-static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
-	[CTA_NAT_MINIP]		= { .type = NLA_U32 },
-	[CTA_NAT_MAXIP]		= { .type = NLA_U32 },
-	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
-};
-
-static int
-nfnetlink_parse_nat(const struct nlattr *nat,
-		    const struct nf_conn *ct, struct nf_nat_ipv4_range *range)
-{
-	struct nlattr *tb[CTA_NAT_MAX+1];
-	int err;
-
-	memset(range, 0, sizeof(*range));
-
-	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
-	if (err < 0)
-		return err;
-
-	if (tb[CTA_NAT_MINIP])
-		range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]);
-
-	if (!tb[CTA_NAT_MAXIP])
-		range->max_ip = range->min_ip;
-	else
-		range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
-
-	if (range->min_ip)
-		range->flags |= NF_NAT_RANGE_MAP_IPS;
-
-	if (!tb[CTA_NAT_PROTO])
-		return 0;
-
-	err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
-	if (err < 0)
-		return err;
-
-	return 0;
-}
-
-static int
-nfnetlink_parse_nat_setup(struct nf_conn *ct,
-			  enum nf_nat_manip_type manip,
-			  const struct nlattr *attr)
-{
-	struct nf_nat_ipv4_range range;
-
-	if (nfnetlink_parse_nat(attr, ct, &range) < 0)
-		return -EINVAL;
-	if (nf_nat_initialized(ct, manip))
-		return -EEXIST;
-
-	return nf_nat_setup_info(ct, &range, manip);
-}
-#else
-static int
-nfnetlink_parse_nat_setup(struct nf_conn *ct,
-			  enum nf_nat_manip_type manip,
-			  const struct nlattr *attr)
-{
-	return -EOPNOTSUPP;
-}
-#endif
-
-static int __net_init nf_nat_net_init(struct net *net)
-{
-	/* Leave them the same for the moment. */
-	net->ipv4.nat_htable_size = net->ct.htable_size;
-	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
-	if (!net->ipv4.nat_bysource)
-		return -ENOMEM;
-	return 0;
-}
-
-/* Clear NAT section of all conntracks, in case we're loaded again. */
-static int clean_nat(struct nf_conn *i, void *data)
-{
-	struct nf_conn_nat *nat = nfct_nat(i);
-
-	if (!nat)
-		return 0;
-	memset(nat, 0, sizeof(*nat));
-	i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
-	return 0;
-}
-
-static void __net_exit nf_nat_net_exit(struct net *net)
-{
-	nf_ct_iterate_cleanup(net, &clean_nat, NULL);
-	synchronize_rcu();
-	nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
-}
-
-static struct pernet_operations nf_nat_net_ops = {
-	.init = nf_nat_net_init,
-	.exit = nf_nat_net_exit,
-};
-
-static struct nf_ct_helper_expectfn follow_master_nat = {
-	.name		= "nat-follow-master",
-	.expectfn	= nf_nat_follow_master,
-};
-
-static struct nfq_ct_nat_hook nfq_ct_nat = {
-	.seq_adjust	= nf_nat_tcp_seq_adjust,
-};
-
-static int __init nf_nat_init(void)
-{
-	size_t i;
-	int ret;
-
-	need_ipv4_conntrack();
-
-	ret = nf_ct_extend_register(&nat_extend);
-	if (ret < 0) {
-		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
-		return ret;
-	}
-
-	ret = register_pernet_subsys(&nf_nat_net_ops);
-	if (ret < 0)
-		goto cleanup_extend;
-
-	/* Sew in builtin protocols. */
-	spin_lock_bh(&nf_nat_lock);
-	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
-		RCU_INIT_POINTER(nf_nat_protos[i], &nf_nat_unknown_protocol);
-	RCU_INIT_POINTER(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
-	RCU_INIT_POINTER(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
-	RCU_INIT_POINTER(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
-	spin_unlock_bh(&nf_nat_lock);
-
-	/* Initialize fake conntrack so that NAT will skip it */
-	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
-
-	l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
-
-	nf_ct_helper_expectfn_register(&follow_master_nat);
-
-	BUG_ON(nf_nat_seq_adjust_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
-	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
-	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
-			   nfnetlink_parse_nat_setup);
-	BUG_ON(nf_ct_nat_offset != NULL);
-	RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
-	RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
-	return 0;
-
- cleanup_extend:
-	nf_ct_extend_unregister(&nat_extend);
-	return ret;
-}
-
-static void __exit nf_nat_cleanup(void)
-{
-	unregister_pernet_subsys(&nf_nat_net_ops);
-	nf_ct_l3proto_put(l3proto);
-	nf_ct_extend_unregister(&nat_extend);
-	nf_ct_helper_expectfn_unregister(&follow_master_nat);
-	RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
-	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
-	RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
-	RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
-	synchronize_net();
-}
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("nf-nat-ipv4");
-
-module_init(nf_nat_init);
-module_exit(nf_nat_cleanup);
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index 5589f3af4a8e..dd5e387fc03b 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -15,7 +15,6 @@
 #include <linux/netfilter_ipv4.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <linux/netfilter/nf_conntrack_ftp.h>
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index d2c228db38b5..9c3db10b22d3 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -15,7 +15,6 @@
 
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <linux/netfilter/nf_conntrack_h323.h>
@@ -392,7 +391,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 static void ip_nat_q931_expect(struct nf_conn *new,
 			       struct nf_conntrack_expect *this)
 {
-	struct nf_nat_ipv4_range range;
+	struct nf_nat_range range;
 
 	if (this->tuple.src.u3.ip != 0) {	/* Only accept calls from GK */
 		nf_nat_follow_master(new, this);
@@ -404,14 +403,15 @@ static void ip_nat_q931_expect(struct nf_conn *new,
 
 	/* Change src to where master sends to */
 	range.flags = NF_NAT_RANGE_MAP_IPS;
-	range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
+	range.min_addr = range.max_addr =
+	    new->tuplehash[!this->dir].tuple.src.u3;
 	nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
 
 	/* For DST manip, map port here to where it's expected. */
 	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
-	range.min = range.max = this->saved_proto;
-	range.min_ip = range.max_ip =
-	    new->master->tuplehash[!this->dir].tuple.src.u3.ip;
+	range.min_proto = range.max_proto = this->saved_proto;
+	range.min_addr = range.max_addr =
+	    new->master->tuplehash[!this->dir].tuple.src.u3;
 	nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
 }
 
@@ -490,20 +490,21 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 static void ip_nat_callforwarding_expect(struct nf_conn *new,
 					 struct nf_conntrack_expect *this)
 {
-	struct nf_nat_ipv4_range range;
+	struct nf_nat_range range;
 
 	/* This must be a fresh one. */
 	BUG_ON(new->status & IPS_NAT_DONE_MASK);
 
 	/* Change src to where master sends to */
 	range.flags = NF_NAT_RANGE_MAP_IPS;
-	range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
+	range.min_addr = range.max_addr =
+	    new->tuplehash[!this->dir].tuple.src.u3;
 	nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
 
 	/* For DST manip, map port here to where it's expected. */
 	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
-	range.min = range.max = this->saved_proto;
-	range.min_ip = range.max_ip = this->saved_ip;
+	range.min_proto = range.max_proto = this->saved_proto;
+	range.min_addr = range.max_addr = this->saved_addr;
 	nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
 }
 
@@ -519,7 +520,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
 	u_int16_t nated_port;
 
 	/* Set expectations for NAT */
-	exp->saved_ip = exp->tuple.dst.u3.ip;
+	exp->saved_addr = exp->tuple.dst.u3;
 	exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
 	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
 	exp->expectfn = ip_nat_callforwarding_expect;
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
deleted file mode 100644
index 2fefec5e757c..000000000000
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ /dev/null
@@ -1,461 +0,0 @@
-/* ip_nat_helper.c - generic support functions for NAT helpers
- *
- * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
- * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/gfp.h>
-#include <linux/kmod.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/skbuff.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <net/checksum.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_protocol.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_helper.h>
-
-#define DUMP_OFFSET(x) \
-	pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
-		 x->offset_before, x->offset_after, x->correction_pos);
-
-static DEFINE_SPINLOCK(nf_nat_seqofs_lock);
-
-/* Setup TCP sequence correction given this change at this sequence */
-static inline void
-adjust_tcp_sequence(u32 seq,
-		    int sizediff,
-		    struct nf_conn *ct,
-		    enum ip_conntrack_info ctinfo)
-{
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	struct nf_conn_nat *nat = nfct_nat(ct);
-	struct nf_nat_seq *this_way = &nat->seq[dir];
-
-	pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
-		 seq, sizediff);
-
-	pr_debug("adjust_tcp_sequence: Seq_offset before: ");
-	DUMP_OFFSET(this_way);
-
-	spin_lock_bh(&nf_nat_seqofs_lock);
-
-	/* SYN adjust. If it's uninitialized, or this is after last
-	 * correction, record it: we don't handle more than one
-	 * adjustment in the window, but do deal with common case of a
-	 * retransmit */
-	if (this_way->offset_before == this_way->offset_after ||
-	    before(this_way->correction_pos, seq)) {
-		this_way->correction_pos = seq;
-		this_way->offset_before = this_way->offset_after;
-		this_way->offset_after += sizediff;
-	}
-	spin_unlock_bh(&nf_nat_seqofs_lock);
-
-	pr_debug("adjust_tcp_sequence: Seq_offset after: ");
-	DUMP_OFFSET(this_way);
-}
-
-/* Get the offset value, for conntrack */
-s16 nf_nat_get_offset(const struct nf_conn *ct,
-		      enum ip_conntrack_dir dir,
-		      u32 seq)
-{
-	struct nf_conn_nat *nat = nfct_nat(ct);
-	struct nf_nat_seq *this_way;
-	s16 offset;
-
-	if (!nat)
-		return 0;
-
-	this_way = &nat->seq[dir];
-	spin_lock_bh(&nf_nat_seqofs_lock);
-	offset = after(seq, this_way->correction_pos)
-		 ? this_way->offset_after : this_way->offset_before;
-	spin_unlock_bh(&nf_nat_seqofs_lock);
-
-	return offset;
-}
-EXPORT_SYMBOL_GPL(nf_nat_get_offset);
-
-/* Frobs data inside this packet, which is linear. */
-static void mangle_contents(struct sk_buff *skb,
-			    unsigned int dataoff,
-			    unsigned int match_offset,
-			    unsigned int match_len,
-			    const char *rep_buffer,
-			    unsigned int rep_len)
-{
-	unsigned char *data;
-
-	BUG_ON(skb_is_nonlinear(skb));
-	data = skb_network_header(skb) + dataoff;
-
-	/* move post-replacement */
-	memmove(data + match_offset + rep_len,
-		data + match_offset + match_len,
-		skb->tail - (skb->network_header + dataoff +
-			     match_offset + match_len));
-
-	/* insert data from buffer */
-	memcpy(data + match_offset, rep_buffer, rep_len);
-
-	/* update skb info */
-	if (rep_len > match_len) {
-		pr_debug("nf_nat_mangle_packet: Extending packet by "
-			 "%u from %u bytes\n", rep_len - match_len, skb->len);
-		skb_put(skb, rep_len - match_len);
-	} else {
-		pr_debug("nf_nat_mangle_packet: Shrinking packet from "
-			 "%u from %u bytes\n", match_len - rep_len, skb->len);
-		__skb_trim(skb, skb->len + rep_len - match_len);
-	}
-
-	/* fix IP hdr checksum information */
-	ip_hdr(skb)->tot_len = htons(skb->len);
-	ip_send_check(ip_hdr(skb));
-}
-
-/* Unusual, but possible case. */
-static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
-{
-	if (skb->len + extra > 65535)
-		return 0;
-
-	if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC))
-		return 0;
-
-	return 1;
-}
-
-void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
-			   __be32 seq, s16 off)
-{
-	if (!off)
-		return;
-	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
-	adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
-	nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
-}
-EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
-
-void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
-			   u32 ctinfo, int off)
-{
-	const struct tcphdr *th;
-
-	if (nf_ct_protonum(ct) != IPPROTO_TCP)
-		return;
-
-	th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb));
-	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
-}
-EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
-
-static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
-			int datalen, __sum16 *check, int oldlen)
-{
-	struct rtable *rt = skb_rtable(skb);
-
-	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  iph->ihl * 4;
-			skb->csum_offset = (void *)check - data;
-			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-						    datalen, iph->protocol, 0);
-		} else {
-			*check = 0;
-			*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-						   datalen, iph->protocol,
-						   csum_partial(data, datalen,
-								0));
-			if (iph->protocol == IPPROTO_UDP && !*check)
-				*check = CSUM_MANGLED_0;
-		}
-	} else
-		inet_proto_csum_replace2(check, skb,
-					 htons(oldlen), htons(datalen), 1);
-}
-
-/* Generic function for mangling variable-length address changes inside
- * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
- * command in FTP).
- *
- * Takes care about all the nasty sequence number changes, checksumming,
- * skb enlargement, ...
- *
- * */
-int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
-			       struct nf_conn *ct,
-			       enum ip_conntrack_info ctinfo,
-			       unsigned int protoff,
-			       unsigned int match_offset,
-			       unsigned int match_len,
-			       const char *rep_buffer,
-			       unsigned int rep_len, bool adjust)
-{
-	struct iphdr *iph;
-	struct tcphdr *tcph;
-	int oldlen, datalen;
-
-	if (!skb_make_writable(skb, skb->len))
-		return 0;
-
-	if (rep_len > match_len &&
-	    rep_len - match_len > skb_tailroom(skb) &&
-	    !enlarge_skb(skb, rep_len - match_len))
-		return 0;
-
-	SKB_LINEAR_ASSERT(skb);
-
-	iph = ip_hdr(skb);
-	tcph = (void *)iph + iph->ihl*4;
-
-	oldlen = skb->len - iph->ihl*4;
-	mangle_contents(skb, iph->ihl*4 + tcph->doff*4,
-			match_offset, match_len, rep_buffer, rep_len);
-
-	datalen = skb->len - iph->ihl*4;
-	nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
-
-	if (adjust && rep_len != match_len)
-		nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
-				      (int)rep_len - (int)match_len);
-
-	return 1;
-}
-EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
-
-/* Generic function for mangling variable-length address changes inside
- * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
- * command in the Amanda protocol)
- *
- * Takes care about all the nasty sequence number changes, checksumming,
- * skb enlargement, ...
- *
- * XXX - This function could be merged with nf_nat_mangle_tcp_packet which
- *       should be fairly easy to do.
- */
-int
-nf_nat_mangle_udp_packet(struct sk_buff *skb,
-			 struct nf_conn *ct,
-			 enum ip_conntrack_info ctinfo,
-			 unsigned int protoff,
-			 unsigned int match_offset,
-			 unsigned int match_len,
-			 const char *rep_buffer,
-			 unsigned int rep_len)
-{
-	struct iphdr *iph;
-	struct udphdr *udph;
-	int datalen, oldlen;
-
-	if (!skb_make_writable(skb, skb->len))
-		return 0;
-
-	if (rep_len > match_len &&
-	    rep_len - match_len > skb_tailroom(skb) &&
-	    !enlarge_skb(skb, rep_len - match_len))
-		return 0;
-
-	iph = ip_hdr(skb);
-	udph = (void *)iph + iph->ihl*4;
-
-	oldlen = skb->len - iph->ihl*4;
-	mangle_contents(skb, iph->ihl*4 + sizeof(*udph),
-			match_offset, match_len, rep_buffer, rep_len);
-
-	/* update the length of the UDP packet */
-	datalen = skb->len - iph->ihl*4;
-	udph->len = htons(datalen);
-
-	/* fix udp checksum if udp checksum was previously calculated */
-	if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
-		return 1;
-
-	nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
-
-	return 1;
-}
-EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
-
-/* Adjust one found SACK option including checksum correction */
-static void
-sack_adjust(struct sk_buff *skb,
-	    struct tcphdr *tcph,
-	    unsigned int sackoff,
-	    unsigned int sackend,
-	    struct nf_nat_seq *natseq)
-{
-	while (sackoff < sackend) {
-		struct tcp_sack_block_wire *sack;
-		__be32 new_start_seq, new_end_seq;
-
-		sack = (void *)skb->data + sackoff;
-		if (after(ntohl(sack->start_seq) - natseq->offset_before,
-			  natseq->correction_pos))
-			new_start_seq = htonl(ntohl(sack->start_seq)
-					- natseq->offset_after);
-		else
-			new_start_seq = htonl(ntohl(sack->start_seq)
-					- natseq->offset_before);
-
-		if (after(ntohl(sack->end_seq) - natseq->offset_before,
-			  natseq->correction_pos))
-			new_end_seq = htonl(ntohl(sack->end_seq)
-				      - natseq->offset_after);
-		else
-			new_end_seq = htonl(ntohl(sack->end_seq)
-				      - natseq->offset_before);
-
-		pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
-			 ntohl(sack->start_seq), new_start_seq,
-			 ntohl(sack->end_seq), new_end_seq);
-
-		inet_proto_csum_replace4(&tcph->check, skb,
-					 sack->start_seq, new_start_seq, 0);
-		inet_proto_csum_replace4(&tcph->check, skb,
-					 sack->end_seq, new_end_seq, 0);
-		sack->start_seq = new_start_seq;
-		sack->end_seq = new_end_seq;
-		sackoff += sizeof(*sack);
-	}
-}
-
-/* TCP SACK sequence number adjustment */
-static inline unsigned int
-nf_nat_sack_adjust(struct sk_buff *skb,
-		   struct tcphdr *tcph,
-		   struct nf_conn *ct,
-		   enum ip_conntrack_info ctinfo)
-{
-	unsigned int dir, optoff, optend;
-	struct nf_conn_nat *nat = nfct_nat(ct);
-
-	optoff = ip_hdrlen(skb) + sizeof(struct tcphdr);
-	optend = ip_hdrlen(skb) + tcph->doff * 4;
-
-	if (!skb_make_writable(skb, optend))
-		return 0;
-
-	dir = CTINFO2DIR(ctinfo);
-
-	while (optoff < optend) {
-		/* Usually: option, length. */
-		unsigned char *op = skb->data + optoff;
-
-		switch (op[0]) {
-		case TCPOPT_EOL:
-			return 1;
-		case TCPOPT_NOP:
-			optoff++;
-			continue;
-		default:
-			/* no partial options */
-			if (optoff + 1 == optend ||
-			    optoff + op[1] > optend ||
-			    op[1] < 2)
-				return 0;
-			if (op[0] == TCPOPT_SACK &&
-			    op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
-			    ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
-				sack_adjust(skb, tcph, optoff+2,
-					    optoff+op[1], &nat->seq[!dir]);
-			optoff += op[1];
-		}
-	}
-	return 1;
-}
-
-/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */
-int
-nf_nat_seq_adjust(struct sk_buff *skb,
-		  struct nf_conn *ct,
-		  enum ip_conntrack_info ctinfo,
-		  unsigned int protoff)
-{
-	struct tcphdr *tcph;
-	int dir;
-	__be32 newseq, newack;
-	s16 seqoff, ackoff;
-	struct nf_conn_nat *nat = nfct_nat(ct);
-	struct nf_nat_seq *this_way, *other_way;
-
-	dir = CTINFO2DIR(ctinfo);
-
-	this_way = &nat->seq[dir];
-	other_way = &nat->seq[!dir];
-
-	if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
-		return 0;
-
-	tcph = (void *)skb->data + protoff;
-	if (after(ntohl(tcph->seq), this_way->correction_pos))
-		seqoff = this_way->offset_after;
-	else
-		seqoff = this_way->offset_before;
-
-	if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
-		  other_way->correction_pos))
-		ackoff = other_way->offset_after;
-	else
-		ackoff = other_way->offset_before;
-
-	newseq = htonl(ntohl(tcph->seq) + seqoff);
-	newack = htonl(ntohl(tcph->ack_seq) - ackoff);
-
-	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
-	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
-
-	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
-		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
-		 ntohl(newack));
-
-	tcph->seq = newseq;
-	tcph->ack_seq = newack;
-
-	return nf_nat_sack_adjust(skb, tcph, ct, ctinfo);
-}
-
-/* Setup NAT on this expected conntrack so it follows master. */
-/* If we fail to get a free NAT slot, we'll get dropped on confirm */
-void nf_nat_follow_master(struct nf_conn *ct,
-			  struct nf_conntrack_expect *exp)
-{
-	struct nf_nat_ipv4_range range;
-
-	/* This must be a fresh one. */
-	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
-
-	/* Change src to where master sends to */
-	range.flags = NF_NAT_RANGE_MAP_IPS;
-	range.min_ip = range.max_ip
-		= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
-	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
-
-	/* For DST manip, map port here to where it's expected. */
-	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
-	range.min = range.max = exp->saved_proto;
-	range.min_ip = range.max_ip
-		= ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
-	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
-}
-EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index 5b0c20a1f08d..1ce37f89ec78 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -17,7 +17,6 @@
 
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <linux/netfilter/nf_conntrack_irc.h>
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
new file mode 100644
index 000000000000..d8b2e14efddc
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -0,0 +1,281 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/secure_seq.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
+
+#ifdef CONFIG_XFRM
+static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
+				       const struct nf_conn *ct,
+				       enum ip_conntrack_dir dir,
+				       unsigned long statusbit,
+				       struct flowi *fl)
+{
+	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
+	struct flowi4 *fl4 = &fl->u.ip4;
+
+	if (ct->status & statusbit) {
+		fl4->daddr = t->dst.u3.ip;
+		if (t->dst.protonum == IPPROTO_TCP ||
+		    t->dst.protonum == IPPROTO_UDP ||
+		    t->dst.protonum == IPPROTO_UDPLITE ||
+		    t->dst.protonum == IPPROTO_DCCP ||
+		    t->dst.protonum == IPPROTO_SCTP)
+			fl4->fl4_dport = t->dst.u.all;
+	}
+
+	statusbit ^= IPS_NAT_MASK;
+
+	if (ct->status & statusbit) {
+		fl4->saddr = t->src.u3.ip;
+		if (t->dst.protonum == IPPROTO_TCP ||
+		    t->dst.protonum == IPPROTO_UDP ||
+		    t->dst.protonum == IPPROTO_UDPLITE ||
+		    t->dst.protonum == IPPROTO_DCCP ||
+		    t->dst.protonum == IPPROTO_SCTP)
+			fl4->fl4_sport = t->src.u.all;
+	}
+}
+#endif /* CONFIG_XFRM */
+
+static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
+				 const struct nf_nat_range *range)
+{
+	return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
+	       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
+}
+
+static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
+				   __be16 dport)
+{
+	return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
+}
+
+static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
+				  unsigned int iphdroff,
+				  const struct nf_nat_l4proto *l4proto,
+				  const struct nf_conntrack_tuple *target,
+				  enum nf_nat_manip_type maniptype)
+{
+	struct iphdr *iph;
+	unsigned int hdroff;
+
+	if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
+		return false;
+
+	iph = (void *)skb->data + iphdroff;
+	hdroff = iphdroff + iph->ihl * 4;
+
+	if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
+				target, maniptype))
+		return false;
+	iph = (void *)skb->data + iphdroff;
+
+	if (maniptype == NF_NAT_MANIP_SRC) {
+		csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
+		iph->saddr = target->src.u3.ip;
+	} else {
+		csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
+		iph->daddr = target->dst.u3.ip;
+	}
+	return true;
+}
+
+static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
+				    unsigned int iphdroff, __sum16 *check,
+				    const struct nf_conntrack_tuple *t,
+				    enum nf_nat_manip_type maniptype)
+{
+	struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+	__be32 oldip, newip;
+
+	if (maniptype == NF_NAT_MANIP_SRC) {
+		oldip = iph->saddr;
+		newip = t->src.u3.ip;
+	} else {
+		oldip = iph->daddr;
+		newip = t->dst.u3.ip;
+	}
+	inet_proto_csum_replace4(check, skb, oldip, newip, 1);
+}
+
+static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
+				    u8 proto, void *data, __sum16 *check,
+				    int datalen, int oldlen)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt = skb_rtable(skb);
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		if (!(rt->rt_flags & RTCF_LOCAL) &&
+		    (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
+			skb->ip_summed = CHECKSUM_PARTIAL;
+			skb->csum_start = skb_headroom(skb) +
+					  skb_network_offset(skb) +
+					  ip_hdrlen(skb);
+			skb->csum_offset = (void *)check - data;
+			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+						    datalen, proto, 0);
+		} else {
+			*check = 0;
+			*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+						   datalen, proto,
+						   csum_partial(data, datalen,
+								0));
+			if (proto == IPPROTO_UDP && !*check)
+				*check = CSUM_MANGLED_0;
+		}
+	} else
+		inet_proto_csum_replace2(check, skb,
+					 htons(oldlen), htons(datalen), 1);
+}
+
+static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
+				       struct nf_nat_range *range)
+{
+	if (tb[CTA_NAT_V4_MINIP]) {
+		range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
+		range->flags |= NF_NAT_RANGE_MAP_IPS;
+	}
+
+	if (tb[CTA_NAT_V4_MAXIP])
+		range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
+	else
+		range->max_addr.ip = range->min_addr.ip;
+
+	return 0;
+}
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
+	.l3proto		= NFPROTO_IPV4,
+	.in_range		= nf_nat_ipv4_in_range,
+	.secure_port		= nf_nat_ipv4_secure_port,
+	.manip_pkt		= nf_nat_ipv4_manip_pkt,
+	.csum_update		= nf_nat_ipv4_csum_update,
+	.csum_recalc		= nf_nat_ipv4_csum_recalc,
+	.nlattr_to_range	= nf_nat_ipv4_nlattr_to_range,
+#ifdef CONFIG_XFRM
+	.decode_session		= nf_nat_ipv4_decode_session,
+#endif
+};
+
+int nf_nat_icmp_reply_translation(struct sk_buff *skb,
+				  struct nf_conn *ct,
+				  enum ip_conntrack_info ctinfo,
+				  unsigned int hooknum)
+{
+	struct {
+		struct icmphdr	icmp;
+		struct iphdr	ip;
+	} *inside;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+	unsigned int hdrlen = ip_hdrlen(skb);
+	const struct nf_nat_l4proto *l4proto;
+	struct nf_conntrack_tuple target;
+	unsigned long statusbit;
+
+	NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
+
+	if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+		return 0;
+	if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
+		return 0;
+
+	inside = (void *)skb->data + hdrlen;
+	if (inside->icmp.type == ICMP_REDIRECT) {
+		if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+			return 0;
+		if (ct->status & IPS_NAT_MASK)
+			return 0;
+	}
+
+	if (manip == NF_NAT_MANIP_SRC)
+		statusbit = IPS_SRC_NAT;
+	else
+		statusbit = IPS_DST_NAT;
+
+	/* Invert if this is reply direction */
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+
+	if (!(ct->status & statusbit))
+		return 1;
+
+	l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
+	if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
+				   l4proto, &ct->tuplehash[!dir].tuple, !manip))
+		return 0;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		/* Reloading "inside" here since manip_pkt may reallocate */
+		inside = (void *)skb->data + hdrlen;
+		inside->icmp.checksum = 0;
+		inside->icmp.checksum =
+			csum_fold(skb_checksum(skb, hdrlen,
+					       skb->len - hdrlen, 0));
+	}
+
+	/* Change outer to look like the reply to an incoming packet */
+	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+	l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
+	if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
+
+static int __init nf_nat_l3proto_ipv4_init(void)
+{
+	int err;
+
+	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+	if (err < 0)
+		goto err1;
+	err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
+	if (err < 0)
+		goto err2;
+	return err;
+
+err2:
+	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+err1:
+	return err;
+}
+
+static void __exit nf_nat_l3proto_ipv4_exit(void)
+{
+	nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
+	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
+
+module_init(nf_nat_l3proto_ipv4_init);
+module_exit(nf_nat_l3proto_ipv4_exit);
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 31ef890d894b..a06d7d74817d 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -22,7 +22,6 @@
 
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_zones.h>
@@ -47,7 +46,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 	struct nf_conntrack_tuple t;
 	const struct nf_ct_pptp_master *ct_pptp_info;
 	const struct nf_nat_pptp *nat_pptp_info;
-	struct nf_nat_ipv4_range range;
+	struct nf_nat_range range;
 
 	ct_pptp_info = nfct_help_data(master);
 	nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
@@ -89,21 +88,21 @@ static void pptp_nat_expected(struct nf_conn *ct,
 
 	/* Change src to where master sends to */
 	range.flags = NF_NAT_RANGE_MAP_IPS;
-	range.min_ip = range.max_ip
-		= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+	range.min_addr = range.max_addr
+		= ct->master->tuplehash[!exp->dir].tuple.dst.u3;
 	if (exp->dir == IP_CT_DIR_ORIGINAL) {
 		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-		range.min = range.max = exp->saved_proto;
+		range.min_proto = range.max_proto = exp->saved_proto;
 	}
 	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 
 	/* For DST manip, map port here to where it's expected. */
 	range.flags = NF_NAT_RANGE_MAP_IPS;
-	range.min_ip = range.max_ip
-		= ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
+	range.min_addr = range.max_addr
+		= ct->master->tuplehash[!exp->dir].tuple.src.u3;
 	if (exp->dir == IP_CT_DIR_REPLY) {
 		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-		range.min = range.max = exp->saved_proto;
+		range.min_proto = range.max_proto = exp->saved_proto;
 	}
 	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
 }
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
deleted file mode 100644
index 9993bc93e102..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/random.h>
-#include <linux/ip.h>
-
-#include <linux/netfilter.h>
-#include <linux/export.h>
-#include <net/secure_seq.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
-			   enum nf_nat_manip_type maniptype,
-			   const union nf_conntrack_man_proto *min,
-			   const union nf_conntrack_man_proto *max)
-{
-	__be16 port;
-
-	if (maniptype == NF_NAT_MANIP_SRC)
-		port = tuple->src.u.all;
-	else
-		port = tuple->dst.u.all;
-
-	return ntohs(port) >= ntohs(min->all) &&
-	       ntohs(port) <= ntohs(max->all);
-}
-EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
-
-void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
-			       const struct nf_nat_ipv4_range *range,
-			       enum nf_nat_manip_type maniptype,
-			       const struct nf_conn *ct,
-			       u_int16_t *rover)
-{
-	unsigned int range_size, min, i;
-	__be16 *portptr;
-	u_int16_t off;
-
-	if (maniptype == NF_NAT_MANIP_SRC)
-		portptr = &tuple->src.u.all;
-	else
-		portptr = &tuple->dst.u.all;
-
-	/* If no range specified... */
-	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
-		/* If it's dst rewrite, can't change port */
-		if (maniptype == NF_NAT_MANIP_DST)
-			return;
-
-		if (ntohs(*portptr) < 1024) {
-			/* Loose convention: >> 512 is credential passing */
-			if (ntohs(*portptr) < 512) {
-				min = 1;
-				range_size = 511 - min + 1;
-			} else {
-				min = 600;
-				range_size = 1023 - min + 1;
-			}
-		} else {
-			min = 1024;
-			range_size = 65535 - 1024 + 1;
-		}
-	} else {
-		min = ntohs(range->min.all);
-		range_size = ntohs(range->max.all) - min + 1;
-	}
-
-	if (range->flags & NF_NAT_RANGE_PROTO_RANDOM)
-		off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip,
-						 maniptype == NF_NAT_MANIP_SRC
-						 ? tuple->dst.u.all
-						 : tuple->src.u.all);
-	else
-		off = *rover;
-
-	for (i = 0; ; ++off) {
-		*portptr = htons(min + off % range_size);
-		if (++i != range_size && nf_nat_used_tuple(tuple, ct))
-			continue;
-		if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM))
-			*rover = off;
-		return;
-	}
-	return;
-}
-EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
-
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
-				 struct nf_nat_ipv4_range *range)
-{
-	if (tb[CTA_PROTONAT_PORT_MIN]) {
-		range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
-		range->max.all = range->min.tcp.port;
-		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-	}
-	if (tb[CTA_PROTONAT_PORT_MAX]) {
-		range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
-		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range);
-#endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
deleted file mode 100644
index 3f67138d187c..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * DCCP NAT protocol helper
- *
- * Copyright (c) 2005, 2006. 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/dccp.h>
-
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-static u_int16_t dccp_port_rover;
-
-static void
-dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_ipv4_range *range,
-		  enum nf_nat_manip_type maniptype,
-		  const struct nf_conn *ct)
-{
-	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-				  &dccp_port_rover);
-}
-
-static bool
-dccp_manip_pkt(struct sk_buff *skb,
-	       unsigned int iphdroff,
-	       const struct nf_conntrack_tuple *tuple,
-	       enum nf_nat_manip_type maniptype)
-{
-	const struct iphdr *iph = (const void *)(skb->data + iphdroff);
-	struct dccp_hdr *hdr;
-	unsigned int hdroff = iphdroff + iph->ihl * 4;
-	__be32 oldip, newip;
-	__be16 *portptr, oldport, newport;
-	int hdrsize = 8; /* DCCP connection tracking guarantees this much */
-
-	if (skb->len >= hdroff + sizeof(struct dccp_hdr))
-		hdrsize = sizeof(struct dccp_hdr);
-
-	if (!skb_make_writable(skb, hdroff + hdrsize))
-		return false;
-
-	iph = (struct iphdr *)(skb->data + iphdroff);
-	hdr = (struct dccp_hdr *)(skb->data + hdroff);
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		oldip = iph->saddr;
-		newip = tuple->src.u3.ip;
-		newport = tuple->src.u.dccp.port;
-		portptr = &hdr->dccph_sport;
-	} else {
-		oldip = iph->daddr;
-		newip = tuple->dst.u3.ip;
-		newport = tuple->dst.u.dccp.port;
-		portptr = &hdr->dccph_dport;
-	}
-
-	oldport = *portptr;
-	*portptr = newport;
-
-	if (hdrsize < sizeof(*hdr))
-		return true;
-
-	inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1);
-	inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
-				 0);
-	return true;
-}
-
-static const struct nf_nat_protocol nf_nat_protocol_dccp = {
-	.protonum		= IPPROTO_DCCP,
-	.manip_pkt		= dccp_manip_pkt,
-	.in_range		= nf_nat_proto_in_range,
-	.unique_tuple		= dccp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_dccp_init(void)
-{
-	return nf_nat_protocol_register(&nf_nat_protocol_dccp);
-}
-
-static void __exit nf_nat_proto_dccp_fini(void)
-{
-	nf_nat_protocol_unregister(&nf_nat_protocol_dccp);
-}
-
-module_init(nf_nat_proto_dccp_init);
-module_exit(nf_nat_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("DCCP NAT protocol helper");
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 46ba0b9ab985..ea44f02563b5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -28,8 +28,7 @@
 #include <linux/ip.h>
 
 #include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_l4proto.h>
 #include <linux/netfilter/nf_conntrack_proto_gre.h>
 
 MODULE_LICENSE("GPL");
@@ -38,8 +37,9 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
 
 /* generate unique tuple ... */
 static void
-gre_unique_tuple(struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_ipv4_range *range,
+gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
+		 struct nf_conntrack_tuple *tuple,
+		 const struct nf_nat_range *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
@@ -62,8 +62,8 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 		min = 1;
 		range_size = 0xffff;
 	} else {
-		min = ntohs(range->min.gre.key);
-		range_size = ntohs(range->max.gre.key) - min + 1;
+		min = ntohs(range->min_proto.gre.key);
+		range_size = ntohs(range->max_proto.gre.key) - min + 1;
 	}
 
 	pr_debug("min = %u, range_size = %u\n", min, range_size);
@@ -80,14 +80,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 
 /* manipulate a GRE packet according to maniptype */
 static bool
-gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
+gre_manip_pkt(struct sk_buff *skb,
+	      const struct nf_nat_l3proto *l3proto,
+	      unsigned int iphdroff, unsigned int hdroff,
 	      const struct nf_conntrack_tuple *tuple,
 	      enum nf_nat_manip_type maniptype)
 {
 	const struct gre_hdr *greh;
 	struct gre_hdr_pptp *pgreh;
-	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
-	unsigned int hdroff = iphdroff + iph->ihl * 4;
 
 	/* pgreh includes two optional 32bit fields which are not required
 	 * to be there.  That's where the magic '8' comes from */
@@ -117,24 +117,24 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
 	return true;
 }
 
-static const struct nf_nat_protocol gre = {
-	.protonum		= IPPROTO_GRE,
+static const struct nf_nat_l4proto gre = {
+	.l4proto		= IPPROTO_GRE,
 	.manip_pkt		= gre_manip_pkt,
-	.in_range		= nf_nat_proto_in_range,
+	.in_range		= nf_nat_l4proto_in_range,
 	.unique_tuple		= gre_unique_tuple,
 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
 
 static int __init nf_nat_proto_gre_init(void)
 {
-	return nf_nat_protocol_register(&gre);
+	return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
 }
 
 static void __exit nf_nat_proto_gre_fini(void)
 {
-	nf_nat_protocol_unregister(&gre);
+	nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
 }
 
 module_init(nf_nat_proto_gre_init);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index b35172851bae..eb303471bcf6 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -15,8 +15,7 @@
 #include <linux/netfilter.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_l4proto.h>
 
 static bool
 icmp_in_range(const struct nf_conntrack_tuple *tuple,
@@ -29,8 +28,9 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
 }
 
 static void
-icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_ipv4_range *range,
+icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+		  struct nf_conntrack_tuple *tuple,
+		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
@@ -38,13 +38,14 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
 	unsigned int range_size;
 	unsigned int i;
 
-	range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
+	range_size = ntohs(range->max_proto.icmp.id) -
+		     ntohs(range->min_proto.icmp.id) + 1;
 	/* If no range specified... */
 	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
 		range_size = 0xFFFF;
 
 	for (i = 0; ; ++id) {
-		tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
+		tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
 					     (id % range_size));
 		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
 			return;
@@ -54,13 +55,12 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
 
 static bool
 icmp_manip_pkt(struct sk_buff *skb,
-	       unsigned int iphdroff,
+	       const struct nf_nat_l3proto *l3proto,
+	       unsigned int iphdroff, unsigned int hdroff,
 	       const struct nf_conntrack_tuple *tuple,
 	       enum nf_nat_manip_type maniptype)
 {
-	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
 	struct icmphdr *hdr;
-	unsigned int hdroff = iphdroff + iph->ihl*4;
 
 	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
 		return false;
@@ -72,12 +72,12 @@ icmp_manip_pkt(struct sk_buff *skb,
 	return true;
 }
 
-const struct nf_nat_protocol nf_nat_protocol_icmp = {
-	.protonum		= IPPROTO_ICMP,
+const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
+	.l4proto		= IPPROTO_ICMP,
 	.manip_pkt		= icmp_manip_pkt,
 	.in_range		= icmp_in_range,
 	.unique_tuple		= icmp_unique_tuple,
 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
deleted file mode 100644
index 3cce9b6c1c29..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/sctp.h>
-#include <linux/module.h>
-#include <net/sctp/checksum.h>
-
-#include <net/netfilter/nf_nat_protocol.h>
-
-static u_int16_t nf_sctp_port_rover;
-
-static void
-sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_ipv4_range *range,
-		  enum nf_nat_manip_type maniptype,
-		  const struct nf_conn *ct)
-{
-	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-				  &nf_sctp_port_rover);
-}
-
-static bool
-sctp_manip_pkt(struct sk_buff *skb,
-	       unsigned int iphdroff,
-	       const struct nf_conntrack_tuple *tuple,
-	       enum nf_nat_manip_type maniptype)
-{
-	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
-	struct sk_buff *frag;
-	sctp_sctphdr_t *hdr;
-	unsigned int hdroff = iphdroff + iph->ihl*4;
-	__be32 oldip, newip;
-	__be32 crc32;
-
-	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
-		return false;
-
-	iph = (struct iphdr *)(skb->data + iphdroff);
-	hdr = (struct sctphdr *)(skb->data + hdroff);
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		/* Get rid of src ip and src pt */
-		oldip = iph->saddr;
-		newip = tuple->src.u3.ip;
-		hdr->source = tuple->src.u.sctp.port;
-	} else {
-		/* Get rid of dst ip and dst pt */
-		oldip = iph->daddr;
-		newip = tuple->dst.u3.ip;
-		hdr->dest = tuple->dst.u.sctp.port;
-	}
-
-	crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff);
-	skb_walk_frags(skb, frag)
-		crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
-					  crc32);
-	crc32 = sctp_end_cksum(crc32);
-	hdr->checksum = crc32;
-
-	return true;
-}
-
-static const struct nf_nat_protocol nf_nat_protocol_sctp = {
-	.protonum		= IPPROTO_SCTP,
-	.manip_pkt		= sctp_manip_pkt,
-	.in_range		= nf_nat_proto_in_range,
-	.unique_tuple		= sctp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_sctp_init(void)
-{
-	return nf_nat_protocol_register(&nf_nat_protocol_sctp);
-}
-
-static void __exit nf_nat_proto_sctp_exit(void)
-{
-	nf_nat_protocol_unregister(&nf_nat_protocol_sctp);
-}
-
-module_init(nf_nat_proto_sctp_init);
-module_exit(nf_nat_proto_sctp_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SCTP NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
deleted file mode 100644
index 9fb4b4e72bbf..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-#include <net/netfilter/nf_nat_core.h>
-
-static u_int16_t tcp_port_rover;
-
-static void
-tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_ipv4_range *range,
-		 enum nf_nat_manip_type maniptype,
-		 const struct nf_conn *ct)
-{
-	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
-}
-
-static bool
-tcp_manip_pkt(struct sk_buff *skb,
-	      unsigned int iphdroff,
-	      const struct nf_conntrack_tuple *tuple,
-	      enum nf_nat_manip_type maniptype)
-{
-	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
-	struct tcphdr *hdr;
-	unsigned int hdroff = iphdroff + iph->ihl*4;
-	__be32 oldip, newip;
-	__be16 *portptr, newport, oldport;
-	int hdrsize = 8; /* TCP connection tracking guarantees this much */
-
-	/* this could be a inner header returned in icmp packet; in such
-	   cases we cannot update the checksum field since it is outside of
-	   the 8 bytes of transport layer headers we are guaranteed */
-	if (skb->len >= hdroff + sizeof(struct tcphdr))
-		hdrsize = sizeof(struct tcphdr);
-
-	if (!skb_make_writable(skb, hdroff + hdrsize))
-		return false;
-
-	iph = (struct iphdr *)(skb->data + iphdroff);
-	hdr = (struct tcphdr *)(skb->data + hdroff);
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		/* Get rid of src ip and src pt */
-		oldip = iph->saddr;
-		newip = tuple->src.u3.ip;
-		newport = tuple->src.u.tcp.port;
-		portptr = &hdr->source;
-	} else {
-		/* Get rid of dst ip and dst pt */
-		oldip = iph->daddr;
-		newip = tuple->dst.u3.ip;
-		newport = tuple->dst.u.tcp.port;
-		portptr = &hdr->dest;
-	}
-
-	oldport = *portptr;
-	*portptr = newport;
-
-	if (hdrsize < sizeof(*hdr))
-		return true;
-
-	inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
-	inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
-	return true;
-}
-
-const struct nf_nat_protocol nf_nat_protocol_tcp = {
-	.protonum		= IPPROTO_TCP,
-	.manip_pkt		= tcp_manip_pkt,
-	.in_range		= nf_nat_proto_in_range,
-	.unique_tuple		= tcp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
deleted file mode 100644
index 9883336e628f..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/export.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-static u_int16_t udp_port_rover;
-
-static void
-udp_unique_tuple(struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_ipv4_range *range,
-		 enum nf_nat_manip_type maniptype,
-		 const struct nf_conn *ct)
-{
-	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
-}
-
-static bool
-udp_manip_pkt(struct sk_buff *skb,
-	      unsigned int iphdroff,
-	      const struct nf_conntrack_tuple *tuple,
-	      enum nf_nat_manip_type maniptype)
-{
-	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
-	struct udphdr *hdr;
-	unsigned int hdroff = iphdroff + iph->ihl*4;
-	__be32 oldip, newip;
-	__be16 *portptr, newport;
-
-	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
-		return false;
-
-	iph = (struct iphdr *)(skb->data + iphdroff);
-	hdr = (struct udphdr *)(skb->data + hdroff);
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		/* Get rid of src ip and src pt */
-		oldip = iph->saddr;
-		newip = tuple->src.u3.ip;
-		newport = tuple->src.u.udp.port;
-		portptr = &hdr->source;
-	} else {
-		/* Get rid of dst ip and dst pt */
-		oldip = iph->daddr;
-		newip = tuple->dst.u3.ip;
-		newport = tuple->dst.u.udp.port;
-		portptr = &hdr->dest;
-	}
-	if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
-		inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
-		inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
-					 0);
-		if (!hdr->check)
-			hdr->check = CSUM_MANGLED_0;
-	}
-	*portptr = newport;
-	return true;
-}
-
-const struct nf_nat_protocol nf_nat_protocol_udp = {
-	.protonum		= IPPROTO_UDP,
-	.manip_pkt		= udp_manip_pkt,
-	.in_range		= nf_nat_proto_in_range,
-	.unique_tuple		= udp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index d24d10a7beb2..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-static u_int16_t udplite_port_rover;
-
-static void
-udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
-		     const struct nf_nat_ipv4_range *range,
-		     enum nf_nat_manip_type maniptype,
-		     const struct nf_conn *ct)
-{
-	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-				  &udplite_port_rover);
-}
-
-static bool
-udplite_manip_pkt(struct sk_buff *skb,
-		  unsigned int iphdroff,
-		  const struct nf_conntrack_tuple *tuple,
-		  enum nf_nat_manip_type maniptype)
-{
-	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
-	struct udphdr *hdr;
-	unsigned int hdroff = iphdroff + iph->ihl*4;
-	__be32 oldip, newip;
-	__be16 *portptr, newport;
-
-	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
-		return false;
-
-	iph = (struct iphdr *)(skb->data + iphdroff);
-	hdr = (struct udphdr *)(skb->data + hdroff);
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		/* Get rid of src ip and src pt */
-		oldip = iph->saddr;
-		newip = tuple->src.u3.ip;
-		newport = tuple->src.u.udp.port;
-		portptr = &hdr->source;
-	} else {
-		/* Get rid of dst ip and dst pt */
-		oldip = iph->daddr;
-		newip = tuple->dst.u3.ip;
-		newport = tuple->dst.u.udp.port;
-		portptr = &hdr->dest;
-	}
-
-	inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
-	inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
-	if (!hdr->check)
-		hdr->check = CSUM_MANGLED_0;
-
-	*portptr = newport;
-	return true;
-}
-
-static const struct nf_nat_protocol nf_nat_protocol_udplite = {
-	.protonum		= IPPROTO_UDPLITE,
-	.manip_pkt		= udplite_manip_pkt,
-	.in_range		= nf_nat_proto_in_range,
-	.unique_tuple		= udplite_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_udplite_init(void)
-{
-	return nf_nat_protocol_register(&nf_nat_protocol_udplite);
-}
-
-static void __exit nf_nat_proto_udplite_fini(void)
-{
-	nf_nat_protocol_unregister(&nf_nat_protocol_udplite);
-}
-
-module_init(nf_nat_proto_udplite_init);
-module_exit(nf_nat_proto_udplite_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
deleted file mode 100644
index e0afe8112b1c..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/* The "unknown" protocol.  This is what is used for protocols we
- * don't understand.  It's returned by ip_ct_find_proto().
- */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
-			     enum nf_nat_manip_type manip_type,
-			     const union nf_conntrack_man_proto *min,
-			     const union nf_conntrack_man_proto *max)
-{
-	return true;
-}
-
-static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
-				 const struct nf_nat_ipv4_range *range,
-				 enum nf_nat_manip_type maniptype,
-				 const struct nf_conn *ct)
-{
-	/* Sorry: we can't help you; if it's not unique, we can't frob
-	   anything. */
-	return;
-}
-
-static bool
-unknown_manip_pkt(struct sk_buff *skb,
-		  unsigned int iphdroff,
-		  const struct nf_conntrack_tuple *tuple,
-		  enum nf_nat_manip_type maniptype)
-{
-	return true;
-}
-
-const struct nf_nat_protocol nf_nat_unknown_protocol = {
-	.manip_pkt		= unknown_manip_pkt,
-	.in_range		= unknown_in_range,
-	.unique_tuple		= unknown_unique_tuple,
-};
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
deleted file mode 100644
index d2a9dc314e0e..000000000000
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/* Everything about the rules for NAT. */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/module.h>
-#include <linux/kmod.h>
-#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <linux/slab.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <linux/bitops.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_rule.h>
-
-#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
-			 (1 << NF_INET_POST_ROUTING) | \
-			 (1 << NF_INET_LOCAL_OUT) | \
-			 (1 << NF_INET_LOCAL_IN))
-
-static const struct xt_table nat_table = {
-	.name		= "nat",
-	.valid_hooks	= NAT_VALID_HOOKS,
-	.me		= THIS_MODULE,
-	.af		= NFPROTO_IPV4,
-};
-
-/* Source NAT */
-static unsigned int
-ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
-{
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-
-	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
-		     par->hooknum == NF_INET_LOCAL_IN);
-
-	ct = nf_ct_get(skb, &ctinfo);
-
-	/* Connection must be valid and new. */
-	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
-			    ctinfo == IP_CT_RELATED_REPLY));
-	NF_CT_ASSERT(par->out != NULL);
-
-	return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_SRC);
-}
-
-static unsigned int
-ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
-{
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-
-	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
-		     par->hooknum == NF_INET_LOCAL_OUT);
-
-	ct = nf_ct_get(skb, &ctinfo);
-
-	/* Connection must be valid and new. */
-	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
-
-	return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_DST);
-}
-
-static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
-{
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-
-	/* Must be a valid range */
-	if (mr->rangesize != 1) {
-		pr_info("SNAT: multiple ranges no longer supported\n");
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
-{
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-
-	/* Must be a valid range */
-	if (mr->rangesize != 1) {
-		pr_info("DNAT: multiple ranges no longer supported\n");
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static unsigned int
-alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
-{
-	/* Force range to this IP; let proto decide mapping for
-	   per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED).
-	*/
-	struct nf_nat_ipv4_range range;
-
-	range.flags = 0;
-	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
-		 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
-		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
-		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
-
-	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
-}
-
-int nf_nat_rule_find(struct sk_buff *skb,
-		     unsigned int hooknum,
-		     const struct net_device *in,
-		     const struct net_device *out,
-		     struct nf_conn *ct)
-{
-	struct net *net = nf_ct_net(ct);
-	int ret;
-
-	ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
-
-	if (ret == NF_ACCEPT) {
-		if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
-			/* NUL mapping */
-			ret = alloc_null_binding(ct, hooknum);
-	}
-	return ret;
-}
-
-static struct xt_target ipt_snat_reg __read_mostly = {
-	.name		= "SNAT",
-	.target		= ipt_snat_target,
-	.targetsize	= sizeof(struct nf_nat_ipv4_multi_range_compat),
-	.table		= "nat",
-	.hooks		= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
-	.checkentry	= ipt_snat_checkentry,
-	.family		= AF_INET,
-};
-
-static struct xt_target ipt_dnat_reg __read_mostly = {
-	.name		= "DNAT",
-	.target		= ipt_dnat_target,
-	.targetsize	= sizeof(struct nf_nat_ipv4_multi_range_compat),
-	.table		= "nat",
-	.hooks		= (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
-	.checkentry	= ipt_dnat_checkentry,
-	.family		= AF_INET,
-};
-
-static int __net_init nf_nat_rule_net_init(struct net *net)
-{
-	struct ipt_replace *repl;
-
-	repl = ipt_alloc_initial_table(&nat_table);
-	if (repl == NULL)
-		return -ENOMEM;
-	net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
-	kfree(repl);
-	if (IS_ERR(net->ipv4.nat_table))
-		return PTR_ERR(net->ipv4.nat_table);
-	return 0;
-}
-
-static void __net_exit nf_nat_rule_net_exit(struct net *net)
-{
-	ipt_unregister_table(net, net->ipv4.nat_table);
-}
-
-static struct pernet_operations nf_nat_rule_net_ops = {
-	.init = nf_nat_rule_net_init,
-	.exit = nf_nat_rule_net_exit,
-};
-
-int __init nf_nat_rule_init(void)
-{
-	int ret;
-
-	ret = register_pernet_subsys(&nf_nat_rule_net_ops);
-	if (ret != 0)
-		goto out;
-	ret = xt_register_target(&ipt_snat_reg);
-	if (ret != 0)
-		goto unregister_table;
-
-	ret = xt_register_target(&ipt_dnat_reg);
-	if (ret != 0)
-		goto unregister_snat;
-
-	return ret;
-
- unregister_snat:
-	xt_unregister_target(&ipt_snat_reg);
- unregister_table:
-	unregister_pernet_subsys(&nf_nat_rule_net_ops);
- out:
-	return ret;
-}
-
-void nf_nat_rule_cleanup(void)
-{
-	xt_unregister_target(&ipt_dnat_reg);
-	xt_unregister_target(&ipt_snat_reg);
-	unregister_pernet_subsys(&nf_nat_rule_net_ops);
-}
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index df626af8413c..47a47186a791 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -19,7 +19,6 @@
 
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <linux/netfilter/nf_conntrack_sip.h>
@@ -255,15 +254,15 @@ static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
 static void ip_nat_sip_expected(struct nf_conn *ct,
 				struct nf_conntrack_expect *exp)
 {
-	struct nf_nat_ipv4_range range;
+	struct nf_nat_range range;
 
 	/* This must be a fresh one. */
 	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
 
 	/* For DST manip, map port here to where it's expected. */
 	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
-	range.min = range.max = exp->saved_proto;
-	range.min_ip = range.max_ip = exp->saved_ip;
+	range.min_proto = range.max_proto = exp->saved_proto;
+	range.min_addr = range.max_addr = exp->saved_addr;
 	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
 
 	/* Change src to where master sends to, but only if the connection
@@ -271,8 +270,8 @@ static void ip_nat_sip_expected(struct nf_conn *ct,
 	if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip ==
 	    ct->master->tuplehash[exp->dir].tuple.src.u3.ip) {
 		range.flags = NF_NAT_RANGE_MAP_IPS;
-		range.min_ip = range.max_ip
-			= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+		range.min_addr = range.max_addr
+			= ct->master->tuplehash[!exp->dir].tuple.dst.u3;
 		nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 	}
 }
@@ -307,7 +306,7 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
 	else
 		port = ntohs(exp->tuple.dst.u.udp.port);
 
-	exp->saved_ip = exp->tuple.dst.u3.ip;
+	exp->saved_addr = exp->tuple.dst.u3;
 	exp->tuple.dst.u3.ip = newip;
 	exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
 	exp->dir = !dir;
@@ -329,7 +328,7 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
 	if (port == 0)
 		return NF_DROP;
 
-	if (exp->tuple.dst.u3.ip != exp->saved_ip ||
+	if (exp->tuple.dst.u3.ip != exp->saved_addr.ip ||
 	    exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
 		buflen = sprintf(buffer, "%pI4:%u", &newip, port);
 		if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
@@ -485,13 +484,13 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int protoff,
 	else
 		rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
 
-	rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip;
+	rtp_exp->saved_addr = rtp_exp->tuple.dst.u3;
 	rtp_exp->tuple.dst.u3.ip = rtp_addr->ip;
 	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
 	rtp_exp->dir = !dir;
 	rtp_exp->expectfn = ip_nat_sip_expected;
 
-	rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip;
+	rtcp_exp->saved_addr = rtcp_exp->tuple.dst.u3;
 	rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip;
 	rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
 	rtcp_exp->dir = !dir;
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
deleted file mode 100644
index 3828a4229822..000000000000
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ /dev/null
@@ -1,326 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/types.h>
-#include <linux/icmp.h>
-#include <linux/gfp.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <linux/spinlock.h>
-
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_extend.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-#ifdef CONFIG_XFRM
-static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
-{
-	struct flowi4 *fl4 = &fl->u.ip4;
-	const struct nf_conn *ct;
-	const struct nf_conntrack_tuple *t;
-	enum ip_conntrack_info ctinfo;
-	enum ip_conntrack_dir dir;
-	unsigned long statusbit;
-
-	ct = nf_ct_get(skb, &ctinfo);
-	if (ct == NULL)
-		return;
-	dir = CTINFO2DIR(ctinfo);
-	t = &ct->tuplehash[dir].tuple;
-
-	if (dir == IP_CT_DIR_ORIGINAL)
-		statusbit = IPS_DST_NAT;
-	else
-		statusbit = IPS_SRC_NAT;
-
-	if (ct->status & statusbit) {
-		fl4->daddr = t->dst.u3.ip;
-		if (t->dst.protonum == IPPROTO_TCP ||
-		    t->dst.protonum == IPPROTO_UDP ||
-		    t->dst.protonum == IPPROTO_UDPLITE ||
-		    t->dst.protonum == IPPROTO_DCCP ||
-		    t->dst.protonum == IPPROTO_SCTP)
-			fl4->fl4_dport = t->dst.u.tcp.port;
-	}
-
-	statusbit ^= IPS_NAT_MASK;
-
-	if (ct->status & statusbit) {
-		fl4->saddr = t->src.u3.ip;
-		if (t->dst.protonum == IPPROTO_TCP ||
-		    t->dst.protonum == IPPROTO_UDP ||
-		    t->dst.protonum == IPPROTO_UDPLITE ||
-		    t->dst.protonum == IPPROTO_DCCP ||
-		    t->dst.protonum == IPPROTO_SCTP)
-			fl4->fl4_sport = t->src.u.tcp.port;
-	}
-}
-#endif
-
-static unsigned int
-nf_nat_fn(unsigned int hooknum,
-	  struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  int (*okfn)(struct sk_buff *))
-{
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn_nat *nat;
-	/* maniptype == SRC for postrouting. */
-	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
-
-	/* We never see fragments: conntrack defrags on pre-routing
-	   and local-out, and nf_nat_out protects post-routing. */
-	NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
-
-	ct = nf_ct_get(skb, &ctinfo);
-	/* Can't track?  It's not due to stress, or conntrack would
-	   have dropped it.  Hence it's the user's responsibilty to
-	   packet filter it out, or implement conntrack/NAT for that
-	   protocol. 8) --RR */
-	if (!ct)
-		return NF_ACCEPT;
-
-	/* Don't try to NAT if this packet is not conntracked */
-	if (nf_ct_is_untracked(ct))
-		return NF_ACCEPT;
-
-	nat = nfct_nat(ct);
-	if (!nat) {
-		/* NAT module was loaded late. */
-		if (nf_ct_is_confirmed(ct))
-			return NF_ACCEPT;
-		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
-		if (nat == NULL) {
-			pr_debug("failed to add NAT extension\n");
-			return NF_ACCEPT;
-		}
-	}
-
-	switch (ctinfo) {
-	case IP_CT_RELATED:
-	case IP_CT_RELATED_REPLY:
-		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
-			if (!nf_nat_icmp_reply_translation(ct, ctinfo,
-							   hooknum, skb))
-				return NF_DROP;
-			else
-				return NF_ACCEPT;
-		}
-		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
-	case IP_CT_NEW:
-
-		/* Seen it before?  This can happen for loopback, retrans,
-		   or local packets.. */
-		if (!nf_nat_initialized(ct, maniptype)) {
-			unsigned int ret;
-
-			ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
-			if (ret != NF_ACCEPT)
-				return ret;
-		} else
-			pr_debug("Already setup manip %s for ct %p\n",
-				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
-				 ct);
-		break;
-
-	default:
-		/* ESTABLISHED */
-		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
-			     ctinfo == IP_CT_ESTABLISHED_REPLY);
-	}
-
-	return nf_nat_packet(ct, ctinfo, hooknum, skb);
-}
-
-static unsigned int
-nf_nat_in(unsigned int hooknum,
-	  struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  int (*okfn)(struct sk_buff *))
-{
-	unsigned int ret;
-	__be32 daddr = ip_hdr(skb)->daddr;
-
-	ret = nf_nat_fn(hooknum, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    daddr != ip_hdr(skb)->daddr)
-		skb_dst_drop(skb);
-
-	return ret;
-}
-
-static unsigned int
-nf_nat_out(unsigned int hooknum,
-	   struct sk_buff *skb,
-	   const struct net_device *in,
-	   const struct net_device *out,
-	   int (*okfn)(struct sk_buff *))
-{
-#ifdef CONFIG_XFRM
-	const struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-#endif
-	unsigned int ret;
-
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr))
-		return NF_ACCEPT;
-
-	ret = nf_nat_fn(hooknum, skb, in, out, okfn);
-#ifdef CONFIG_XFRM
-	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
-		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
-		if ((ct->tuplehash[dir].tuple.src.u3.ip !=
-		     ct->tuplehash[!dir].tuple.dst.u3.ip) ||
-		    (ct->tuplehash[dir].tuple.src.u.all !=
-		     ct->tuplehash[!dir].tuple.dst.u.all)
-		   )
-			return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP;
-	}
-#endif
-	return ret;
-}
-
-static unsigned int
-nf_nat_local_fn(unsigned int hooknum,
-		struct sk_buff *skb,
-		const struct net_device *in,
-		const struct net_device *out,
-		int (*okfn)(struct sk_buff *))
-{
-	const struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	unsigned int ret;
-
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr))
-		return NF_ACCEPT;
-
-	ret = nf_nat_fn(hooknum, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
-		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
-		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
-		    ct->tuplehash[!dir].tuple.src.u3.ip) {
-			if (ip_route_me_harder(skb, RTN_UNSPEC))
-				ret = NF_DROP;
-		}
-#ifdef CONFIG_XFRM
-		else if (ct->tuplehash[dir].tuple.dst.u.all !=
-			 ct->tuplehash[!dir].tuple.src.u.all)
-			if (ip_xfrm_me_harder(skb))
-				ret = NF_DROP;
-#endif
-	}
-	return ret;
-}
-
-/* We must be after connection tracking and before packet filtering. */
-
-static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
-	/* Before packet filtering, change destination */
-	{
-		.hook		= nf_nat_in,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_PRE_ROUTING,
-		.priority	= NF_IP_PRI_NAT_DST,
-	},
-	/* After packet filtering, change source */
-	{
-		.hook		= nf_nat_out,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP_PRI_NAT_SRC,
-	},
-	/* Before packet filtering, change destination */
-	{
-		.hook		= nf_nat_local_fn,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP_PRI_NAT_DST,
-	},
-	/* After packet filtering, change source */
-	{
-		.hook		= nf_nat_fn,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP_PRI_NAT_SRC,
-	},
-};
-
-static int __init nf_nat_standalone_init(void)
-{
-	int ret = 0;
-
-	need_ipv4_conntrack();
-
-#ifdef CONFIG_XFRM
-	BUG_ON(ip_nat_decode_session != NULL);
-	RCU_INIT_POINTER(ip_nat_decode_session, nat_decode_session);
-#endif
-	ret = nf_nat_rule_init();
-	if (ret < 0) {
-		pr_err("nf_nat_init: can't setup rules.\n");
-		goto cleanup_decode_session;
-	}
-	ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
-	if (ret < 0) {
-		pr_err("nf_nat_init: can't register hooks.\n");
-		goto cleanup_rule_init;
-	}
-	return ret;
-
- cleanup_rule_init:
-	nf_nat_rule_cleanup();
- cleanup_decode_session:
-#ifdef CONFIG_XFRM
-	RCU_INIT_POINTER(ip_nat_decode_session, NULL);
-	synchronize_net();
-#endif
-	return ret;
-}
-
-static void __exit nf_nat_standalone_fini(void)
-{
-	nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
-	nf_nat_rule_cleanup();
-#ifdef CONFIG_XFRM
-	RCU_INIT_POINTER(ip_nat_decode_session, NULL);
-	synchronize_net();
-#endif
-	/* Conntrack caches are unregistered in nf_conntrack_cleanup */
-}
-
-module_init(nf_nat_standalone_init);
-module_exit(nf_nat_standalone_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat");
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index 9dbb8d284f99..ccabbda71a3e 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -11,7 +11,6 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <linux/netfilter/nf_conntrack_tftp.h>
 
 MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
-- 
cgit v1.2.1


From b3f644fc8232ca761da0b5c5ccb6f30b423c4302 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 26 Aug 2012 19:14:14 +0200
Subject: netfilter: ip6tables: add MASQUERADE target

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ipt_MASQUERADE.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 1c3aa28b51ae..5d5d4d1be9c2 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -99,7 +99,8 @@ device_cmp(struct nf_conn *i, void *ifindex)
 
 	if (!nat)
 		return 0;
-
+	if (nf_ct_l3num(i) != NFPROTO_IPV4)
+		return 0;
 	return nat->masq_index == (int)(long)ifindex;
 }
 
-- 
cgit v1.2.1


From d33cbeeb1a46a7dc82fe9f53e40a742ce0c67c79 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 26 Aug 2012 19:14:20 +0200
Subject: netfilter: nf_nat: support IPv6 in FTP NAT helper

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig      |   5 --
 net/ipv4/netfilter/Makefile     |   1 -
 net/ipv4/netfilter/nf_nat_ftp.c | 137 ----------------------------------------
 3 files changed, 143 deletions(-)
 delete mode 100644 net/ipv4/netfilter/nf_nat_ftp.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index b26629681bdb..8c6c6920b57a 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -221,11 +221,6 @@ config NF_NAT_PROTO_GRE
 	tristate
 	depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
 
-config NF_NAT_FTP
-	tristate
-	depends on NF_CONNTRACK && NF_NAT_IPV4
-	default NF_NAT_IPV4 && NF_CONNTRACK_FTP
-
 config NF_NAT_IRC
 	tristate
 	depends on NF_CONNTRACK && NF_NAT_IPV4
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 0ea3acc510e2..4d8a4ad7816e 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -21,7 +21,6 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
 
 # NAT helpers (nf_conntrack)
 obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
-obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
 obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
 obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
 obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
deleted file mode 100644
index dd5e387fc03b..000000000000
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/* FTP extension for TCP NAT alteration. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <linux/netfilter/nf_conntrack_ftp.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
-MODULE_DESCRIPTION("ftp NAT helper");
-MODULE_ALIAS("ip_nat_ftp");
-
-/* FIXME: Time out? --RR */
-
-static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
-			      char *buffer, size_t buflen,
-			      __be32 addr, u16 port)
-{
-	switch (type) {
-	case NF_CT_FTP_PORT:
-	case NF_CT_FTP_PASV:
-		return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
-				((unsigned char *)&addr)[0],
-				((unsigned char *)&addr)[1],
-				((unsigned char *)&addr)[2],
-				((unsigned char *)&addr)[3],
-				port >> 8,
-				port & 0xFF);
-	case NF_CT_FTP_EPRT:
-		return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
-	case NF_CT_FTP_EPSV:
-		return snprintf(buffer, buflen, "|||%u|", port);
-	}
-
-	return 0;
-}
-
-/* So, this packet has hit the connection tracking matching code.
-   Mangle it, and change the expectation to match the new version. */
-static unsigned int nf_nat_ftp(struct sk_buff *skb,
-			       enum ip_conntrack_info ctinfo,
-			       enum nf_ct_ftp_type type,
-			       unsigned int protoff,
-			       unsigned int matchoff,
-			       unsigned int matchlen,
-			       struct nf_conntrack_expect *exp)
-{
-	__be32 newip;
-	u_int16_t port;
-	int dir = CTINFO2DIR(ctinfo);
-	struct nf_conn *ct = exp->master;
-	char buffer[sizeof("|1|255.255.255.255|65535|")];
-	unsigned int buflen;
-
-	pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
-
-	/* Connection will come from wherever this packet goes, hence !dir */
-	newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
-	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
-	exp->dir = !dir;
-
-	/* When you see the packet, we need to NAT it the same as the
-	 * this one. */
-	exp->expectfn = nf_nat_follow_master;
-
-	/* Try to get same port: if not, try to change it. */
-	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
-		int ret;
-
-		exp->tuple.dst.u.tcp.port = htons(port);
-		ret = nf_ct_expect_related(exp);
-		if (ret == 0)
-			break;
-		else if (ret != -EBUSY) {
-			port = 0;
-			break;
-		}
-	}
-
-	if (port == 0)
-		return NF_DROP;
-
-	buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
-	if (!buflen)
-		goto out;
-
-	pr_debug("calling nf_nat_mangle_tcp_packet\n");
-
-	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
-				      matchlen, buffer, buflen))
-		goto out;
-
-	return NF_ACCEPT;
-
-out:
-	nf_ct_unexpect_related(exp);
-	return NF_DROP;
-}
-
-static void __exit nf_nat_ftp_fini(void)
-{
-	RCU_INIT_POINTER(nf_nat_ftp_hook, NULL);
-	synchronize_rcu();
-}
-
-static int __init nf_nat_ftp_init(void)
-{
-	BUG_ON(nf_nat_ftp_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp);
-	return 0;
-}
-
-/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
-static int warn_set(const char *val, struct kernel_param *kp)
-{
-	printk(KERN_INFO KBUILD_MODNAME
-	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
-	return 0;
-}
-module_param_call(ports, warn_set, NULL, NULL, 0);
-
-module_init(nf_nat_ftp_init);
-module_exit(nf_nat_ftp_fini);
-- 
cgit v1.2.1


From ee6eb96673704225164f0ba7462e1973ce10885c Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 26 Aug 2012 19:14:22 +0200
Subject: netfilter: nf_nat: support IPv6 in amanda NAT helper

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig         |  5 ---
 net/ipv4/netfilter/Makefile        |  1 -
 net/ipv4/netfilter/nf_nat_amanda.c | 85 --------------------------------------
 3 files changed, 91 deletions(-)
 delete mode 100644 net/ipv4/netfilter/nf_nat_amanda.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 8c6c6920b57a..52c4a87007aa 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -231,11 +231,6 @@ config NF_NAT_TFTP
 	depends on NF_CONNTRACK && NF_NAT_IPV4
 	default NF_NAT_IPV4 && NF_CONNTRACK_TFTP
 
-config NF_NAT_AMANDA
-	tristate
-	depends on NF_CONNTRACK && NF_NAT_IPV4
-	default NF_NAT_IPV4 && NF_CONNTRACK_AMANDA
-
 config NF_NAT_PPTP
 	tristate
 	depends on NF_CONNTRACK && NF_NAT_IPV4
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 4d8a4ad7816e..8baa496f6a4e 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
 obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
 
 # NAT helpers (nf_conntrack)
-obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
 obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
 obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
 obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
deleted file mode 100644
index 42d337881171..000000000000
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Amanda extension for TCP NAT alteration.
- * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
- * based on a copy of HW's ip_nat_irc.c as well as other modules
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/udp.h>
-
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <linux/netfilter/nf_conntrack_amanda.h>
-
-MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
-MODULE_DESCRIPTION("Amanda NAT helper");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_amanda");
-
-static unsigned int help(struct sk_buff *skb,
-			 enum ip_conntrack_info ctinfo,
-			 unsigned int protoff,
-			 unsigned int matchoff,
-			 unsigned int matchlen,
-			 struct nf_conntrack_expect *exp)
-{
-	char buffer[sizeof("65535")];
-	u_int16_t port;
-	unsigned int ret;
-
-	/* Connection comes from client. */
-	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
-	exp->dir = IP_CT_DIR_ORIGINAL;
-
-	/* When you see the packet, we need to NAT it the same as the
-	 * this one (ie. same IP: it will be TCP and master is UDP). */
-	exp->expectfn = nf_nat_follow_master;
-
-	/* Try to get same port: if not, try to change it. */
-	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
-		int res;
-
-		exp->tuple.dst.u.tcp.port = htons(port);
-		res = nf_ct_expect_related(exp);
-		if (res == 0)
-			break;
-		else if (res != -EBUSY) {
-			port = 0;
-			break;
-		}
-	}
-
-	if (port == 0)
-		return NF_DROP;
-
-	sprintf(buffer, "%u", port);
-	ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
-				       protoff, matchoff, matchlen,
-				       buffer, strlen(buffer));
-	if (ret != NF_ACCEPT)
-		nf_ct_unexpect_related(exp);
-	return ret;
-}
-
-static void __exit nf_nat_amanda_fini(void)
-{
-	RCU_INIT_POINTER(nf_nat_amanda_hook, NULL);
-	synchronize_rcu();
-}
-
-static int __init nf_nat_amanda_init(void)
-{
-	BUG_ON(nf_nat_amanda_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_amanda_hook, help);
-	return 0;
-}
-
-module_init(nf_nat_amanda_init);
-module_exit(nf_nat_amanda_fini);
-- 
cgit v1.2.1


From 9a664821068739dbc8eac13770e28167b46a0c0f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 26 Aug 2012 19:14:25 +0200
Subject: netfilter: nf_nat: support IPv6 in SIP NAT helper

Add IPv6 support to the SIP NAT helper. There are no functional differences
to IPv4 NAT, just different formats for addresses.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig      |   5 -
 net/ipv4/netfilter/Makefile     |   1 -
 net/ipv4/netfilter/nf_nat_sip.c | 580 ----------------------------------------
 3 files changed, 586 deletions(-)
 delete mode 100644 net/ipv4/netfilter/nf_nat_sip.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 52c4a87007aa..30197f8003be 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -242,11 +242,6 @@ config NF_NAT_H323
 	depends on NF_CONNTRACK && NF_NAT_IPV4
 	default NF_NAT_IPV4 && NF_CONNTRACK_H323
 
-config NF_NAT_SIP
-	tristate
-	depends on NF_CONNTRACK && NF_NAT_IPV4
-	default NF_NAT_IPV4 && NF_CONNTRACK_SIP
-
 # mangle + specific targets
 config IP_NF_MANGLE
 	tristate "Packet mangling"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 8baa496f6a4e..8914abffc96d 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -23,7 +23,6 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
 obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
 obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
 obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
-obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
 obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
 obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
deleted file mode 100644
index 47a47186a791..000000000000
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ /dev/null
@@ -1,580 +0,0 @@
-/* SIP extension for NAT alteration.
- *
- * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
- * based on RR's ip_nat_ftp.c and other modules.
- * (C) 2007 United Security Providers
- * (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/ip.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <linux/netfilter/nf_conntrack_sip.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
-MODULE_DESCRIPTION("SIP NAT helper");
-MODULE_ALIAS("ip_nat_sip");
-
-
-static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff,
-				  unsigned int dataoff,
-				  const char **dptr, unsigned int *datalen,
-				  unsigned int matchoff, unsigned int matchlen,
-				  const char *buffer, unsigned int buflen)
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	struct tcphdr *th;
-	unsigned int baseoff;
-
-	if (nf_ct_protonum(ct) == IPPROTO_TCP) {
-		th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
-		baseoff = ip_hdrlen(skb) + th->doff * 4;
-		matchoff += dataoff - baseoff;
-
-		if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
-						protoff, matchoff, matchlen,
-						buffer, buflen, false))
-			return 0;
-	} else {
-		baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
-		matchoff += dataoff - baseoff;
-
-		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
-					      protoff, matchoff, matchlen,
-					      buffer, buflen))
-			return 0;
-	}
-
-	/* Reload data pointer and adjust datalen value */
-	*dptr = skb->data + dataoff;
-	*datalen += buflen - matchlen;
-	return 1;
-}
-
-static int map_addr(struct sk_buff *skb, unsigned int protoff,
-		    unsigned int dataoff,
-		    const char **dptr, unsigned int *datalen,
-		    unsigned int matchoff, unsigned int matchlen,
-		    union nf_inet_addr *addr, __be16 port)
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
-	unsigned int buflen;
-	__be32 newaddr;
-	__be16 newport;
-
-	if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip &&
-	    ct->tuplehash[dir].tuple.src.u.udp.port == port) {
-		newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
-		newport = ct->tuplehash[!dir].tuple.dst.u.udp.port;
-	} else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip &&
-		   ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
-		newaddr = ct->tuplehash[!dir].tuple.src.u3.ip;
-		newport = ct->tuplehash[!dir].tuple.src.u.udp.port;
-	} else
-		return 1;
-
-	if (newaddr == addr->ip && newport == port)
-		return 1;
-
-	buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
-
-	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
-			     matchoff, matchlen, buffer, buflen);
-}
-
-static int map_sip_addr(struct sk_buff *skb, unsigned int protoff,
-			unsigned int dataoff,
-			const char **dptr, unsigned int *datalen,
-			enum sip_header_types type)
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	unsigned int matchlen, matchoff;
-	union nf_inet_addr addr;
-	__be16 port;
-
-	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
-				    &matchoff, &matchlen, &addr, &port) <= 0)
-		return 1;
-	return map_addr(skb, protoff, dataoff, dptr, datalen,
-			matchoff, matchlen, &addr, port);
-}
-
-static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int protoff,
-			       unsigned int dataoff,
-			       const char **dptr, unsigned int *datalen)
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	unsigned int coff, matchoff, matchlen;
-	enum sip_header_types hdr;
-	union nf_inet_addr addr;
-	__be16 port;
-	int request, in_header;
-
-	/* Basic rules: requests and responses. */
-	if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
-		if (ct_sip_parse_request(ct, *dptr, *datalen,
-					 &matchoff, &matchlen,
-					 &addr, &port) > 0 &&
-		    !map_addr(skb, protoff, dataoff, dptr, datalen,
-			      matchoff, matchlen, &addr, port))
-			return NF_DROP;
-		request = 1;
-	} else
-		request = 0;
-
-	if (nf_ct_protonum(ct) == IPPROTO_TCP)
-		hdr = SIP_HDR_VIA_TCP;
-	else
-		hdr = SIP_HDR_VIA_UDP;
-
-	/* Translate topmost Via header and parameters */
-	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
-				    hdr, NULL, &matchoff, &matchlen,
-				    &addr, &port) > 0) {
-		unsigned int olen, matchend, poff, plen, buflen, n;
-		char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
-
-		/* We're only interested in headers related to this
-		 * connection */
-		if (request) {
-			if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip ||
-			    port != ct->tuplehash[dir].tuple.src.u.udp.port)
-				goto next;
-		} else {
-			if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip ||
-			    port != ct->tuplehash[dir].tuple.dst.u.udp.port)
-				goto next;
-		}
-
-		olen = *datalen;
-		if (!map_addr(skb, protoff, dataoff, dptr, datalen,
-			      matchoff, matchlen, &addr, port))
-			return NF_DROP;
-
-		matchend = matchoff + matchlen + *datalen - olen;
-
-		/* The maddr= parameter (RFC 2361) specifies where to send
-		 * the reply. */
-		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
-					       "maddr=", &poff, &plen,
-					       &addr, true) > 0 &&
-		    addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
-		    addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
-			buflen = sprintf(buffer, "%pI4",
-					&ct->tuplehash[!dir].tuple.dst.u3.ip);
-			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
-					   poff, plen, buffer, buflen))
-				return NF_DROP;
-		}
-
-		/* The received= parameter (RFC 2361) contains the address
-		 * from which the server received the request. */
-		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
-					       "received=", &poff, &plen,
-					       &addr, false) > 0 &&
-		    addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
-		    addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
-			buflen = sprintf(buffer, "%pI4",
-					&ct->tuplehash[!dir].tuple.src.u3.ip);
-			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
-					   poff, plen, buffer, buflen))
-				return NF_DROP;
-		}
-
-		/* The rport= parameter (RFC 3581) contains the port number
-		 * from which the server received the request. */
-		if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
-						 "rport=", &poff, &plen,
-						 &n) > 0 &&
-		    htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
-		    htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
-			__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
-			buflen = sprintf(buffer, "%u", ntohs(p));
-			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
-					   poff, plen, buffer, buflen))
-				return NF_DROP;
-		}
-	}
-
-next:
-	/* Translate Contact headers */
-	coff = 0;
-	in_header = 0;
-	while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
-				       SIP_HDR_CONTACT, &in_header,
-				       &matchoff, &matchlen,
-				       &addr, &port) > 0) {
-		if (!map_addr(skb, protoff, dataoff, dptr, datalen,
-			      matchoff, matchlen,
-			      &addr, port))
-			return NF_DROP;
-	}
-
-	if (!map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_FROM) ||
-	    !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO))
-		return NF_DROP;
-
-	return NF_ACCEPT;
-}
-
-static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	const struct tcphdr *th;
-
-	if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
-		return;
-
-	th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
-	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
-}
-
-/* Handles expected signalling connections and media streams */
-static void ip_nat_sip_expected(struct nf_conn *ct,
-				struct nf_conntrack_expect *exp)
-{
-	struct nf_nat_range range;
-
-	/* This must be a fresh one. */
-	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
-
-	/* For DST manip, map port here to where it's expected. */
-	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
-	range.min_proto = range.max_proto = exp->saved_proto;
-	range.min_addr = range.max_addr = exp->saved_addr;
-	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
-
-	/* Change src to where master sends to, but only if the connection
-	 * actually came from the same source. */
-	if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip ==
-	    ct->master->tuplehash[exp->dir].tuple.src.u3.ip) {
-		range.flags = NF_NAT_RANGE_MAP_IPS;
-		range.min_addr = range.max_addr
-			= ct->master->tuplehash[!exp->dir].tuple.dst.u3;
-		nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
-	}
-}
-
-static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
-				      unsigned int dataoff,
-				      const char **dptr, unsigned int *datalen,
-				      struct nf_conntrack_expect *exp,
-				      unsigned int matchoff,
-				      unsigned int matchlen)
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	__be32 newip;
-	u_int16_t port;
-	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
-	unsigned int buflen;
-
-	/* Connection will come from reply */
-	if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip)
-		newip = exp->tuple.dst.u3.ip;
-	else
-		newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
-
-	/* If the signalling port matches the connection's source port in the
-	 * original direction, try to use the destination port in the opposite
-	 * direction. */
-	if (exp->tuple.dst.u.udp.port ==
-	    ct->tuplehash[dir].tuple.src.u.udp.port)
-		port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
-	else
-		port = ntohs(exp->tuple.dst.u.udp.port);
-
-	exp->saved_addr = exp->tuple.dst.u3;
-	exp->tuple.dst.u3.ip = newip;
-	exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
-	exp->dir = !dir;
-	exp->expectfn = ip_nat_sip_expected;
-
-	for (; port != 0; port++) {
-		int ret;
-
-		exp->tuple.dst.u.udp.port = htons(port);
-		ret = nf_ct_expect_related(exp);
-		if (ret == 0)
-			break;
-		else if (ret != -EBUSY) {
-			port = 0;
-			break;
-		}
-	}
-
-	if (port == 0)
-		return NF_DROP;
-
-	if (exp->tuple.dst.u3.ip != exp->saved_addr.ip ||
-	    exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
-		buflen = sprintf(buffer, "%pI4:%u", &newip, port);
-		if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
-				   matchoff, matchlen, buffer, buflen))
-			goto err;
-	}
-	return NF_ACCEPT;
-
-err:
-	nf_ct_unexpect_related(exp);
-	return NF_DROP;
-}
-
-static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
-			      unsigned int dataoff,
-			      const char **dptr, unsigned int *datalen)
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	unsigned int matchoff, matchlen;
-	char buffer[sizeof("65536")];
-	int buflen, c_len;
-
-	/* Get actual SDP length */
-	if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
-				  SDP_HDR_VERSION, SDP_HDR_UNSPEC,
-				  &matchoff, &matchlen) <= 0)
-		return 0;
-	c_len = *datalen - matchoff + strlen("v=");
-
-	/* Now, update SDP length */
-	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH,
-			      &matchoff, &matchlen) <= 0)
-		return 0;
-
-	buflen = sprintf(buffer, "%u", c_len);
-	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
-			     matchoff, matchlen, buffer, buflen);
-}
-
-static int mangle_sdp_packet(struct sk_buff *skb, unsigned int protoff,
-			     unsigned int dataoff,
-			     const char **dptr, unsigned int *datalen,
-			     unsigned int sdpoff,
-			     enum sdp_header_types type,
-			     enum sdp_header_types term,
-			     char *buffer, int buflen)
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	unsigned int matchlen, matchoff;
-
-	if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
-				  &matchoff, &matchlen) <= 0)
-		return -ENOENT;
-	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
-			     matchoff, matchlen, buffer, buflen) ? 0 : -EINVAL;
-}
-
-static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff,
-				    unsigned int dataoff,
-				    const char **dptr, unsigned int *datalen,
-				    unsigned int sdpoff,
-				    enum sdp_header_types type,
-				    enum sdp_header_types term,
-				    const union nf_inet_addr *addr)
-{
-	char buffer[sizeof("nnn.nnn.nnn.nnn")];
-	unsigned int buflen;
-
-	buflen = sprintf(buffer, "%pI4", &addr->ip);
-	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen,
-			      sdpoff, type, term, buffer, buflen))
-		return 0;
-
-	return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
-}
-
-static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int protoff,
-				    unsigned int dataoff,
-				    const char **dptr, unsigned int *datalen,
-				    unsigned int matchoff,
-				    unsigned int matchlen,
-				    u_int16_t port)
-{
-	char buffer[sizeof("nnnnn")];
-	unsigned int buflen;
-
-	buflen = sprintf(buffer, "%u", port);
-	if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
-			   matchoff, matchlen, buffer, buflen))
-		return 0;
-
-	return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
-}
-
-static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int protoff,
-				       unsigned int dataoff,
-				       const char **dptr, unsigned int *datalen,
-				       unsigned int sdpoff,
-				       const union nf_inet_addr *addr)
-{
-	char buffer[sizeof("nnn.nnn.nnn.nnn")];
-	unsigned int buflen;
-
-	/* Mangle session description owner and contact addresses */
-	buflen = sprintf(buffer, "%pI4", &addr->ip);
-	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
-			       SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
-			       buffer, buflen))
-		return 0;
-
-	switch (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
-				  SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
-				  buffer, buflen)) {
-	case 0:
-	/*
-	 * RFC 2327:
-	 *
-	 * Session description
-	 *
-	 * c=* (connection information - not required if included in all media)
-	 */
-	case -ENOENT:
-		break;
-	default:
-		return 0;
-	}
-
-	return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
-}
-
-/* So, this packet has hit the connection tracking matching code.
-   Mangle it, and change the expectation to match the new version. */
-static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int protoff,
-				     unsigned int dataoff,
-				     const char **dptr, unsigned int *datalen,
-				     struct nf_conntrack_expect *rtp_exp,
-				     struct nf_conntrack_expect *rtcp_exp,
-				     unsigned int mediaoff,
-				     unsigned int medialen,
-				     union nf_inet_addr *rtp_addr)
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	u_int16_t port;
-
-	/* Connection will come from reply */
-	if (ct->tuplehash[dir].tuple.src.u3.ip ==
-	    ct->tuplehash[!dir].tuple.dst.u3.ip)
-		rtp_addr->ip = rtp_exp->tuple.dst.u3.ip;
-	else
-		rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
-
-	rtp_exp->saved_addr = rtp_exp->tuple.dst.u3;
-	rtp_exp->tuple.dst.u3.ip = rtp_addr->ip;
-	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
-	rtp_exp->dir = !dir;
-	rtp_exp->expectfn = ip_nat_sip_expected;
-
-	rtcp_exp->saved_addr = rtcp_exp->tuple.dst.u3;
-	rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip;
-	rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
-	rtcp_exp->dir = !dir;
-	rtcp_exp->expectfn = ip_nat_sip_expected;
-
-	/* Try to get same pair of ports: if not, try to change them. */
-	for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
-	     port != 0; port += 2) {
-		int ret;
-
-		rtp_exp->tuple.dst.u.udp.port = htons(port);
-		ret = nf_ct_expect_related(rtp_exp);
-		if (ret == -EBUSY)
-			continue;
-		else if (ret < 0) {
-			port = 0;
-			break;
-		}
-		rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
-		ret = nf_ct_expect_related(rtcp_exp);
-		if (ret == 0)
-			break;
-		else if (ret != -EBUSY) {
-			nf_ct_unexpect_related(rtp_exp);
-			port = 0;
-			break;
-		}
-	}
-
-	if (port == 0)
-		goto err1;
-
-	/* Update media port. */
-	if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
-	    !ip_nat_sdp_port(skb, protoff, dataoff, dptr, datalen,
-			     mediaoff, medialen, port))
-		goto err2;
-
-	return NF_ACCEPT;
-
-err2:
-	nf_ct_unexpect_related(rtp_exp);
-	nf_ct_unexpect_related(rtcp_exp);
-err1:
-	return NF_DROP;
-}
-
-static struct nf_ct_helper_expectfn sip_nat = {
-        .name           = "sip",
-        .expectfn       = ip_nat_sip_expected,
-};
-
-static void __exit nf_nat_sip_fini(void)
-{
-	RCU_INIT_POINTER(nf_nat_sip_hook, NULL);
-	RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, NULL);
-	RCU_INIT_POINTER(nf_nat_sip_expect_hook, NULL);
-	RCU_INIT_POINTER(nf_nat_sdp_addr_hook, NULL);
-	RCU_INIT_POINTER(nf_nat_sdp_port_hook, NULL);
-	RCU_INIT_POINTER(nf_nat_sdp_session_hook, NULL);
-	RCU_INIT_POINTER(nf_nat_sdp_media_hook, NULL);
-	nf_ct_helper_expectfn_unregister(&sip_nat);
-	synchronize_rcu();
-}
-
-static int __init nf_nat_sip_init(void)
-{
-	BUG_ON(nf_nat_sip_hook != NULL);
-	BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
-	BUG_ON(nf_nat_sip_expect_hook != NULL);
-	BUG_ON(nf_nat_sdp_addr_hook != NULL);
-	BUG_ON(nf_nat_sdp_port_hook != NULL);
-	BUG_ON(nf_nat_sdp_session_hook != NULL);
-	BUG_ON(nf_nat_sdp_media_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_sip_hook, ip_nat_sip);
-	RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
-	RCU_INIT_POINTER(nf_nat_sip_expect_hook, ip_nat_sip_expect);
-	RCU_INIT_POINTER(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
-	RCU_INIT_POINTER(nf_nat_sdp_port_hook, ip_nat_sdp_port);
-	RCU_INIT_POINTER(nf_nat_sdp_session_hook, ip_nat_sdp_session);
-	RCU_INIT_POINTER(nf_nat_sdp_media_hook, ip_nat_sdp_media);
-	nf_ct_helper_expectfn_register(&sip_nat);
-	return 0;
-}
-
-module_init(nf_nat_sip_init);
-module_exit(nf_nat_sip_fini);
-- 
cgit v1.2.1


From 5901b6be885e2c9a30fd94803b846b3d33e351dd Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 26 Aug 2012 19:14:27 +0200
Subject: netfilter: nf_nat: support IPv6 in IRC NAT helper

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig      |  5 ---
 net/ipv4/netfilter/Makefile     |  1 -
 net/ipv4/netfilter/nf_nat_irc.c | 99 -----------------------------------------
 3 files changed, 105 deletions(-)
 delete mode 100644 net/ipv4/netfilter/nf_nat_irc.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 30197f8003be..843fe17db20d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -221,11 +221,6 @@ config NF_NAT_PROTO_GRE
 	tristate
 	depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
 
-config NF_NAT_IRC
-	tristate
-	depends on NF_CONNTRACK && NF_NAT_IPV4
-	default NF_NAT_IPV4 && NF_CONNTRACK_IRC
-
 config NF_NAT_TFTP
 	tristate
 	depends on NF_CONNTRACK && NF_NAT_IPV4
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 8914abffc96d..17e649bb98ba 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -21,7 +21,6 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
 
 # NAT helpers (nf_conntrack)
 obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
-obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
 obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
 obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
 obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
deleted file mode 100644
index 1ce37f89ec78..000000000000
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/* IRC extension for TCP NAT alteration.
- *
- * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
- * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
- * based on a copy of RR's ip_nat_ftp.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/tcp.h>
-#include <linux/kernel.h>
-
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <linux/netfilter/nf_conntrack_irc.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("IRC (DCC) NAT helper");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_irc");
-
-static unsigned int help(struct sk_buff *skb,
-			 enum ip_conntrack_info ctinfo,
-			 unsigned int protoff,
-			 unsigned int matchoff,
-			 unsigned int matchlen,
-			 struct nf_conntrack_expect *exp)
-{
-	char buffer[sizeof("4294967296 65635")];
-	u_int32_t ip;
-	u_int16_t port;
-	unsigned int ret;
-
-	/* Reply comes from server. */
-	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
-	exp->dir = IP_CT_DIR_REPLY;
-	exp->expectfn = nf_nat_follow_master;
-
-	/* Try to get same port: if not, try to change it. */
-	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
-		int ret;
-
-		exp->tuple.dst.u.tcp.port = htons(port);
-		ret = nf_ct_expect_related(exp);
-		if (ret == 0)
-			break;
-		else if (ret != -EBUSY) {
-			port = 0;
-			break;
-		}
-	}
-
-	if (port == 0)
-		return NF_DROP;
-
-	ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip);
-	sprintf(buffer, "%u %u", ip, port);
-	pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
-		 buffer, &ip, port);
-
-	ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo,
-				       protoff, matchoff, matchlen, buffer,
-				       strlen(buffer));
-	if (ret != NF_ACCEPT)
-		nf_ct_unexpect_related(exp);
-	return ret;
-}
-
-static void __exit nf_nat_irc_fini(void)
-{
-	RCU_INIT_POINTER(nf_nat_irc_hook, NULL);
-	synchronize_rcu();
-}
-
-static int __init nf_nat_irc_init(void)
-{
-	BUG_ON(nf_nat_irc_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_irc_hook, help);
-	return 0;
-}
-
-/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
-static int warn_set(const char *val, struct kernel_param *kp)
-{
-	printk(KERN_INFO KBUILD_MODNAME
-	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
-	return 0;
-}
-module_param_call(ports, warn_set, NULL, NULL, 0);
-
-module_init(nf_nat_irc_init);
-module_exit(nf_nat_irc_fini);
-- 
cgit v1.2.1


From 320ff567f299ed3f0a2d53906e632a1b0eda5599 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 26 Aug 2012 19:14:29 +0200
Subject: netfilter: nf_nat: support IPv6 in TFTP NAT helper

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig       |  5 ----
 net/ipv4/netfilter/Makefile      |  1 -
 net/ipv4/netfilter/nf_nat_tftp.c | 50 ----------------------------------------
 3 files changed, 56 deletions(-)
 delete mode 100644 net/ipv4/netfilter/nf_nat_tftp.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 843fe17db20d..131e53702e77 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -221,11 +221,6 @@ config NF_NAT_PROTO_GRE
 	tristate
 	depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
 
-config NF_NAT_TFTP
-	tristate
-	depends on NF_CONNTRACK && NF_NAT_IPV4
-	default NF_NAT_IPV4 && NF_CONNTRACK_TFTP
-
 config NF_NAT_PPTP
 	tristate
 	depends on NF_CONNTRACK && NF_NAT_IPV4
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 17e649bb98ba..b7dd18987237 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -23,7 +23,6 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
 obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
 obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
 obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 
 # NAT protocols (nf_nat)
 obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
deleted file mode 100644
index ccabbda71a3e..000000000000
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/udp.h>
-
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <linux/netfilter/nf_conntrack_tftp.h>
-
-MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
-MODULE_DESCRIPTION("TFTP NAT helper");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_tftp");
-
-static unsigned int help(struct sk_buff *skb,
-			 enum ip_conntrack_info ctinfo,
-			 struct nf_conntrack_expect *exp)
-{
-	const struct nf_conn *ct = exp->master;
-
-	exp->saved_proto.udp.port
-		= ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
-	exp->dir = IP_CT_DIR_REPLY;
-	exp->expectfn = nf_nat_follow_master;
-	if (nf_ct_expect_related(exp) != 0)
-		return NF_DROP;
-	return NF_ACCEPT;
-}
-
-static void __exit nf_nat_tftp_fini(void)
-{
-	RCU_INIT_POINTER(nf_nat_tftp_hook, NULL);
-	synchronize_rcu();
-}
-
-static int __init nf_nat_tftp_init(void)
-{
-	BUG_ON(nf_nat_tftp_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_tftp_hook, help);
-	return 0;
-}
-
-module_init(nf_nat_tftp_init);
-module_exit(nf_nat_tftp_fini);
-- 
cgit v1.2.1


From 3f509c689a07a4aa989b426893d8491a7ffcc410 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 29 Aug 2012 15:24:09 +0000
Subject: netfilter: nf_nat_sip: fix incorrect handling of EBUSY for RTCP
 expectation

We're hitting bug while trying to reinsert an already existing
expectation:

kernel BUG at kernel/timer.c:895!
invalid opcode: 0000 [#1] SMP
[...]
Call Trace:
 <IRQ>
 [<ffffffffa0069563>] nf_ct_expect_related_report+0x4a0/0x57a [nf_conntrack]
 [<ffffffff812d423a>] ? in4_pton+0x72/0x131
 [<ffffffffa00ca69e>] ip_nat_sdp_media+0xeb/0x185 [nf_nat_sip]
 [<ffffffffa00b5b9b>] set_expected_rtp_rtcp+0x32d/0x39b [nf_conntrack_sip]
 [<ffffffffa00b5f15>] process_sdp+0x30c/0x3ec [nf_conntrack_sip]
 [<ffffffff8103f1eb>] ? irq_exit+0x9a/0x9c
 [<ffffffffa00ca738>] ? ip_nat_sdp_media+0x185/0x185 [nf_nat_sip]

We have to remove the RTP expectation if the RTCP expectation hits EBUSY
since we keep trying with other ports until we succeed.

Reported-by: Rafal Fitt <rafalf@aplusc.com.pl>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_sip.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 4ad9cf173992..9c87cde28ff8 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -502,7 +502,10 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
 		ret = nf_ct_expect_related(rtcp_exp);
 		if (ret == 0)
 			break;
-		else if (ret != -EBUSY) {
+		else if (ret == -EBUSY) {
+			nf_ct_unexpect_related(rtp_exp);
+			continue;
+		} else if (ret < 0) {
 			nf_ct_unexpect_related(rtp_exp);
 			port = 0;
 			break;
-- 
cgit v1.2.1


From acbb219d5f53821b2d0080d047800410c0420ea1 Mon Sep 17 00:00:00 2001
From: Francesco Ruggeri <fruggeri@aristanetworks.com>
Date: Fri, 24 Aug 2012 07:38:35 +0000
Subject: net: ipv4: ipmr_expire_timer causes crash when removing net namespace

When tearing down a net namespace, ipv4 mr_table structures are freed
without first deactivating their timers. This can result in a crash in
run_timer_softirq.
This patch mimics the corresponding behaviour in ipv6.
Locking and synchronization seem to be adequate.
We are about to kfree mrt, so existing code should already make sure that
no other references to mrt are pending or can be created by incoming traffic.
The functions invoked here do not cause new references to mrt or other
race conditions to be created.
Invoking del_timer_sync guarantees that ipmr_expire_timer is inactive.
Both ipmr_expire_process (whose completion we may have to wait in
del_timer_sync) and mroute_clean_tables internally use mfc_unres_lock
or other synchronizations when needed, and they both only modify mrt.

Tested in Linux 3.4.8.

Signed-off-by: Francesco Ruggeri <fruggeri@aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8eec8f4a0536..ebdf06f938bf 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -124,6 +124,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 static struct kmem_cache *mrt_cachep __read_mostly;
 
 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static void ipmr_free_table(struct mr_table *mrt);
+
 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 			 struct sk_buff *skb, struct mfc_cache *cache,
 			 int local);
@@ -131,6 +133,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 			      struct mfc_cache *c, struct rtmsg *rtm);
+static void mroute_clean_tables(struct mr_table *mrt);
 static void ipmr_expire_process(unsigned long arg);
 
 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -271,7 +274,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 
 	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
 		list_del(&mrt->list);
-		kfree(mrt);
+		ipmr_free_table(mrt);
 	}
 	fib_rules_unregister(net->ipv4.mr_rules_ops);
 }
@@ -299,7 +302,7 @@ static int __net_init ipmr_rules_init(struct net *net)
 
 static void __net_exit ipmr_rules_exit(struct net *net)
 {
-	kfree(net->ipv4.mrt);
+	ipmr_free_table(net->ipv4.mrt);
 }
 #endif
 
@@ -336,6 +339,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	return mrt;
 }
 
+static void ipmr_free_table(struct mr_table *mrt)
+{
+	del_timer_sync(&mrt->ipmr_expire_timer);
+	mroute_clean_tables(mrt);
+	kfree(mrt);
+}
+
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 
 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
-- 
cgit v1.2.1


From c5ae7d41927dbd208c885d4794e30617ad6cdf12 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Aug 2012 12:33:07 +0000
Subject: ipv4: must use rcu protection while calling fib_lookup

Following lockdep splat was reported by Pavel Roskin :

[ 1570.586223] ===============================
[ 1570.586225] [ INFO: suspicious RCU usage. ]
[ 1570.586228] 3.6.0-rc3-wl-main #98 Not tainted
[ 1570.586229] -------------------------------
[ 1570.586231] /home/proski/src/linux/net/ipv4/route.c:645 suspicious rcu_dereference_check() usage!
[ 1570.586233]
[ 1570.586233] other info that might help us debug this:
[ 1570.586233]
[ 1570.586236]
[ 1570.586236] rcu_scheduler_active = 1, debug_locks = 0
[ 1570.586238] 2 locks held by Chrome_IOThread/4467:
[ 1570.586240]  #0:  (slock-AF_INET){+.-...}, at: [<ffffffff814f2c0c>] release_sock+0x2c/0xa0
[ 1570.586253]  #1:  (fnhe_lock){+.-...}, at: [<ffffffff815302fc>] update_or_create_fnhe+0x2c/0x270
[ 1570.586260]
[ 1570.586260] stack backtrace:
[ 1570.586263] Pid: 4467, comm: Chrome_IOThread Not tainted 3.6.0-rc3-wl-main #98
[ 1570.586265] Call Trace:
[ 1570.586271]  [<ffffffff810976ed>] lockdep_rcu_suspicious+0xfd/0x130
[ 1570.586275]  [<ffffffff8153042c>] update_or_create_fnhe+0x15c/0x270
[ 1570.586278]  [<ffffffff815305b3>] __ip_rt_update_pmtu+0x73/0xb0
[ 1570.586282]  [<ffffffff81530619>] ip_rt_update_pmtu+0x29/0x90
[ 1570.586285]  [<ffffffff815411dc>] inet_csk_update_pmtu+0x2c/0x80
[ 1570.586290]  [<ffffffff81558d1e>] tcp_v4_mtu_reduced+0x2e/0xc0
[ 1570.586293]  [<ffffffff81553bc4>] tcp_release_cb+0xa4/0xb0
[ 1570.586296]  [<ffffffff814f2c35>] release_sock+0x55/0xa0
[ 1570.586300]  [<ffffffff815442ef>] tcp_sendmsg+0x4af/0xf50
[ 1570.586305]  [<ffffffff8156fc60>] inet_sendmsg+0x120/0x230
[ 1570.586308]  [<ffffffff8156fb40>] ? inet_sk_rebuild_header+0x40/0x40
[ 1570.586312]  [<ffffffff814f4bdd>] ? sock_update_classid+0xbd/0x3b0
[ 1570.586315]  [<ffffffff814f4c50>] ? sock_update_classid+0x130/0x3b0
[ 1570.586320]  [<ffffffff814ec435>] do_sock_write+0xc5/0xe0
[ 1570.586323]  [<ffffffff814ec4a3>] sock_aio_write+0x53/0x80
[ 1570.586328]  [<ffffffff8114bc83>] do_sync_write+0xa3/0xe0
[ 1570.586332]  [<ffffffff8114c5a5>] vfs_write+0x165/0x180
[ 1570.586335]  [<ffffffff8114c805>] sys_write+0x45/0x90
[ 1570.586340]  [<ffffffff815d2722>] system_call_fastpath+0x16/0x1b

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Pavel Roskin <proski@gnu.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 24fd4c596643..82cf2a722b23 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -934,12 +934,14 @@ static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
+	rcu_read_lock();
 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
 
 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 				      jiffies + ip_rt_mtu_expires);
 	}
+	rcu_read_unlock();
 	return mtu;
 }
 
-- 
cgit v1.2.1


From 98d75c3724f9b37cdfdb85f821e10edcb743959e Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 27 Aug 2012 06:30:01 +0000
Subject: ipv4: Minor logic clean-up in ipv4_mtu

In ipv4_mtu there is some logic where we are testing for a non-zero value
and a timer expiration, then setting the value to zero, and then testing if
the value is zero we set it to a value based on the dst.  Instead of
bothering with the extra steps it is easier to just cleanup the logic so
that we set it to the dst based value if it is zero or if the timer has
expired.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/route.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d2d1e153e7c4..dc9549b5eb1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1134,10 +1134,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	const struct rtable *rt = (const struct rtable *) dst;
 	unsigned int mtu = rt->rt_pmtu;
 
-	if (mtu && time_after_eq(jiffies, rt->dst.expires))
-		mtu = 0;
-
-	if (!mtu)
+	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
 		mtu = dst_metric_raw(dst, RTAX_MTU);
 
 	if (mtu && rt_is_output_route(rt))
-- 
cgit v1.2.1


From 1046716368979dee857a2b8a91c4a8833f21b9cb Mon Sep 17 00:00:00 2001
From: Jerry Chu <hkchu@google.com>
Date: Fri, 31 Aug 2012 12:29:11 +0000
Subject: tcp: TCP Fast Open Server - header & support functions

This patch adds all the necessary data structure and support
functions to implement TFO server side. It also documents a number
of flags for the sysctl_tcp_fastopen knob, and adds a few Linux
extension MIBs.

In addition, it includes the following:

1. a new TCP_FASTOPEN socket option an application must call to
supply a max backlog allowed in order to enable TFO on its listener.

2. A number of key data structures:
"fastopen_rsk" in tcp_sock - for a big socket to access its
request_sock for retransmission and ack processing purpose. It is
non-NULL iff 3WHS not completed.

"fastopenq" in request_sock_queue - points to a per Fast Open
listener data structure "fastopen_queue" to keep track of qlen (# of
outstanding Fast Open requests) and max_qlen, among other things.

"listener" in tcp_request_sock - to point to the original listener
for book-keeping purpose, i.e., to maintain qlen against max_qlen
as part of defense against IP spoofing attack.

3. various data structure and functions, many in tcp_fastopen.c, to
support server side Fast Open cookie operations, including
/proc/sys/net/ipv4/tcp_fastopen_key to allow manual rekeying.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/proc.c            |  4 +++
 net/ipv4/sysctl_net_ipv4.c | 45 +++++++++++++++++++++++++
 net/ipv4/tcp_fastopen.c    | 83 +++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv4/tcp_input.c       |  4 +--
 4 files changed, 133 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 957acd12250b..8de53e1ddd54 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -263,6 +263,10 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
 	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3e78c79b5586..9205e492dc9d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -232,6 +232,45 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
 	return 0;
 }
 
+int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
+			  size_t *lenp, loff_t *ppos)
+{
+	ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
+	struct tcp_fastopen_context *ctxt;
+	int ret;
+	u32  user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+	if (!tbl.data)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	ctxt = rcu_dereference(tcp_fastopen_ctx);
+	if (ctxt)
+		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+	rcu_read_unlock();
+
+	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
+		user_key[0], user_key[1], user_key[2], user_key[3]);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
+			   user_key + 2, user_key + 3) != 4) {
+			ret = -EINVAL;
+			goto bad_key;
+		}
+		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+	}
+
+bad_key:
+	pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+	       user_key[0], user_key[1], user_key[2], user_key[3],
+	       (char *)tbl.data, ret);
+	kfree(tbl.data);
+	return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname	= "tcp_timestamps",
@@ -385,6 +424,12 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_fastopen_key",
+		.mode		= 0600,
+		.maxlen		= ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+		.proc_handler	= proc_tcp_fastopen_key,
+	},
 	{
 		.procname	= "tcp_tw_recycle",
 		.data		= &tcp_death_row.sysctl_tw_recycle,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a7f729c409d7..8f7ef0ad80e5 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,10 +1,91 @@
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <net/inetpeer.h>
+#include <net/tcp.h>
 
-int sysctl_tcp_fastopen;
+int sysctl_tcp_fastopen __read_mostly;
+
+struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+
+static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
+
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+	struct tcp_fastopen_context *ctx =
+	    container_of(head, struct tcp_fastopen_context, rcu);
+	crypto_free_cipher(ctx->tfm);
+	kfree(ctx);
+}
+
+int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+{
+	int err;
+	struct tcp_fastopen_context *ctx, *octx;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
+
+	if (IS_ERR(ctx->tfm)) {
+		err = PTR_ERR(ctx->tfm);
+error:		kfree(ctx);
+		pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
+		return err;
+	}
+	err = crypto_cipher_setkey(ctx->tfm, key, len);
+	if (err) {
+		pr_err("TCP: TFO cipher key error: %d\n", err);
+		crypto_free_cipher(ctx->tfm);
+		goto error;
+	}
+	memcpy(ctx->key, key, len);
+
+	spin_lock(&tcp_fastopen_ctx_lock);
+
+	octx = rcu_dereference_protected(tcp_fastopen_ctx,
+				lockdep_is_held(&tcp_fastopen_ctx_lock));
+	rcu_assign_pointer(tcp_fastopen_ctx, ctx);
+	spin_unlock(&tcp_fastopen_ctx_lock);
+
+	if (octx)
+		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+	return err;
+}
+
+/* Computes the fastopen cookie for the peer.
+ * The peer address is a 128 bits long (pad with zeros for IPv4).
+ *
+ * The caller must check foc->len to determine if a valid cookie
+ * has been generated successfully.
+*/
+void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
+{
+	__be32 peer_addr[4] = { addr, 0, 0, 0 };
+	struct tcp_fastopen_context *ctx;
+
+	rcu_read_lock();
+	ctx = rcu_dereference(tcp_fastopen_ctx);
+	if (ctx) {
+		crypto_cipher_encrypt_one(ctx->tfm,
+					  foc->val,
+					  (__u8 *)peer_addr);
+		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+	}
+	rcu_read_unlock();
+}
 
 static int __init tcp_fastopen_init(void)
 {
+	__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+	get_random_bytes(key, sizeof(key));
+	tcp_fastopen_reset_cipher(key, sizeof(key));
 	return 0;
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ce4ffe9ed556..d47d5fe8f3f0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -378,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 /* 4. Try to fixup all. It is made immediately after connection enters
  *    established state.
  */
-static void tcp_init_buffer_space(struct sock *sk)
+void tcp_init_buffer_space(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int maxwin;
@@ -4038,7 +4038,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
 }
 
 /* When we get a reset we do this. */
-static void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk)
 {
 	/* We want the right error as BSD sees it (and indeed as we do). */
 	switch (sk->sk_state) {
-- 
cgit v1.2.1


From 8336886f786fdacbc19b719c1f7ea91eb70706d4 Mon Sep 17 00:00:00 2001
From: Jerry Chu <hkchu@google.com>
Date: Fri, 31 Aug 2012 12:29:12 +0000
Subject: tcp: TCP Fast Open Server - support TFO listeners

This patch builds on top of the previous patch to add the support
for TFO listeners. This includes -

1. allocating, properly initializing, and managing the per listener
fastopen_queue structure when TFO is enabled

2. changes to the inet_csk_accept code to support TFO. E.g., the
request_sock can no longer be freed upon accept(), not until 3WHS
finishes

3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg()
if it's a TFO socket

4. properly closing a TFO listener, and a TFO socket before 3WHS
finishes

5. supporting TCP_FASTOPEN socket option

6. modifying tcp_check_req() to use to check a TFO socket as well
as request_sock

7. supporting TCP's TFO cookie option

8. adding a new SYN-ACK retransmit handler to use the timer directly
off the TFO socket rather than the listener socket. Note that TFO
server side will not retransmit anything other than SYN-ACK until
the 3WHS is completed.

The patch also contains an important function
"reqsk_fastopen_remove()" to manage the somewhat complex relation
between a listener, its request_sock, and the corresponding child
socket. See the comment above the function for the detail.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c              | 28 ++++++++++++++++++-
 net/ipv4/inet_connection_sock.c | 57 ++++++++++++++++++++++++++++++++++----
 net/ipv4/syncookies.c           |  1 +
 net/ipv4/tcp.c                  | 49 ++++++++++++++++++++++++++++-----
 net/ipv4/tcp_ipv4.c             |  4 +--
 net/ipv4/tcp_minisocks.c        | 61 +++++++++++++++++++++++++++++++++--------
 net/ipv4/tcp_output.c           | 21 ++++++++++----
 net/ipv4/tcp_timer.c            | 39 +++++++++++++++++++++++++-
 8 files changed, 227 insertions(+), 33 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6681ccf5c3ee..4f70ef0b946d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk)
 		pr_err("Attempt to release alive inet socket %p\n", sk);
 		return;
 	}
+	if (sk->sk_type == SOCK_STREAM) {
+		struct fastopen_queue *fastopenq =
+			inet_csk(sk)->icsk_accept_queue.fastopenq;
+		kfree(fastopenq);
+	}
 
 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
+		/* Check special setups for testing purpose to enable TFO w/o
+		 * requiring TCP_FASTOPEN sockopt.
+		 * Note that only TCP sockets (SOCK_STREAM) will reach here.
+		 * Also fastopenq may already been allocated because this
+		 * socket was in TCP_LISTEN state previously but was
+		 * shutdown() (rather than close()).
+		 */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+				err = fastopen_init_queue(sk, backlog);
+			else if ((sysctl_tcp_fastopen &
+				  TFO_SERVER_WO_SOCKOPT2) != 0)
+				err = fastopen_init_queue(sk,
+				    ((uint)sysctl_tcp_fastopen) >> 16);
+			else
+				err = 0;
+			if (err)
+				goto out;
+		}
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
-		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
 	sock_graft(sk2, newsock);
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7f75f21d7b83..8464b79c493f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct sock *newsk;
+	struct request_sock *req;
 	int error;
 
 	lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		goto out_err;
 
 	/* Find already established connection */
-	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+	if (reqsk_queue_empty(queue)) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		if (error)
 			goto out_err;
 	}
-
-	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-	WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+	req = reqsk_queue_remove(queue);
+	newsk = req->sk;
+
+	sk_acceptq_removed(sk);
+	if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
+		spin_lock_bh(&queue->fastopenq->lock);
+		if (tcp_rsk(req)->listener) {
+			/* We are still waiting for the final ACK from 3WHS
+			 * so can't free req now. Instead, we set req->sk to
+			 * NULL to signify that the child socket is taken
+			 * so reqsk_fastopen_remove() will free the req
+			 * when 3WHS finishes (or is aborted).
+			 */
+			req->sk = NULL;
+			req = NULL;
+		}
+		spin_unlock_bh(&queue->fastopenq->lock);
+	}
 out:
 	release_sock(sk);
+	if (req)
+		__reqsk_free(req);
 	return newsk;
 out_err:
 	newsk = NULL;
+	req = NULL;
 	*err = error;
 	goto out;
 }
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct request_sock *acc_req;
 	struct request_sock *req;
 
 	inet_csk_delete_keepalive_timer(sk);
 
 	/* make all the listen_opt local to us */
-	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+	acc_req = reqsk_queue_yank_acceptq(queue);
 
 	/* Following specs, it would be better either to send FIN
 	 * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * To be honest, we are not able to make either
 	 * of the variants now.			--ANK
 	 */
-	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	reqsk_queue_destroy(queue);
 
 	while ((req = acc_req) != NULL) {
 		struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
+		if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
+			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+			BUG_ON(sk != tcp_rsk(req)->listener);
+
+			/* Paranoid, to prevent race condition if
+			 * an inbound pkt destined for child is
+			 * blocked by sock lock in tcp_v4_rcv().
+			 * Also to satisfy an assertion in
+			 * tcp_v4_destroy_sock().
+			 */
+			tcp_sk(child)->fastopen_rsk = NULL;
+			sock_put(sk);
+		}
 		inet_csk_destroy_sock(child);
 
 		bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
 		sk_acceptq_removed(sk);
 		__reqsk_free(req);
 	}
+	if (queue->fastopenq != NULL) {
+		/* Free all the reqs queued in rskq_rst_head. */
+		spin_lock_bh(&queue->fastopenq->lock);
+		acc_req = queue->fastopenq->rskq_rst_head;
+		queue->fastopenq->rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq->lock);
+		while ((req = acc_req) != NULL) {
+			acc_req = req->dl_next;
+			__reqsk_free(req);
+		}
+	}
 	WARN_ON(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650e1528e1e6..ba48e799b031 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+	treq->listener		= NULL;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2109ff4a1daf..df83d744e380 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 
-	/* Connected? */
-	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+	/* Connected or passive Fast Open socket? */
+	if (sk->sk_state != TCP_SYN_SENT &&
+	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
 		int target = sock_rcvlowat(sk, 0, INT_MAX);
 
 		if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto out_err;
+	}
 
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto do_error;
+	}
 
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout)
 		 * they look as CLOSING or LAST_ACK for Linux)
 		 * Probably, I missed some more holelets.
 		 * 						--ANK
+		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+		 * in a single packet! (May consider it later but will
+		 * probably need API support or TCP_CORK SYN-ACK until
+		 * data is written and socket is closed.)
 		 */
 		tcp_send_fin(sk);
 	}
@@ -2215,8 +2230,16 @@ adjudge_to_death:
 		}
 	}
 
-	if (sk->sk_state == TCP_CLOSE)
+	if (sk->sk_state == TCP_CLOSE) {
+		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+		/* We could get here with a non-NULL req if the socket is
+		 * aborted (e.g., closed with unread data) before 3WHS
+		 * finishes.
+		 */
+		if (req != NULL)
+			reqsk_fastopen_remove(sk, req, false);
 		inet_csk_destroy_sock(sk);
+	}
 	/* Otherwise, socket is reprieved until protocol close. */
 
 out:
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			icsk->icsk_user_timeout = msecs_to_jiffies(val);
 		break;
+
+	case TCP_FASTOPEN:
+		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+		    TCPF_LISTEN)))
+			err = fastopen_init_queue(sk, val);
+		else
+			err = -EINVAL;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
 
 void tcp_done(struct sock *sk)
 {
+	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
+	if (req != NULL)
+		reqsk_fastopen_remove(sk, req, false);
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 36f02f954ac1..bb148dee1edd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -839,7 +839,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 		return -1;
 
-	skb = tcp_make_synack(sk, dst, req, rvp);
+	skb = tcp_make_synack(sk, dst, req, rvp, NULL);
 
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -1554,7 +1554,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
 						       iph->saddr, iph->daddr);
 	if (req)
-		return tcp_check_req(sk, skb, req, prev);
+		return tcp_check_req(sk, skb, req, prev, false);
 
 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
 			th->source, iph->daddr, th->dest, inet_iif(skb));
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6ff7f10dce9d..e965319d610b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
+		newtp->fastopen_rsk = NULL;
 
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 EXPORT_SYMBOL(tcp_create_openreq_child);
 
 /*
- *	Process an incoming packet for SYN_RECV sockets represented
- *	as a request_sock.
+ * Process an incoming packet for SYN_RECV sockets represented as a
+ * request_sock. Normally sk is the listener socket but for TFO it
+ * points to the child socket.
+ *
+ * XXX (TFO) - The current impl contains a special check for ack
+ * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
  */
 
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   struct request_sock *req,
-			   struct request_sock **prev)
+			   struct request_sock **prev,
+			   bool fastopen)
 {
 	struct tcp_options_received tmp_opt;
 	const u8 *hash_location;
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
 	bool paws_reject = false;
 
+	BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
+
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
 		tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		 *
 		 * Enforce "SYN-ACK" according to figure 8, figure 6
 		 * of RFC793, fixed by RFC1122.
+		 *
+		 * Note that even if there is new data in the SYN packet
+		 * they will be thrown away too.
 		 */
 		req->rsk_ops->rtx_syn_ack(sk, req, NULL);
 		return NULL;
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	 *                  sent (the segment carries an unacceptable ACK) ...
 	 *                  a reset is sent."
 	 *
-	 * Invalid ACK: reset will be sent by listening socket
+	 * Invalid ACK: reset will be sent by listening socket.
+	 * Note that the ACK validity check for a Fast Open socket is done
+	 * elsewhere and is checked directly against the child socket rather
+	 * than req because user data may have been sent out.
 	 */
-	if ((flg & TCP_FLAG_ACK) &&
+	if ((flg & TCP_FLAG_ACK) && !fastopen &&
 	    (TCP_SKB_CB(skb)->ack_seq !=
 	     tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
 		return sk;
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	/* RFC793: "first check sequence number". */
 
 	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
+					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
 		/* Out of window: send ACK and drop. */
 		if (!(flg & TCP_FLAG_RST))
 			req->rsk_ops->send_ack(sk, skb, req);
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	/* In sequence, PAWS is OK. */
 
-	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
+	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
 		req->ts_recent = tmp_opt.rcv_tsval;
 
 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	/* ACK sequence verified above, just make sure ACK is
 	 * set.  If ACK not set, just silently drop the packet.
+	 *
+	 * XXX (TFO) - if we ever allow "data after SYN", the
+	 * following check needs to be removed.
 	 */
 	if (!(flg & TCP_FLAG_ACK))
 		return NULL;
 
+	/* For Fast Open no more processing is needed (sk is the
+	 * child socket).
+	 */
+	if (fastopen)
+		return sk;
+
 	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
 	if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -706,11 +729,21 @@ listen_overflow:
 	}
 
 embryonic_reset:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
-	if (!(flg & TCP_FLAG_RST))
+	if (!(flg & TCP_FLAG_RST)) {
+		/* Received a bad SYN pkt - for TFO We try not to reset
+		 * the local connection unless it's really necessary to
+		 * avoid becoming vulnerable to outside attack aiming at
+		 * resetting legit local connections.
+		 */
 		req->rsk_ops->send_reset(sk, skb);
-
-	inet_csk_reqsk_queue_drop(sk, req, prev);
+	} else if (fastopen) { /* received a valid RST pkt */
+		reqsk_fastopen_remove(sk, req, true);
+		tcp_reset(sk);
+	}
+	if (!fastopen) {
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+	}
 	return NULL;
 }
 EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req);
  * Queue segment on the new socket if the new socket is active,
  * otherwise we just shortcircuit this and continue with
  * the new socket.
+ *
+ * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
+ * when entering. But other states are possible due to a race condition
+ * where after __inet_lookup_established() fails but before the listener
+ * locked is obtained, other packets cause the same connection to
+ * be created.
  */
 
 int tcp_child_process(struct sock *parent, struct sock *child,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d04632673a9e..9383b51f3efc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
 				   unsigned int mss, struct sk_buff *skb,
 				   struct tcp_out_options *opts,
 				   struct tcp_md5sig_key **md5,
-				   struct tcp_extend_values *xvp)
+				   struct tcp_extend_values *xvp,
+				   struct tcp_fastopen_cookie *foc)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
 		if (unlikely(!ireq->tstamp_ok))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
-
+	if (foc != NULL) {
+		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = foc;
+			remaining -= need;
+		}
+	}
 	/* Similar rationale to tcp_syn_options() applies here, too.
 	 * If the <SYN> options fit, the same options should fit now!
 	 */
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
  */
 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
-				struct request_values *rvp)
+				struct request_values *rvp,
+				struct tcp_fastopen_cookie *foc)
 {
 	struct tcp_out_options opts;
 	struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 #endif
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	tcp_header_size = tcp_synack_options(sk, req, mss,
-					     skb, &opts, &md5, xvp)
+					     skb, &opts, &md5, xvp, foc)
 			+ sizeof(*th);
 
 	skb_push(skb, tcp_header_size);
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	}
 
 	th->seq = htonl(TCP_SKB_CB(skb)->seq);
-	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
+	/* XXX data is queued and acked as is. No buffer/window check */
+	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rcv_wnd, 65535U));
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b774a03bd1dc..fc04711e80c8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -304,6 +304,35 @@ static void tcp_probe_timer(struct sock *sk)
 	}
 }
 
+/*
+ *	Timer for Fast Open socket to retransmit SYNACK. Note that the
+ *	sk here is the child socket, not the parent (listener) socket.
+ */
+static void tcp_fastopen_synack_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int max_retries = icsk->icsk_syn_retries ? :
+	    sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+	struct request_sock *req;
+
+	req = tcp_sk(sk)->fastopen_rsk;
+	req->rsk_ops->syn_ack_timeout(sk, req);
+
+	if (req->retrans >= max_retries) {
+		tcp_write_err(sk);
+		return;
+	}
+	/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
+	 * returned from rtx_syn_ack() to make it more persistent like
+	 * regular retransmit because if the child socket has been accepted
+	 * it's not good to give up too easily.
+	 */
+	req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+	req->retrans++;
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+			  TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
+}
+
 /*
  *	The TCP retransmit timer.
  */
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
 		tcp_resume_early_retransmit(sk);
 		return;
 	}
-
+	if (tp->fastopen_rsk) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		    sk->sk_state != TCP_FIN_WAIT1);
+		tcp_fastopen_synack_timer(sk);
+		/* Before we receive ACK to our SYN-ACK don't retransmit
+		 * anything else (e.g., data or FIN segments).
+		 */
+		return;
+	}
 	if (!tp->packets_out)
 		goto out;
 
-- 
cgit v1.2.1


From 168a8f58059a22feb9e9a2dcc1b8053dbbbc12ef Mon Sep 17 00:00:00 2001
From: Jerry Chu <hkchu@google.com>
Date: Fri, 31 Aug 2012 12:29:13 +0000
Subject: tcp: TCP Fast Open Server - main code path

This patch adds the main processing path to complete the TFO server
patches.

A TFO request (i.e., SYN+data packet with a TFO cookie option) first
gets processed in tcp_v4_conn_request(). If it passes the various TFO
checks by tcp_fastopen_check(), a child socket will be created right
away to be accepted by applications, rather than waiting for the 3WHS
to finish.

In additon to the use of TFO cookie, a simple max_qlen based scheme
is put in place to fend off spoofed TFO attack.

When a valid ACK comes back to tcp_rcv_state_process(), it will cause
the state of the child socket to switch from either TCP_SYN_RECV to
TCP_ESTABLISHED, or TCP_FIN_WAIT1 to TCP_FIN_WAIT2. At this time
retransmission will resume for any unack'ed (data, FIN,...) segments.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c |  71 +++++++++++---
 net/ipv4/tcp_ipv4.c  | 265 ++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 309 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d47d5fe8f3f0..8c304a400798 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3127,6 +3127,12 @@ void tcp_rearm_rto(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	/* If the retrans timer is currently being used by Fast Open
+	 * for SYN-ACK retrans purpose, stay put.
+	 */
+	if (tp->fastopen_rsk)
+		return;
+
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
@@ -5895,7 +5901,9 @@ discard:
 		tcp_send_synack(sk);
 #if 0
 		/* Note, we could accept data and URG from this segment.
-		 * There are no obstacles to make this.
+		 * There are no obstacles to make this (except that we must
+		 * either change tcp_recvmsg() to prevent it from returning data
+		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
 		 *
 		 * However, if we ignore data in ACKless segments sometimes,
 		 * we have no reasons to accept it sometimes.
@@ -5935,6 +5943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *req;
 	int queued = 0;
 
 	tp->rx_opt.saw_tstamp = 0;
@@ -5990,7 +5999,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	}
 
-	if (!tcp_validate_incoming(sk, skb, th, 0))
+	req = tp->fastopen_rsk;
+	if (req != NULL) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		    sk->sk_state != TCP_FIN_WAIT1);
+
+		if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+			goto discard;
+	} else if (!tcp_validate_incoming(sk, skb, th, 0))
 		return 0;
 
 	/* step 5: check the ACK field */
@@ -6000,7 +6016,22 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		switch (sk->sk_state) {
 		case TCP_SYN_RECV:
 			if (acceptable) {
-				tp->copied_seq = tp->rcv_nxt;
+				/* Once we leave TCP_SYN_RECV, we no longer
+				 * need req so release it.
+				 */
+				if (req) {
+					reqsk_fastopen_remove(sk, req, false);
+				} else {
+					/* Make sure socket is routed, for
+					 * correct metrics.
+					 */
+					icsk->icsk_af_ops->rebuild_header(sk);
+					tcp_init_congestion_control(sk);
+
+					tcp_mtup_init(sk);
+					tcp_init_buffer_space(sk);
+					tp->copied_seq = tp->rcv_nxt;
+				}
 				smp_mb();
 				tcp_set_state(sk, TCP_ESTABLISHED);
 				sk->sk_state_change(sk);
@@ -6022,23 +6053,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				if (tp->rx_opt.tstamp_ok)
 					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
-				/* Make sure socket is routed, for
-				 * correct metrics.
-				 */
-				icsk->icsk_af_ops->rebuild_header(sk);
-
-				tcp_init_metrics(sk);
-
-				tcp_init_congestion_control(sk);
+				if (req) {
+					/* Re-arm the timer because data may
+					 * have been sent out. This is similar
+					 * to the regular data transmission case
+					 * when new data has just been ack'ed.
+					 *
+					 * (TFO) - we could try to be more
+					 * aggressive and retranmitting any data
+					 * sooner based on when they were sent
+					 * out.
+					 */
+					tcp_rearm_rto(sk);
+				} else
+					tcp_init_metrics(sk);
 
 				/* Prevent spurious tcp_cwnd_restart() on
 				 * first data packet.
 				 */
 				tp->lsndtime = tcp_time_stamp;
 
-				tcp_mtup_init(sk);
 				tcp_initialize_rcv_mss(sk);
-				tcp_init_buffer_space(sk);
 				tcp_fast_path_on(tp);
 			} else {
 				return 1;
@@ -6046,6 +6081,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			break;
 
 		case TCP_FIN_WAIT1:
+			/* If we enter the TCP_FIN_WAIT1 state and we are a
+			 * Fast Open socket and this is the first acceptable
+			 * ACK we have received, this would have acknowledged
+			 * our SYNACK so stop the SYNACK timer.
+			 */
+			if (acceptable && req != NULL) {
+				/* We no longer need the request sock. */
+				reqsk_fastopen_remove(sk, req, false);
+				tcp_rearm_rto(sk);
+			}
 			if (tp->snd_una == tp->write_seq) {
 				struct dst_entry *dst;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bb148dee1edd..e64abed249cc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	const int code = icmp_hdr(icmp_skb)->code;
 	struct sock *sk;
 	struct sk_buff *skb;
+	struct request_sock *req;
 	__u32 seq;
 	__u32 remaining;
 	int err;
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
 	icsk = inet_csk(sk);
 	tp = tcp_sk(sk);
+	req = tp->fastopen_rsk;
 	seq = ntohl(th->seq);
 	if (sk->sk_state != TCP_LISTEN &&
-	    !between(seq, tp->snd_una, tp->snd_nxt)) {
+	    !between(seq, tp->snd_una, tp->snd_nxt) &&
+	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
+		/* For a Fast Open socket, allow seq to be snt_isn. */
 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 		goto out;
 	}
@@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		    !icsk->icsk_backoff)
 			break;
 
+		/* XXX (TFO) - revisit the following logic for TFO */
+
 		if (sock_owned_by_user(sk))
 			break;
 
@@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		goto out;
 	}
 
+	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
+	 * than following the TCP_SYN_RECV case and closing the socket,
+	 * we ignore the ICMP error and keep trying like a fully established
+	 * socket. Is this the right thing to do?
+	 */
+	if (req && req->sk == NULL)
+		goto out;
+
 	switch (sk->sk_state) {
 		struct request_sock *req, **prev;
 	case TCP_LISTEN:
@@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
 	case TCP_SYN_SENT:
 	case TCP_SYN_RECV:  /* Cannot happen.
-			       It can f.e. if SYNs crossed.
+			       It can f.e. if SYNs crossed,
+			       or Fast Open.
 			     */
 		if (!sock_owned_by_user(sk)) {
 			sk->sk_err = err;
@@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
-	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
-			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
+	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+	 */
+	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 			req->ts_recent,
 			0,
 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -1272,6 +1291,178 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 };
 #endif
 
+static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
+			       struct request_sock *req,
+			       struct tcp_fastopen_cookie *foc,
+			       struct tcp_fastopen_cookie *valid_foc)
+{
+	bool skip_cookie = false;
+	struct fastopen_queue *fastopenq;
+
+	if (likely(!fastopen_cookie_present(foc))) {
+		/* See include/net/tcp.h for the meaning of these knobs */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
+		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
+		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
+			skip_cookie = true; /* no cookie to validate */
+		else
+			return false;
+	}
+	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
+	/* A FO option is present; bump the counter. */
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
+
+	/* Make sure the listener has enabled fastopen, and we don't
+	 * exceed the max # of pending TFO requests allowed before trying
+	 * to validating the cookie in order to avoid burning CPU cycles
+	 * unnecessarily.
+	 *
+	 * XXX (TFO) - The implication of checking the max_qlen before
+	 * processing a cookie request is that clients can't differentiate
+	 * between qlen overflow causing Fast Open to be disabled
+	 * temporarily vs a server not supporting Fast Open at all.
+	 */
+	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
+	    fastopenq == NULL || fastopenq->max_qlen == 0)
+		return false;
+
+	if (fastopenq->qlen >= fastopenq->max_qlen) {
+		struct request_sock *req1;
+		spin_lock(&fastopenq->lock);
+		req1 = fastopenq->rskq_rst_head;
+		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
+			spin_unlock(&fastopenq->lock);
+			NET_INC_STATS_BH(sock_net(sk),
+			    LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
+			foc->len = -1;
+			return false;
+		}
+		fastopenq->rskq_rst_head = req1->dl_next;
+		fastopenq->qlen--;
+		spin_unlock(&fastopenq->lock);
+		reqsk_free(req1);
+	}
+	if (skip_cookie) {
+		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		return true;
+	}
+	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
+		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
+			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
+			    memcmp(&foc->val[0], &valid_foc->val[0],
+			    TCP_FASTOPEN_COOKIE_SIZE) != 0)
+				return false;
+			valid_foc->len = -1;
+		}
+		/* Acknowledge the data received from the peer. */
+		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		return true;
+	} else if (foc->len == 0) { /* Client requesting a cookie */
+		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+		NET_INC_STATS_BH(sock_net(sk),
+		    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+	} else {
+		/* Client sent a cookie with wrong size. Treat it
+		 * the same as invalid and return a valid one.
+		 */
+		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+	}
+	return false;
+}
+
+static int tcp_v4_conn_req_fastopen(struct sock *sk,
+				    struct sk_buff *skb,
+				    struct sk_buff *skb_synack,
+				    struct request_sock *req,
+				    struct request_values *rvp)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct sock *child;
+
+	req->retrans = 0;
+	req->sk = NULL;
+
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+	if (child == NULL) {
+		NET_INC_STATS_BH(sock_net(sk),
+				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+		kfree_skb(skb_synack);
+		return -1;
+	}
+	ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
+			ireq->rmt_addr, ireq->opt);
+	/* XXX (TFO) - is it ok to ignore error and continue? */
+
+	spin_lock(&queue->fastopenq->lock);
+	queue->fastopenq->qlen++;
+	spin_unlock(&queue->fastopenq->lock);
+
+	/* Initialize the child socket. Have to fix some values to take
+	 * into account the child is a Fast Open socket and is created
+	 * only out of the bits carried in the SYN packet.
+	 */
+	tp = tcp_sk(child);
+
+	tp->fastopen_rsk = req;
+	/* Do a hold on the listner sk so that if the listener is being
+	 * closed, the child that has been accepted can live on and still
+	 * access listen_lock.
+	 */
+	sock_hold(sk);
+	tcp_rsk(req)->listener = sk;
+
+	/* RFC1323: The window in SYN & SYN/ACK segments is never
+	 * scaled. So correct it appropriately.
+	 */
+	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+
+	/* Activate the retrans timer so that SYNACK can be retransmitted.
+	 * The request socket is not added to the SYN table of the parent
+	 * because it's been added to the accept queue directly.
+	 */
+	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+	    TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+
+	/* Add the child socket directly into the accept queue */
+	inet_csk_reqsk_queue_add(sk, req, child);
+
+	/* Now finish processing the fastopen child socket. */
+	inet_csk(child)->icsk_af_ops->rebuild_header(child);
+	tcp_init_congestion_control(child);
+	tcp_mtup_init(child);
+	tcp_init_buffer_space(child);
+	tcp_init_metrics(child);
+
+	/* Queue the data carried in the SYN packet. We need to first
+	 * bump skb's refcnt because the caller will attempt to free it.
+	 *
+	 * XXX (TFO) - we honor a zero-payload TFO request for now.
+	 * (Any reason not to?)
+	 */
+	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
+		/* Don't queue the skb if there is no payload in SYN.
+		 * XXX (TFO) - How about SYN+FIN?
+		 */
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+	} else {
+		skb = skb_get(skb);
+		skb_dst_drop(skb);
+		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
+		skb_set_owner_r(skb, child);
+		__skb_queue_tail(&child->sk_receive_queue, skb);
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+	}
+	sk->sk_data_ready(sk, 0);
+	bh_unlock_sock(child);
+	sock_put(child);
+	WARN_ON(req->sk == NULL);
+	return 0;
+}
+
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_extend_values tmp_ext;
@@ -1285,6 +1476,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	__be32 daddr = ip_hdr(skb)->daddr;
 	__u32 isn = TCP_SKB_CB(skb)->when;
 	bool want_cookie = false;
+	struct flowi4 fl4;
+	struct tcp_fastopen_cookie foc = { .len = -1 };
+	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+	struct sk_buff *skb_synack;
+	int do_fastopen;
 
 	/* Never answer to SYNs send to broadcast or multicast */
 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1319,7 +1515,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
+	    want_cookie ? NULL : &foc);
 
 	if (tmp_opt.cookie_plus > 0 &&
 	    tmp_opt.saw_tstamp &&
@@ -1377,8 +1574,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
 		req->cookie_ts = tmp_opt.tstamp_ok;
 	} else if (!isn) {
-		struct flowi4 fl4;
-
 		/* VJ's idea. We save last timestamp seen
 		 * from the destination in peer table, when entering
 		 * state TIME-WAIT, and check against it before
@@ -1419,14 +1614,52 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
-	if (tcp_v4_send_synack(sk, dst, req,
-			       (struct request_values *)&tmp_ext,
-			       skb_get_queue_mapping(skb),
-			       want_cookie) ||
-	    want_cookie)
+	if (dst == NULL) {
+		dst = inet_csk_route_req(sk, &fl4, req);
+		if (dst == NULL)
+			goto drop_and_free;
+	}
+	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
+
+	/* We don't call tcp_v4_send_synack() directly because we need
+	 * to make sure a child socket can be created successfully before
+	 * sending back synack!
+	 *
+	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
+	 * (or better yet, call tcp_send_synack() in the child context
+	 * directly, but will have to fix bunch of other code first)
+	 * after syn_recv_sock() except one will need to first fix the
+	 * latter to remove its dependency on the current implementation
+	 * of tcp_v4_send_synack()->tcp_select_initial_window().
+	 */
+	skb_synack = tcp_make_synack(sk, dst, req,
+	    (struct request_values *)&tmp_ext,
+	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
+
+	if (skb_synack) {
+		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
+		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
+	} else
+		goto drop_and_free;
+
+	if (likely(!do_fastopen)) {
+		int err;
+		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
+		     ireq->rmt_addr, ireq->opt);
+		err = net_xmit_eval(err);
+		if (err || want_cookie)
+			goto drop_and_free;
+
+		tcp_rsk(req)->listener = NULL;
+		/* Add the request_sock to the SYN table */
+		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		if (fastopen_cookie_present(&foc) && foc.len != 0)
+			NET_INC_STATS_BH(sock_net(sk),
+			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
+	    (struct request_values *)&tmp_ext))
 		goto drop_and_free;
 
-	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	return 0;
 
 drop_and_release:
@@ -1977,6 +2210,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 			 tcp_cookie_values_release);
 		tp->cookie_values = NULL;
 	}
+	BUG_ON(tp->fastopen_rsk != NULL);
 
 	/* If socket is aborted during connect operation */
 	tcp_free_fastopen_req(tp);
@@ -2425,6 +2659,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct inet_sock *inet = inet_sk(sk);
+	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
 	__be32 dest = inet->inet_daddr;
 	__be32 src = inet->inet_rcv_saddr;
 	__u16 destp = ntohs(inet->inet_dport);
@@ -2469,7 +2704,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		jiffies_to_clock_t(icsk->icsk_ack.ato),
 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
 		tp->snd_cwnd,
-		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
+		sk->sk_state == TCP_LISTEN ?
+		    (fastopenq ? fastopenq->max_qlen : 0) :
+		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
 		len);
 }
 
-- 
cgit v1.2.1


From ce9f3f31efb88841e4df98794b13dbac8c4901da Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Mon, 18 Jun 2012 00:18:31 +0000
Subject: netfilter: properly annotate ipv4_netfilter_{init,fini}()

Despite being just a few bytes of code, they should still have proper
annotations.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index f1643c0c3587..4c0cf63dd92e 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -188,12 +188,12 @@ static const struct nf_afinfo nf_ip_afinfo = {
 	.route_key_size		= sizeof(struct ip_rt_info),
 };
 
-static int ipv4_netfilter_init(void)
+static int __init ipv4_netfilter_init(void)
 {
 	return nf_register_afinfo(&nf_ip_afinfo);
 }
 
-static void ipv4_netfilter_fini(void)
+static void __exit ipv4_netfilter_fini(void)
 {
 	nf_unregister_afinfo(&nf_ip_afinfo);
 }
-- 
cgit v1.2.1


From 09484d1f6edb3b798c5926a6755f1badb77ece9f Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Sun, 2 Sep 2012 17:38:02 +0000
Subject: tcp: move tcp_enter_cwr()

To prepare replacing rate halving with PRR algorithm in CWR state.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8c304a400798..3ab0c7573a0b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -743,29 +743,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
-/* Set slow start threshold and cwnd not falling to slow start */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-
-	tp->prior_ssthresh = 0;
-	tp->bytes_acked = 0;
-	if (icsk->icsk_ca_state < TCP_CA_CWR) {
-		tp->undo_marker = 0;
-		if (set_ssthresh)
-			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
-		tp->snd_cwnd = min(tp->snd_cwnd,
-				   tcp_packets_in_flight(tp) + 1U);
-		tp->snd_cwnd_cnt = 0;
-		tp->high_seq = tp->snd_nxt;
-		tp->snd_cwnd_stamp = tcp_time_stamp;
-		TCP_ECN_queue_cwr(tp);
-
-		tcp_set_ca_state(sk, TCP_CA_CWR);
-	}
-}
-
 /*
  * Packet counting of FACK is based on in-order assumptions, therefore TCP
  * disables it when reordering is detected
@@ -2741,6 +2718,29 @@ static inline void tcp_complete_cwr(struct sock *sk)
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
+/* Set slow start threshold and cwnd not falling to slow start */
+void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tp->prior_ssthresh = 0;
+	tp->bytes_acked = 0;
+	if (icsk->icsk_ca_state < TCP_CA_CWR) {
+		tp->undo_marker = 0;
+		if (set_ssthresh)
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		tp->snd_cwnd = min(tp->snd_cwnd,
+				   tcp_packets_in_flight(tp) + 1U);
+		tp->snd_cwnd_cnt = 0;
+		tp->high_seq = tp->snd_nxt;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+		TCP_ECN_queue_cwr(tp);
+
+		tcp_set_ca_state(sk, TCP_CA_CWR);
+	}
+}
+
 static void tcp_try_keep_open(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-- 
cgit v1.2.1


From fb4d3d1df31907eadd2e2a745e840921888b346a Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Sun, 2 Sep 2012 17:38:03 +0000
Subject: tcp: move tcp_update_cwnd_in_recovery

To prepare replacing rate halving with PRR algorithm in CWR state.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 64 ++++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ab0c7573a0b..38589e464e63 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2700,6 +2700,38 @@ static bool tcp_try_undo_loss(struct sock *sk)
 	return false;
 }
 
+/* This function implements the PRR algorithm, specifcally the PRR-SSRB
+ * (proportional rate reduction with slow start reduction bound) as described in
+ * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ *   1) If the packets in flight is larger than ssthresh, PRR spreads the
+ *	cwnd reductions across a full RTT.
+ *   2) If packets in flight is lower than ssthresh (such as due to excess
+ *	losses and/or application stalls), do not perform any further cwnd
+ *	reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
+					int fast_rexmit, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int sndcnt = 0;
+	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+
+	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+			       tp->prior_cwnd - 1;
+		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+	} else {
+		sndcnt = min_t(int, delta,
+			       max_t(int, tp->prr_delivered - tp->prr_out,
+				     newly_acked_sacked) + 1);
+	}
+
+	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
+
 static inline void tcp_complete_cwr(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2854,38 +2886,6 @@ void tcp_simple_retransmit(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_simple_retransmit);
 
-/* This function implements the PRR algorithm, specifcally the PRR-SSRB
- * (proportional rate reduction with slow start reduction bound) as described in
- * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
- * It computes the number of packets to send (sndcnt) based on packets newly
- * delivered:
- *   1) If the packets in flight is larger than ssthresh, PRR spreads the
- *	cwnd reductions across a full RTT.
- *   2) If packets in flight is lower than ssthresh (such as due to excess
- *	losses and/or application stalls), do not perform any further cwnd
- *	reductions, but instead slow start up to ssthresh.
- */
-static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
-					int fast_rexmit, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int sndcnt = 0;
-	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
-
-	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
-		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
-			       tp->prior_cwnd - 1;
-		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
-	} else {
-		sndcnt = min_t(int, delta,
-			       max_t(int, tp->prr_delivered - tp->prr_out,
-				     newly_acked_sacked) + 1);
-	}
-
-	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
-	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
-}
-
 static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-- 
cgit v1.2.1


From 684bad1107571d35610a674c61b3544efb5a5b13 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Sun, 2 Sep 2012 17:38:04 +0000
Subject: tcp: use PRR to reduce cwin in CWR state

Use proportional rate reduction (PRR) algorithm to reduce cwnd in CWR state,
in addition to Recovery state. Retire the current rate-halving in CWR.
When losses are detected via ACKs in CWR state, the sender enters Recovery
state but the cwnd reduction continues and does not restart.

Rename and refactor cwnd reduction functions since both CWR and Recovery
use the same algorithm:
tcp_init_cwnd_reduction() is new and initiates reduction state variables.
tcp_cwnd_reduction() is previously tcp_update_cwnd_in_recovery().
tcp_ends_cwnd_reduction() is previously  tcp_complete_cwr().

The rate halving functions and logic such as tcp_cwnd_down(), tcp_min_cwnd(),
and the cwnd moderation inside tcp_enter_cwr() are removed. The unused
parameter, flag, in tcp_cwnd_reduction() is also removed.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c  | 119 +++++++++++++++++---------------------------------
 net/ipv4/tcp_output.c |   6 +--
 2 files changed, 44 insertions(+), 81 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 38589e464e63..e2bec815ff23 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2470,35 +2470,6 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-/* Lower bound on congestion window is slow start threshold
- * unless congestion avoidance choice decides to overide it.
- */
-static inline u32 tcp_cwnd_min(const struct sock *sk)
-{
-	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-
-	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
-}
-
-/* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct sock *sk, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int decr = tp->snd_cwnd_cnt + 1;
-
-	if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
-	    (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
-		tp->snd_cwnd_cnt = decr & 1;
-		decr >>= 1;
-
-		if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
-			tp->snd_cwnd -= decr;
-
-		tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
-		tp->snd_cwnd_stamp = tcp_time_stamp;
-	}
-}
-
 /* Nothing was retransmitted or returned timestamp is less
  * than timestamp of the first retransmission.
  */
@@ -2700,9 +2671,8 @@ static bool tcp_try_undo_loss(struct sock *sk)
 	return false;
 }
 
-/* This function implements the PRR algorithm, specifcally the PRR-SSRB
- * (proportional rate reduction with slow start reduction bound) as described in
- * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
+/* The cwnd reduction in CWR and Recovery use the PRR algorithm
+ * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
  * It computes the number of packets to send (sndcnt) based on packets newly
  * delivered:
  *   1) If the packets in flight is larger than ssthresh, PRR spreads the
@@ -2711,13 +2681,29 @@ static bool tcp_try_undo_loss(struct sock *sk)
  *	losses and/or application stalls), do not perform any further cwnd
  *	reductions, but instead slow start up to ssthresh.
  */
-static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
-					int fast_rexmit, int flag)
+static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->high_seq = tp->snd_nxt;
+	tp->bytes_acked = 0;
+	tp->snd_cwnd_cnt = 0;
+	tp->prior_cwnd = tp->snd_cwnd;
+	tp->prr_delivered = 0;
+	tp->prr_out = 0;
+	if (set_ssthresh)
+		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+	TCP_ECN_queue_cwr(tp);
+}
+
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+			       int fast_rexmit)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int sndcnt = 0;
 	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
 
+	tp->prr_delivered += newly_acked_sacked;
 	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
 		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
 			       tp->prior_cwnd - 1;
@@ -2732,43 +2718,29 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
 	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
 }
 
-static inline void tcp_complete_cwr(struct sock *sk)
+static inline void tcp_end_cwnd_reduction(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* Do not moderate cwnd if it's already undone in cwr or recovery. */
-	if (tp->undo_marker) {
-		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
-			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-			tp->snd_cwnd_stamp = tcp_time_stamp;
-		} else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
-			/* PRR algorithm. */
-			tp->snd_cwnd = tp->snd_ssthresh;
-			tp->snd_cwnd_stamp = tcp_time_stamp;
-		}
+	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
+	    (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+		tp->snd_cwnd = tp->snd_ssthresh;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
-/* Set slow start threshold and cwnd not falling to slow start */
+/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
 void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	tp->prior_ssthresh = 0;
 	tp->bytes_acked = 0;
-	if (icsk->icsk_ca_state < TCP_CA_CWR) {
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		tp->undo_marker = 0;
-		if (set_ssthresh)
-			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
-		tp->snd_cwnd = min(tp->snd_cwnd,
-				   tcp_packets_in_flight(tp) + 1U);
-		tp->snd_cwnd_cnt = 0;
-		tp->high_seq = tp->snd_nxt;
-		tp->snd_cwnd_stamp = tcp_time_stamp;
-		TCP_ECN_queue_cwr(tp);
-
+		tcp_init_cwnd_reduction(sk, set_ssthresh);
 		tcp_set_ca_state(sk, TCP_CA_CWR);
 	}
 }
@@ -2787,7 +2759,7 @@ static void tcp_try_keep_open(struct sock *sk)
 	}
 }
 
-static void tcp_try_to_open(struct sock *sk, int flag)
+static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2804,7 +2776,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
 		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
 			tcp_moderate_cwnd(tp);
 	} else {
-		tcp_cwnd_down(sk, flag);
+		tcp_cwnd_reduction(sk, newly_acked_sacked, 0);
 	}
 }
 
@@ -2898,7 +2870,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 
 	NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
-	tp->high_seq = tp->snd_nxt;
 	tp->prior_ssthresh = 0;
 	tp->undo_marker = tp->snd_una;
 	tp->undo_retrans = tp->retrans_out;
@@ -2906,15 +2877,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		if (!ece_ack)
 			tp->prior_ssthresh = tcp_current_ssthresh(sk);
-		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
-		TCP_ECN_queue_cwr(tp);
+		tcp_init_cwnd_reduction(sk, true);
 	}
-
-	tp->bytes_acked = 0;
-	tp->snd_cwnd_cnt = 0;
-	tp->prior_cwnd = tp->snd_cwnd;
-	tp->prr_delivered = 0;
-	tp->prr_out = 0;
 	tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
 
@@ -2974,7 +2938,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 			/* CWR is to be held something *above* high_seq
 			 * is ACKed for CWR bit to reach receiver. */
 			if (tp->snd_una != tp->high_seq) {
-				tcp_complete_cwr(sk);
+				tcp_end_cwnd_reduction(sk);
 				tcp_set_ca_state(sk, TCP_CA_Open);
 			}
 			break;
@@ -2984,7 +2948,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 				tcp_reset_reno_sack(tp);
 			if (tcp_try_undo_recovery(sk))
 				return;
-			tcp_complete_cwr(sk);
+			tcp_end_cwnd_reduction(sk);
 			break;
 		}
 	}
@@ -3025,7 +2989,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 			tcp_try_undo_dsack(sk);
 
 		if (!tcp_time_to_recover(sk, flag)) {
-			tcp_try_to_open(sk, flag);
+			tcp_try_to_open(sk, flag, newly_acked_sacked);
 			return;
 		}
 
@@ -3047,8 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 
 	if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
 		tcp_update_scoreboard(sk, fast_rexmit);
-	tp->prr_delivered += newly_acked_sacked;
-	tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
+	tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
 	tcp_xmit_retransmit_queue(sk);
 }
 
@@ -3394,7 +3357,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
-		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
+		!tcp_in_cwnd_reduction(sk);
 }
 
 /* Check that window update is acceptable.
@@ -3462,9 +3425,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
 }
 
 /* A conservative spurious RTO response algorithm: reduce cwnd using
- * rate halving and continue in congestion avoidance.
+ * PRR and continue in congestion avoidance.
  */
-static void tcp_ratehalving_spur_to_response(struct sock *sk)
+static void tcp_cwr_spur_to_response(struct sock *sk)
 {
 	tcp_enter_cwr(sk, 0);
 }
@@ -3472,7 +3435,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk)
 static void tcp_undo_spur_to_response(struct sock *sk, int flag)
 {
 	if (flag & FLAG_ECE)
-		tcp_ratehalving_spur_to_response(sk);
+		tcp_cwr_spur_to_response(sk);
 	else
 		tcp_undo_cwr(sk, true);
 }
@@ -3579,7 +3542,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
 			tcp_conservative_spur_to_response(tp);
 			break;
 		default:
-			tcp_ratehalving_spur_to_response(sk);
+			tcp_cwr_spur_to_response(sk);
 			break;
 		}
 		tp->frto_counter = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9383b51f3efc..cfe6ffe1c177 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2037,10 +2037,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		if (push_one)
 			break;
 	}
-	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
-		tp->prr_out += sent_pkts;
 
 	if (likely(sent_pkts)) {
+		if (tcp_in_cwnd_reduction(sk))
+			tp->prr_out += sent_pkts;
 		tcp_cwnd_validate(sk);
 		return false;
 	}
@@ -2542,7 +2542,7 @@ begin_fwd:
 		}
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
-		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
+		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
 
 		if (skb == tcp_write_queue_head(sk))
-- 
cgit v1.2.1


From d23ff701643a4a725e2c7a8ba2d567d39daa29ea Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Tue, 4 Sep 2012 11:03:15 +0000
Subject: tcp: add generic netlink support for tcp_metrics

Add support for genl "tcp_metrics". No locking
is changed, only that now we can unlink and delete
entries after grace period. We implement get/del for
single entry and dump to support show/flush filtering
in user space. Del without address attribute causes
flush for all addresses, sadly under genl_mutex.

v2:
- remove rcu_assign_pointer as suggested by Eric Dumazet,
it is not needed because there are no other writes under lock
- move the flushing code in tcp_metrics_flush_all

v3:
- remove synchronize_rcu on flush as suggested by Eric Dumazet

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 354 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 341 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0abe67bb4d3a..988edb63ee73 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -8,6 +8,7 @@
 #include <linux/init.h>
 #include <linux/tcp.h>
 #include <linux/hash.h>
+#include <linux/tcp_metrics.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/net_namespace.h>
@@ -17,20 +18,10 @@
 #include <net/ipv6.h>
 #include <net/dst.h>
 #include <net/tcp.h>
+#include <net/genetlink.h>
 
 int sysctl_tcp_nometrics_save __read_mostly;
 
-enum tcp_metric_index {
-	TCP_METRIC_RTT,
-	TCP_METRIC_RTTVAR,
-	TCP_METRIC_SSTHRESH,
-	TCP_METRIC_CWND,
-	TCP_METRIC_REORDERING,
-
-	/* Always last.  */
-	TCP_METRIC_MAX,
-};
-
 struct tcp_fastopen_metrics {
 	u16	mss;
 	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */
@@ -45,8 +36,10 @@ struct tcp_metrics_block {
 	u32				tcpm_ts;
 	u32				tcpm_ts_stamp;
 	u32				tcpm_lock;
-	u32				tcpm_vals[TCP_METRIC_MAX];
+	u32				tcpm_vals[TCP_METRIC_MAX + 1];
 	struct tcp_fastopen_metrics	tcpm_fastopen;
+
+	struct rcu_head			rcu_head;
 };
 
 static bool tcp_metric_locked(struct tcp_metrics_block *tm,
@@ -690,6 +683,325 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 	rcu_read_unlock();
 }
 
+static struct genl_family tcp_metrics_nl_family = {
+	.id		= GENL_ID_GENERATE,
+	.hdrsize	= 0,
+	.name		= TCP_METRICS_GENL_NAME,
+	.version	= TCP_METRICS_GENL_VERSION,
+	.maxattr	= TCP_METRICS_ATTR_MAX,
+	.netnsok	= true,
+};
+
+static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
+	[TCP_METRICS_ATTR_ADDR_IPV4]	= { .type = NLA_U32, },
+	[TCP_METRICS_ATTR_ADDR_IPV6]	= { .type = NLA_BINARY,
+					    .len = sizeof(struct in6_addr), },
+	/* Following attributes are not received for GET/DEL,
+	 * we keep them for reference
+	 */
+#if 0
+	[TCP_METRICS_ATTR_AGE]		= { .type = NLA_MSECS, },
+	[TCP_METRICS_ATTR_TW_TSVAL]	= { .type = NLA_U32, },
+	[TCP_METRICS_ATTR_TW_TS_STAMP]	= { .type = NLA_S32, },
+	[TCP_METRICS_ATTR_VALS]		= { .type = NLA_NESTED, },
+	[TCP_METRICS_ATTR_FOPEN_MSS]	= { .type = NLA_U16, },
+	[TCP_METRICS_ATTR_FOPEN_SYN_DROPS]	= { .type = NLA_U16, },
+	[TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]	= { .type = NLA_MSECS, },
+	[TCP_METRICS_ATTR_FOPEN_COOKIE]	= { .type = NLA_BINARY,
+					    .len = TCP_FASTOPEN_COOKIE_MAX, },
+#endif
+};
+
+/* Add attributes, caller cancels its header on failure */
+static int tcp_metrics_fill_info(struct sk_buff *msg,
+				 struct tcp_metrics_block *tm)
+{
+	struct nlattr *nest;
+	int i;
+
+	switch (tm->tcpm_addr.family) {
+	case AF_INET:
+		if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
+				tm->tcpm_addr.addr.a4) < 0)
+			goto nla_put_failure;
+		break;
+	case AF_INET6:
+		if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
+			    tm->tcpm_addr.addr.a6) < 0)
+			goto nla_put_failure;
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
+			  jiffies - tm->tcpm_stamp) < 0)
+		goto nla_put_failure;
+	if (tm->tcpm_ts_stamp) {
+		if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
+				(s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
+			goto nla_put_failure;
+		if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
+				tm->tcpm_ts) < 0)
+			goto nla_put_failure;
+	}
+
+	{
+		int n = 0;
+
+		nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
+		if (!nest)
+			goto nla_put_failure;
+		for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
+			if (!tm->tcpm_vals[i])
+				continue;
+			if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
+				goto nla_put_failure;
+			n++;
+		}
+		if (n)
+			nla_nest_end(msg, nest);
+		else
+			nla_nest_cancel(msg, nest);
+	}
+
+	{
+		struct tcp_fastopen_metrics tfom_copy[1], *tfom;
+		unsigned int seq;
+
+		do {
+			seq = read_seqbegin(&fastopen_seqlock);
+			tfom_copy[0] = tm->tcpm_fastopen;
+		} while (read_seqretry(&fastopen_seqlock, seq));
+
+		tfom = tfom_copy;
+		if (tfom->mss &&
+		    nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
+				tfom->mss) < 0)
+			goto nla_put_failure;
+		if (tfom->syn_loss &&
+		    (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
+				tfom->syn_loss) < 0 ||
+		     nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
+				jiffies - tfom->last_syn_loss) < 0))
+			goto nla_put_failure;
+		if (tfom->cookie.len > 0 &&
+		    nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
+			    tfom->cookie.len, tfom->cookie.val) < 0)
+			goto nla_put_failure;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int tcp_metrics_dump_info(struct sk_buff *skb,
+				 struct netlink_callback *cb,
+				 struct tcp_metrics_block *tm)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &tcp_metrics_nl_family, NLM_F_MULTI,
+			  TCP_METRICS_CMD_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (tcp_metrics_fill_info(skb, tm) < 0)
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int tcp_metrics_nl_dump(struct sk_buff *skb,
+			       struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
+	unsigned int row, s_row = cb->args[0];
+	int s_col = cb->args[1], col = s_col;
+
+	for (row = s_row; row < max_rows; row++, s_col = 0) {
+		struct tcp_metrics_block *tm;
+		struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
+
+		rcu_read_lock();
+		for (col = 0, tm = rcu_dereference(hb->chain); tm;
+		     tm = rcu_dereference(tm->tcpm_next), col++) {
+			if (col < s_col)
+				continue;
+			if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
+				rcu_read_unlock();
+				goto done;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+done:
+	cb->args[0] = row;
+	cb->args[1] = col;
+	return skb->len;
+}
+
+static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+			 unsigned int *hash, int optional)
+{
+	struct nlattr *a;
+
+	a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
+	if (a) {
+		addr->family = AF_INET;
+		addr->addr.a4 = nla_get_be32(a);
+		*hash = (__force unsigned int) addr->addr.a4;
+		return 0;
+	}
+	a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
+	if (a) {
+		if (nla_len(a) != sizeof(sizeof(struct in6_addr)))
+			return -EINVAL;
+		addr->family = AF_INET6;
+		memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
+		*hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
+		return 0;
+	}
+	return optional ? 1 : -EAFNOSUPPORT;
+}
+
+static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct tcp_metrics_block *tm;
+	struct inetpeer_addr addr;
+	unsigned int hash;
+	struct sk_buff *msg;
+	struct net *net = genl_info_net(info);
+	void *reply;
+	int ret;
+
+	ret = parse_nl_addr(info, &addr, &hash, 0);
+	if (ret < 0)
+		return ret;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
+				  info->genlhdr->cmd);
+	if (!reply)
+		goto nla_put_failure;
+
+	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+	ret = -ESRCH;
+	rcu_read_lock();
+	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	     tm = rcu_dereference(tm->tcpm_next)) {
+		if (addr_same(&tm->tcpm_addr, &addr)) {
+			ret = tcp_metrics_fill_info(msg, tm);
+			break;
+		}
+	}
+	rcu_read_unlock();
+	if (ret < 0)
+		goto out_free;
+
+	genlmsg_end(msg, reply);
+	return genlmsg_reply(msg, info);
+
+nla_put_failure:
+	ret = -EMSGSIZE;
+
+out_free:
+	nlmsg_free(msg);
+	return ret;
+}
+
+#define deref_locked_genl(p)	\
+	rcu_dereference_protected(p, lockdep_genl_is_held() && \
+				     lockdep_is_held(&tcp_metrics_lock))
+
+#define deref_genl(p)	rcu_dereference_protected(p, lockdep_genl_is_held())
+
+static int tcp_metrics_flush_all(struct net *net)
+{
+	unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
+	struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
+	struct tcp_metrics_block *tm;
+	unsigned int row;
+
+	for (row = 0; row < max_rows; row++, hb++) {
+		spin_lock_bh(&tcp_metrics_lock);
+		tm = deref_locked_genl(hb->chain);
+		if (tm)
+			hb->chain = NULL;
+		spin_unlock_bh(&tcp_metrics_lock);
+		while (tm) {
+			struct tcp_metrics_block *next;
+
+			next = deref_genl(tm->tcpm_next);
+			kfree_rcu(tm, rcu_head);
+			tm = next;
+		}
+	}
+	return 0;
+}
+
+static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+	struct tcpm_hash_bucket *hb;
+	struct tcp_metrics_block *tm;
+	struct tcp_metrics_block __rcu **pp;
+	struct inetpeer_addr addr;
+	unsigned int hash;
+	struct net *net = genl_info_net(info);
+	int ret;
+
+	ret = parse_nl_addr(info, &addr, &hash, 1);
+	if (ret < 0)
+		return ret;
+	if (ret > 0)
+		return tcp_metrics_flush_all(net);
+
+	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+	hb = net->ipv4.tcp_metrics_hash + hash;
+	pp = &hb->chain;
+	spin_lock_bh(&tcp_metrics_lock);
+	for (tm = deref_locked_genl(*pp); tm;
+	     pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
+		if (addr_same(&tm->tcpm_addr, &addr)) {
+			*pp = tm->tcpm_next;
+			break;
+		}
+	}
+	spin_unlock_bh(&tcp_metrics_lock);
+	if (!tm)
+		return -ESRCH;
+	kfree_rcu(tm, rcu_head);
+	return 0;
+}
+
+static struct genl_ops tcp_metrics_nl_ops[] = {
+	{
+		.cmd = TCP_METRICS_CMD_GET,
+		.doit = tcp_metrics_nl_cmd_get,
+		.dumpit = tcp_metrics_nl_dump,
+		.policy = tcp_metrics_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = TCP_METRICS_CMD_DEL,
+		.doit = tcp_metrics_nl_cmd_del,
+		.policy = tcp_metrics_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+};
+
 static unsigned int tcpmhash_entries;
 static int __init set_tcpmhash_entries(char *str)
 {
@@ -753,5 +1065,21 @@ static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
 
 void __init tcp_metrics_init(void)
 {
-	register_pernet_subsys(&tcp_net_metrics_ops);
+	int ret;
+
+	ret = register_pernet_subsys(&tcp_net_metrics_ops);
+	if (ret < 0)
+		goto cleanup;
+	ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
+					    tcp_metrics_nl_ops,
+					    ARRAY_SIZE(tcp_metrics_nl_ops));
+	if (ret < 0)
+		goto cleanup_subsys;
+	return;
+
+cleanup_subsys:
+	unregister_pernet_subsys(&tcp_net_metrics_ops);
+
+cleanup:
+	return;
 }
-- 
cgit v1.2.1


From 7ab4551f3b391818e29263279031dca1e26417c6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 6 Sep 2012 08:07:13 +0000
Subject: tcp: fix TFO regression

Fengguang Wu reported various panics and bisected to commit
8336886f786fdac (tcp: TCP Fast Open Server - support TFO listeners)

Fix this by making sure socket is a TCP socket before accessing TFO data
structures.

[  233.046014] kfree_debugcheck: out of range ptr ea6000000bb8h.
[  233.047399] ------------[ cut here ]------------
[  233.048393] kernel BUG at /c/kernel-tests/src/stable/mm/slab.c:3074!
[  233.048393] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC
[  233.048393] Modules linked in:
[  233.048393] CPU 0
[  233.048393] Pid: 3929, comm: trinity-watchdo Not tainted 3.6.0-rc3+
#4192 Bochs Bochs
[  233.048393] RIP: 0010:[<ffffffff81169653>]  [<ffffffff81169653>]
kfree_debugcheck+0x27/0x2d
[  233.048393] RSP: 0018:ffff88000facbca8  EFLAGS: 00010092
[  233.048393] RAX: 0000000000000031 RBX: 0000ea6000000bb8 RCX:
00000000a189a188
[  233.048393] RDX: 000000000000a189 RSI: ffffffff8108ad32 RDI:
ffffffff810d30f9
[  233.048393] RBP: ffff88000facbcb8 R08: 0000000000000002 R09:
ffffffff843846f0
[  233.048393] R10: ffffffff810ae37c R11: 0000000000000908 R12:
0000000000000202
[  233.048393] R13: ffffffff823dbd5a R14: ffff88000ec5bea8 R15:
ffffffff8363c780
[  233.048393] FS:  00007faa6899c700(0000) GS:ffff88001f200000(0000)
knlGS:0000000000000000
[  233.048393] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  233.048393] CR2: 00007faa6841019c CR3: 0000000012c82000 CR4:
00000000000006f0
[  233.048393] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[  233.048393] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[  233.048393] Process trinity-watchdo (pid: 3929, threadinfo
ffff88000faca000, task ffff88000faec600)
[  233.048393] Stack:
[  233.048393]  0000000000000000 0000ea6000000bb8 ffff88000facbce8
ffffffff8116ad81
[  233.048393]  ffff88000ff588a0 ffff88000ff58850 ffff88000ff588a0
0000000000000000
[  233.048393]  ffff88000facbd08 ffffffff823dbd5a ffffffff823dbcb0
ffff88000ff58850
[  233.048393] Call Trace:
[  233.048393]  [<ffffffff8116ad81>] kfree+0x5f/0xca
[  233.048393]  [<ffffffff823dbd5a>] inet_sock_destruct+0xaa/0x13c
[  233.048393]  [<ffffffff823dbcb0>] ? inet_sk_rebuild_header
+0x319/0x319
[  233.048393]  [<ffffffff8231c307>] __sk_free+0x21/0x14b
[  233.048393]  [<ffffffff8231c4bd>] sk_free+0x26/0x2a
[  233.048393]  [<ffffffff825372db>] sctp_close+0x215/0x224
[  233.048393]  [<ffffffff810d6835>] ? lock_release+0x16f/0x1b9
[  233.048393]  [<ffffffff823daf12>] inet_release+0x7e/0x85
[  233.048393]  [<ffffffff82317d15>] sock_release+0x1f/0x77
[  233.048393]  [<ffffffff82317d94>] sock_close+0x27/0x2b
[  233.048393]  [<ffffffff81173bbe>] __fput+0x101/0x20a
[  233.048393]  [<ffffffff81173cd5>] ____fput+0xe/0x10
[  233.048393]  [<ffffffff810a3794>] task_work_run+0x5d/0x75
[  233.048393]  [<ffffffff8108da70>] do_exit+0x290/0x7f5
[  233.048393]  [<ffffffff82707415>] ? retint_swapgs+0x13/0x1b
[  233.048393]  [<ffffffff8108e23f>] do_group_exit+0x7b/0xba
[  233.048393]  [<ffffffff8108e295>] sys_exit_group+0x17/0x17
[  233.048393]  [<ffffffff8270de10>] tracesys+0xdd/0xe2
[  233.048393] Code: 59 01 5d c3 55 48 89 e5 53 41 50 0f 1f 44 00 00 48
89 fb e8 d4 b0 f0 ff 84 c0 75 11 48 89 de 48 c7 c7 fc fa f7 82 e8 0d 0f
57 01 <0f> 0b 5f 5b 5d c3 55 48 89 e5 0f 1f 44 00 00 48 63 87 d8 00 00
[  233.048393] RIP  [<ffffffff81169653>] kfree_debugcheck+0x27/0x2d
[  233.048393]  RSP <ffff88000facbca8>

Reported-by: Fengguang Wu <wfg@linux.intel.com>
Tested-by: Fengguang Wu <wfg@linux.intel.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: "H.K. Jerry Chu" <hkchu@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: H.K. Jerry Chu <hkchu@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c              | 7 ++-----
 net/ipv4/inet_connection_sock.c | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4f70ef0b946d..845372b025f6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -149,11 +149,8 @@ void inet_sock_destruct(struct sock *sk)
 		pr_err("Attempt to release alive inet socket %p\n", sk);
 		return;
 	}
-	if (sk->sk_type == SOCK_STREAM) {
-		struct fastopen_queue *fastopenq =
-			inet_csk(sk)->icsk_accept_queue.fastopenq;
-		kfree(fastopenq);
-	}
+	if (sk->sk_protocol == IPPROTO_TCP)
+		kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
 
 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8464b79c493f..f0c5b9c1a957 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -314,7 +314,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 	newsk = req->sk;
 
 	sk_acceptq_removed(sk);
-	if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
+	if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
 		spin_lock_bh(&queue->fastopenq->lock);
 		if (tcp_rsk(req)->listener) {
 			/* We are still waiting for the final ACK from 3WHS
@@ -775,7 +775,7 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
-		if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
+		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
 			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
 			BUG_ON(sk != tcp_rsk(req)->listener);
 
-- 
cgit v1.2.1


From 979402b16cde048ced4839e21cc49e0779352b80 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 5 Sep 2012 23:34:44 +0000
Subject: udp: increment UDP_MIB_INERRORS if copy failed

In UDP recvmsg(), we miss an increase of UDP_MIB_INERRORS if the copy
of skb to userspace failed for whatever reason.

Reported-by: Shawn Bohrer <sbohrer@rgmadvisors.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6f6d1aca3c3d..2814f66dac64 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1226,6 +1226,11 @@ try_again:
 
 	if (unlikely(err)) {
 		trace_kfree_skb(skb, udp_recvmsg);
+		if (!peeked) {
+			atomic_inc(&sk->sk_drops);
+			UDP_INC_STATS_USER(sock_net(sk),
+					   UDP_MIB_INERRORS, is_udplite);
+		}
 		goto out_free;
 	}
 
-- 
cgit v1.2.1


From d679c5324d9a87c6295f56c2dea52d5f68834f41 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 6 Sep 2012 20:37:06 +0000
Subject: igmp: avoid drop_monitor false positives

igmp should call consume_skb() for all correctly processed packets,
to avoid false dropwatch/drop_monitor false positives.

Reported-by: Shawn Bohrer <sbohrer@rgmadvisors.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 0b5580c69f2d..3479b98a00a7 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -815,14 +815,15 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
 	return 1;
 }
 
-static void igmp_heard_report(struct in_device *in_dev, __be32 group)
+/* return true if packet was dropped */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
 {
 	struct ip_mc_list *im;
 
 	/* Timers are only set for non-local groups */
 
 	if (group == IGMP_ALL_HOSTS)
-		return;
+		return false;
 
 	rcu_read_lock();
 	for_each_pmc_rcu(in_dev, im) {
@@ -832,9 +833,11 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
 		}
 	}
 	rcu_read_unlock();
+	return false;
 }
 
-static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+/* return true if packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 	int len)
 {
 	struct igmphdr 		*ih = igmp_hdr(skb);
@@ -866,7 +869,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		/* clear deleted report items */
 		igmpv3_clear_delrec(in_dev);
 	} else if (len < 12) {
-		return;	/* ignore bogus packet; freed by caller */
+		return true;	/* ignore bogus packet; freed by caller */
 	} else if (IGMP_V1_SEEN(in_dev)) {
 		/* This is a v3 query with v1 queriers present */
 		max_delay = IGMP_Query_Response_Interval;
@@ -883,13 +886,13 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			max_delay = 1;	/* can't mod w/ 0 */
 	} else { /* v3 */
 		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
-			return;
+			return true;
 
 		ih3 = igmpv3_query_hdr(skb);
 		if (ih3->nsrcs) {
 			if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
 					   + ntohs(ih3->nsrcs)*sizeof(__be32)))
-				return;
+				return true;
 			ih3 = igmpv3_query_hdr(skb);
 		}
 
@@ -901,9 +904,9 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			in_dev->mr_qrv = ih3->qrv;
 		if (!group) { /* general query */
 			if (ih3->nsrcs)
-				return;	/* no sources allowed */
+				return false;	/* no sources allowed */
 			igmp_gq_start_timer(in_dev);
-			return;
+			return false;
 		}
 		/* mark sources to include, if group & source-specific */
 		mark = ih3->nsrcs != 0;
@@ -939,6 +942,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			igmp_mod_timer(im, max_delay);
 	}
 	rcu_read_unlock();
+	return false;
 }
 
 /* called in rcu_read_lock() section */
@@ -948,6 +952,7 @@ int igmp_rcv(struct sk_buff *skb)
 	struct igmphdr *ih;
 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 	int len = skb->len;
+	bool dropped = true;
 
 	if (in_dev == NULL)
 		goto drop;
@@ -969,7 +974,7 @@ int igmp_rcv(struct sk_buff *skb)
 	ih = igmp_hdr(skb);
 	switch (ih->type) {
 	case IGMP_HOST_MEMBERSHIP_QUERY:
-		igmp_heard_query(in_dev, skb, len);
+		dropped = igmp_heard_query(in_dev, skb, len);
 		break;
 	case IGMP_HOST_MEMBERSHIP_REPORT:
 	case IGMPV2_HOST_MEMBERSHIP_REPORT:
@@ -979,7 +984,7 @@ int igmp_rcv(struct sk_buff *skb)
 		/* don't rely on MC router hearing unicast reports */
 		if (skb->pkt_type == PACKET_MULTICAST ||
 		    skb->pkt_type == PACKET_BROADCAST)
-			igmp_heard_report(in_dev, ih->group);
+			dropped = igmp_heard_report(in_dev, ih->group);
 		break;
 	case IGMP_PIM:
 #ifdef CONFIG_IP_PIMSM_V1
@@ -997,7 +1002,10 @@ int igmp_rcv(struct sk_buff *skb)
 	}
 
 drop:
-	kfree_skb(skb);
+	if (dropped)
+		kfree_skb(skb);
+	else
+		consume_skb(skb);
 	return 0;
 }
 
-- 
cgit v1.2.1


From 4ccfe6d4109252dfadcd6885f33ed600ee03dbf8 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 7 Sep 2012 00:45:29 +0000
Subject: ipv4/route: arg delay is useless in rt_cache_flush()

Since route cache deletion (89aef8921bfbac22f), delay is no
more used. Remove it.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c          |  2 +-
 net/ipv4/devinet.c      |  6 +++---
 net/ipv4/fib_frontend.c | 20 ++++++++++----------
 net/ipv4/fib_rules.c    |  2 +-
 net/ipv4/fib_trie.c     |  6 +++---
 net/ipv4/route.c        | 19 +++----------------
 6 files changed, 21 insertions(+), 34 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 77e87aff419a..47800459e4cb 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1225,7 +1225,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
 	switch (event) {
 	case NETDEV_CHANGEADDR:
 		neigh_changeaddr(&arp_tbl, dev);
-		rt_cache_flush(dev_net(dev), 0);
+		rt_cache_flush(dev_net(dev));
 		break;
 	default:
 		break;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index adf273f8ad2e..f14eff506743 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1500,7 +1500,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
 		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
 		    i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
 			if ((new_value == 0) && (old_value != 0))
-				rt_cache_flush(net, 0);
+				rt_cache_flush(net);
 	}
 
 	return ret;
@@ -1534,7 +1534,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 				dev_disable_lro(idev->dev);
 			}
 			rtnl_unlock();
-			rt_cache_flush(net, 0);
+			rt_cache_flush(net);
 		}
 	}
 
@@ -1551,7 +1551,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
 	struct net *net = ctl->extra2;
 
 	if (write && *valp != val)
-		rt_cache_flush(net, 0);
+		rt_cache_flush(net);
 
 	return ret;
 }
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index acdee325d972..8a7cd795a96e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -148,7 +148,7 @@ static void fib_flush(struct net *net)
 	}
 
 	if (flushed)
-		rt_cache_flush(net, -1);
+		rt_cache_flush(net);
 }
 
 /*
@@ -999,11 +999,11 @@ static void nl_fib_lookup_exit(struct net *net)
 	net->ipv4.fibnl = NULL;
 }
 
-static void fib_disable_ip(struct net_device *dev, int force, int delay)
+static void fib_disable_ip(struct net_device *dev, int force)
 {
 	if (fib_sync_down_dev(dev, force))
 		fib_flush(dev_net(dev));
-	rt_cache_flush(dev_net(dev), delay);
+	rt_cache_flush(dev_net(dev));
 	arp_ifdown(dev);
 }
 
@@ -1020,7 +1020,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev), -1);
+		rt_cache_flush(dev_net(dev));
 		break;
 	case NETDEV_DOWN:
 		fib_del_ifaddr(ifa, NULL);
@@ -1029,9 +1029,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 			/* Last address was deleted from this interface.
 			 * Disable IP.
 			 */
-			fib_disable_ip(dev, 1, 0);
+			fib_disable_ip(dev, 1);
 		} else {
-			rt_cache_flush(dev_net(dev), -1);
+			rt_cache_flush(dev_net(dev));
 		}
 		break;
 	}
@@ -1045,7 +1045,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_UNREGISTER) {
-		fib_disable_ip(dev, 2, -1);
+		fib_disable_ip(dev, 2);
 		rt_flush_dev(dev);
 		return NOTIFY_DONE;
 	}
@@ -1061,14 +1061,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(net, -1);
+		rt_cache_flush(net);
 		break;
 	case NETDEV_DOWN:
-		fib_disable_ip(dev, 0, 0);
+		fib_disable_ip(dev, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
-		rt_cache_flush(net, 0);
+		rt_cache_flush(net);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index a83d74e498d2..274309d3aded 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -259,7 +259,7 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
 
 static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
 {
-	rt_cache_flush(ops->fro_net, -1);
+	rt_cache_flush(ops->fro_net);
 }
 
 static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3c820dae235e..8d766106b540 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1286,7 +1286,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 
 			fib_release_info(fi_drop);
 			if (state & FA_S_ACCESSED)
-				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+				rt_cache_flush(cfg->fc_nlinfo.nl_net);
 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
 
@@ -1333,7 +1333,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	list_add_tail_rcu(&new_fa->fa_list,
 			  (fa ? &fa->fa_list : fa_head));
 
-	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+	rt_cache_flush(cfg->fc_nlinfo.nl_net);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 succeeded:
@@ -1711,7 +1711,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 		trie_leaf_remove(t, l);
 
 	if (fa->fa_state & FA_S_ACCESSED)
-		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+		rt_cache_flush(cfg->fc_nlinfo.nl_net);
 
 	fib_release_info(fa->fa_info);
 	alias_free_mem_rcu(fa);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index dc9549b5eb1c..ab1e7c7c38fc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -461,11 +461,7 @@ static void rt_cache_invalidate(struct net *net)
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 }
 
-/*
- * delay < 0  : invalidate cache (fast : entries will be deleted later)
- * delay >= 0 : invalidate & flush cache (can be long)
- */
-void rt_cache_flush(struct net *net, int delay)
+void rt_cache_flush(struct net *net)
 {
 	rt_cache_invalidate(net);
 }
@@ -2345,7 +2341,7 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 
 void ip_rt_multicast_event(struct in_device *in_dev)
 {
-	rt_cache_flush(dev_net(in_dev->dev), 0);
+	rt_cache_flush(dev_net(in_dev->dev));
 }
 
 #ifdef CONFIG_SYSCTL
@@ -2354,16 +2350,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 					size_t *lenp, loff_t *ppos)
 {
 	if (write) {
-		int flush_delay;
-		ctl_table ctl;
-		struct net *net;
-
-		memcpy(&ctl, __ctl, sizeof(ctl));
-		ctl.data = &flush_delay;
-		proc_dointvec(&ctl, write, buffer, lenp, ppos);
-
-		net = (struct net *)__ctl->extra1;
-		rt_cache_flush(net, flush_delay);
+		rt_cache_flush((struct net *)__ctl->extra1);
 		return 0;
 	}
 
-- 
cgit v1.2.1


From ba8bd0ea982427258ab86144ac5fd1a9d7763a90 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 7 Sep 2012 22:27:11 +0200
Subject: net: rt_cache_flush() cleanup

We dont use jhash anymore since route cache removal,
so we can get rid of get_random_bytes() calls for rt_genid
changes.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ab1e7c7c38fc..d39edf16d607 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -447,23 +447,9 @@ static inline bool rt_is_expired(const struct rtable *rth)
 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
-/*
- * Perturbation of rt_genid by a small quantity [1..256]
- * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
- * many times (2^24) without giving recent rt_genid.
- * Jenkins hash is strong enough that litle changes of rt_genid are OK.
- */
-static void rt_cache_invalidate(struct net *net)
-{
-	unsigned char shuffle;
-
-	get_random_bytes(&shuffle, sizeof(shuffle));
-	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-}
-
 void rt_cache_flush(struct net *net)
 {
-	rt_cache_invalidate(net);
+	atomic_inc(&net->ipv4.rt_genid);
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -2520,8 +2506,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
 
 static __net_init int rt_genid_init(struct net *net)
 {
-	get_random_bytes(&net->ipv4.rt_genid,
-			 sizeof(net->ipv4.rt_genid));
+	atomic_set(&net->ipv4.rt_genid, 0);
 	get_random_bytes(&net->ipv4.dev_addr_genid,
 			 sizeof(net->ipv4.dev_addr_genid));
 	return 0;
-- 
cgit v1.2.1


From 9f00d9776bc5beb92e8bfc884a7e96ddc5589e2e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 8 Sep 2012 02:53:54 +0000
Subject: netlink: hide struct module parameter in netlink_kernel_create

This patch defines netlink_kernel_create as a wrapper function of
__netlink_kernel_create to hide the struct module *me parameter
(which seems to be THIS_MODULE in all existing netlink subsystems).

Suggested by David S. Miller.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c       | 2 +-
 net/ipv4/netfilter/ipt_ULOG.c | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8a7cd795a96e..dc1f10ed1872 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -986,7 +986,7 @@ static int __net_init nl_fib_lookup_init(struct net *net)
 		.input	= nl_fib_input,
 	};
 
-	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
+	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
 	if (sk == NULL)
 		return -EAFNOSUPPORT;
 	net->ipv4.fibnl = sk;
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 1109f7f6c254..b5ef3cba2250 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -396,8 +396,7 @@ static int __init ulog_tg_init(void)
 	for (i = 0; i < ULOG_MAXNLGROUPS; i++)
 		setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
 
-	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
-					THIS_MODULE, &cfg);
+	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg);
 	if (!nflognl)
 		return -ENOMEM;
 
-- 
cgit v1.2.1


From 15e473046cb6e5d18a4d0057e61d76315230382b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 7 Sep 2012 20:12:54 +0000
Subject: netlink: Rename pid to portid to avoid confusion

It is a frequent mistake to confuse the netlink port identifier with a
process identifier.  Try to reduce this confusion by renaming fields
that hold port identifiers portid instead of pid.

I have carefully avoided changing the structures exported to
userspace to avoid changing the userspace API.

I have successfully built an allyesconfig kernel with this change.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c       | 28 ++++++++++++++--------------
 net/ipv4/fib_frontend.c  | 10 +++++-----
 net/ipv4/fib_semantics.c |  8 ++++----
 net/ipv4/fib_trie.c      |  2 +-
 net/ipv4/inet_diag.c     | 32 ++++++++++++++++----------------
 net/ipv4/ipmr.c          | 10 +++++-----
 net/ipv4/route.c         |  8 ++++----
 net/ipv4/tcp_metrics.c   |  2 +-
 net/ipv4/udp_diag.c      |  6 +++---
 9 files changed, 53 insertions(+), 53 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index f14eff506743..7b00556e184b 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -311,7 +311,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
 }
 
 static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
-			 int destroy, struct nlmsghdr *nlh, u32 pid)
+			 int destroy, struct nlmsghdr *nlh, u32 portid)
 {
 	struct in_ifaddr *promote = NULL;
 	struct in_ifaddr *ifa, *ifa1 = *ifap;
@@ -345,7 +345,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 				inet_hash_remove(ifa);
 				*ifap1 = ifa->ifa_next;
 
-				rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
+				rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
 				blocking_notifier_call_chain(&inetaddr_chain,
 						NETDEV_DOWN, ifa);
 				inet_free_ifa(ifa);
@@ -382,7 +382,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	   is valid, it will try to restore deleted routes... Grr.
 	   So that, this order is correct.
 	 */
-	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
+	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
@@ -395,7 +395,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		}
 
 		promote->ifa_flags &= ~IFA_F_SECONDARY;
-		rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
+		rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
 		blocking_notifier_call_chain(&inetaddr_chain,
 				NETDEV_UP, promote);
 		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
@@ -417,7 +417,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 }
 
 static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
-			     u32 pid)
+			     u32 portid)
 {
 	struct in_device *in_dev = ifa->ifa_dev;
 	struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -464,7 +464,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
-	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
+	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
 
 	return 0;
@@ -563,7 +563,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 		    !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
 			continue;
 
-		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid);
+		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
 		return 0;
 	}
 
@@ -649,7 +649,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 	if (IS_ERR(ifa))
 		return PTR_ERR(ifa);
 
-	return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid);
+	return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
 }
 
 /*
@@ -1246,12 +1246,12 @@ static size_t inet_nlmsg_size(void)
 }
 
 static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
-			    u32 pid, u32 seq, int event, unsigned int flags)
+			    u32 portid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr  *nlh;
 
-	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
@@ -1313,7 +1313,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 				if (ip_idx < s_ip_idx)
 					continue;
 				if (inet_fill_ifaddr(skb, ifa,
-					     NETLINK_CB(cb->skb).pid,
+					     NETLINK_CB(cb->skb).portid,
 					     cb->nlh->nlmsg_seq,
 					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {
 					rcu_read_unlock();
@@ -1335,7 +1335,7 @@ done:
 }
 
 static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
-		      u32 pid)
+		      u32 portid)
 {
 	struct sk_buff *skb;
 	u32 seq = nlh ? nlh->nlmsg_seq : 0;
@@ -1347,14 +1347,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	if (skb == NULL)
 		goto errout;
 
-	err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
+	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(skb);
 		goto errout;
 	}
-	rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+	rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
 	return;
 errout:
 	if (err < 0)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index dc1f10ed1872..68c93d1bb03a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -557,7 +557,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 	cfg->fc_flags = rtm->rtm_flags;
 	cfg->fc_nlflags = nlh->nlmsg_flags;
 
-	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 	cfg->fc_nlinfo.nlh = nlh;
 	cfg->fc_nlinfo.nl_net = net;
 
@@ -955,7 +955,7 @@ static void nl_fib_input(struct sk_buff *skb)
 	struct fib_result_nl *frn;
 	struct nlmsghdr *nlh;
 	struct fib_table *tb;
-	u32 pid;
+	u32 portid;
 
 	net = sock_net(skb->sk);
 	nlh = nlmsg_hdr(skb);
@@ -973,10 +973,10 @@ static void nl_fib_input(struct sk_buff *skb)
 
 	nl_fib_lookup(frn, tb);
 
-	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
-	NETLINK_CB(skb).pid = 0;        /* from kernel */
+	portid = NETLINK_CB(skb).portid;      /* pid of sending process */
+	NETLINK_CB(skb).portid = 0;        /* from kernel */
 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
-	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
+	netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
 }
 
 static int __net_init nl_fib_lookup_init(struct net *net)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da80dc14cc76..3509065e409a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -391,7 +391,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 	if (skb == NULL)
 		goto errout;
 
-	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
+	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
 			    fa->fa_type, key, dst_len,
 			    fa->fa_tos, fa->fa_info, nlm_flags);
 	if (err < 0) {
@@ -400,7 +400,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 		kfree_skb(skb);
 		goto errout;
 	}
-	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
 		    info->nlh, GFP_KERNEL);
 	return;
 errout:
@@ -989,14 +989,14 @@ failure:
 	return ERR_PTR(err);
 }
 
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
 		  struct fib_info *fi, unsigned int flags)
 {
 	struct nlmsghdr *nlh;
 	struct rtmsg *rtm;
 
-	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 8d766106b540..31d771ca9a70 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1873,7 +1873,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
 			continue;
 		}
 
-		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq,
 				  RTM_NEWROUTE,
 				  tb->tb_id,
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8bc005b1435f..535584c00f91 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -70,7 +70,7 @@ static inline void inet_diag_unlock_handler(
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
 			      struct user_namespace *user_ns,		      	
-			      u32 pid, u32 seq, u16 nlmsg_flags,
+			      u32 portid, u32 seq, u16 nlmsg_flags,
 			      const struct nlmsghdr *unlh)
 {
 	const struct inet_sock *inet = inet_sk(sk);
@@ -84,7 +84,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	handler = inet_diag_table[req->sdiag_protocol];
 	BUG_ON(handler == NULL);
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			nlmsg_flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -201,23 +201,23 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
 static int inet_csk_diag_fill(struct sock *sk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
 			      struct user_namespace *user_ns,
-			      u32 pid, u32 seq, u16 nlmsg_flags,
+			      u32 portid, u32 seq, u16 nlmsg_flags,
 			      const struct nlmsghdr *unlh)
 {
 	return inet_sk_diag_fill(sk, inet_csk(sk),
-			skb, req, user_ns, pid, seq, nlmsg_flags, unlh);
+			skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
 static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 			       struct sk_buff *skb, struct inet_diag_req_v2 *req,
-			       u32 pid, u32 seq, u16 nlmsg_flags,
+			       u32 portid, u32 seq, u16 nlmsg_flags,
 			       const struct nlmsghdr *unlh)
 {
 	long tmo;
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			nlmsg_flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -260,14 +260,14 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			struct inet_diag_req_v2 *r,
 			struct user_namespace *user_ns,
-			u32 pid, u32 seq, u16 nlmsg_flags,
+			u32 portid, u32 seq, u16 nlmsg_flags,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
-					   skb, r, pid, seq, nlmsg_flags,
+					   skb, r, portid, seq, nlmsg_flags,
 					   unlh);
-	return inet_csk_diag_fill(sk, skb, r, user_ns, pid, seq, nlmsg_flags, unlh);
+	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
 int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -316,14 +316,14 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 
 	err = sk_diag_fill(sk, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).ssk),
-			   NETLINK_CB(in_skb).pid,
+			   NETLINK_CB(in_skb).portid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		nlmsg_free(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -557,7 +557,7 @@ static int inet_csk_diag_dump(struct sock *sk,
 
 	return inet_csk_diag_fill(sk, skb, r,
 				  sk_user_ns(NETLINK_CB(cb->skb).ssk),
-				  NETLINK_CB(cb->skb).pid,
+				  NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
@@ -592,14 +592,14 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 	}
 
 	return inet_twsk_diag_fill(tw, skb, r,
-				   NETLINK_CB(cb->skb).pid,
+				   NETLINK_CB(cb->skb).portid,
 				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
 static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 			      struct request_sock *req,
 			      struct user_namespace *user_ns,
-			      u32 pid, u32 seq,
+			      u32 portid, u32 seq,
 			      const struct nlmsghdr *unlh)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -608,7 +608,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	struct nlmsghdr *nlh;
 	long tmo;
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			NLM_F_MULTI);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -711,7 +711,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 			err = inet_diag_fill_req(skb, sk, req,
 					       sk_user_ns(NETLINK_CB(cb->skb).ssk),
-					       NETLINK_CB(cb->skb).pid,
+					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, cb->nlh);
 			if (err < 0) {
 				cb->args[3] = j + 1;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8aa7a4cf9139..1daa95c2a0ba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -626,7 +626,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
 			e->error = -ETIMEDOUT;
 			memset(&e->msg, 0, sizeof(e->msg));
 
-			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
+			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
 		} else {
 			kfree_skb(skb);
 		}
@@ -870,7 +870,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
 				memset(&e->msg, 0, sizeof(e->msg));
 			}
 
-			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
+			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
 		} else {
 			ip_mr_forward(net, mrt, skb, c, 0);
 		}
@@ -2117,12 +2117,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
 }
 
 static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
-			    u32 pid, u32 seq, struct mfc_cache *c)
+			    u32 portid, u32 seq, struct mfc_cache *c)
 {
 	struct nlmsghdr *nlh;
 	struct rtmsg *rtm;
 
-	nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
+	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
@@ -2176,7 +2176,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 				if (e < s_e)
 					goto next_entry;
 				if (ipmr_fill_mroute(mrt, skb,
-						     NETLINK_CB(cb->skb).pid,
+						     NETLINK_CB(cb->skb).portid,
 						     cb->nlh->nlmsg_seq,
 						     mfc) < 0)
 					goto done;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d39edf16d607..940f4f4cb201 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2136,7 +2136,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
-			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
+			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
 			u32 seq, int event, int nowait, unsigned int flags)
 {
 	struct rtable *rt = skb_rtable(skb);
@@ -2146,7 +2146,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 	u32 error;
 	u32 metrics[RTAX_MAX];
 
-	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
@@ -2306,12 +2306,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 		rt->rt_flags |= RTCF_NOTIFY;
 
 	err = rt_fill_info(net, dst, src, &fl4, skb,
-			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
 		goto errout_free;
 
-	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
 errout:
 	return err;
 
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 988edb63ee73..4c752a6e0bcd 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -803,7 +803,7 @@ static int tcp_metrics_dump_info(struct sk_buff *skb,
 {
 	void *hdr;
 
-	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			  &tcp_metrics_nl_family, NLM_F_MULTI,
 			  TCP_METRICS_CMD_GET);
 	if (!hdr)
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index d2f336ea82ca..505b30ad9182 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -26,7 +26,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 
 	return inet_sk_diag_fill(sk, NULL, skb, req,
 			sk_user_ns(NETLINK_CB(cb->skb).ssk),
-			NETLINK_CB(cb->skb).pid,
+			NETLINK_CB(cb->skb).portid,
 			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
@@ -72,14 +72,14 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 
 	err = inet_sk_diag_fill(sk, NULL, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).ssk),
-			   NETLINK_CB(in_skb).pid,
+			   NETLINK_CB(in_skb).portid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
-- 
cgit v1.2.1


From bafa6d9d89072c1a18853afe9ee5de05c491c13a Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 7 Sep 2012 00:45:29 +0000
Subject: ipv4/route: arg delay is useless in rt_cache_flush()

Since route cache deletion (89aef8921bfbac22f), delay is no
more used. Remove it.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c          |  2 +-
 net/ipv4/devinet.c      |  6 +++---
 net/ipv4/fib_frontend.c | 20 ++++++++++----------
 net/ipv4/fib_rules.c    |  2 +-
 net/ipv4/fib_trie.c     |  6 +++---
 net/ipv4/route.c        | 19 +++----------------
 6 files changed, 21 insertions(+), 34 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 77e87aff419a..47800459e4cb 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1225,7 +1225,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
 	switch (event) {
 	case NETDEV_CHANGEADDR:
 		neigh_changeaddr(&arp_tbl, dev);
-		rt_cache_flush(dev_net(dev), 0);
+		rt_cache_flush(dev_net(dev));
 		break;
 	default:
 		break;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 44bf82e3aef7..9b55b6f5a585 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1503,7 +1503,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
 		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
 		    i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
 			if ((new_value == 0) && (old_value != 0))
-				rt_cache_flush(net, 0);
+				rt_cache_flush(net);
 	}
 
 	return ret;
@@ -1537,7 +1537,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 				dev_disable_lro(idev->dev);
 			}
 			rtnl_unlock();
-			rt_cache_flush(net, 0);
+			rt_cache_flush(net);
 		}
 	}
 
@@ -1554,7 +1554,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
 	struct net *net = ctl->extra2;
 
 	if (write && *valp != val)
-		rt_cache_flush(net, 0);
+		rt_cache_flush(net);
 
 	return ret;
 }
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c43ae3fba792..8e2b475da9fa 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -148,7 +148,7 @@ static void fib_flush(struct net *net)
 	}
 
 	if (flushed)
-		rt_cache_flush(net, -1);
+		rt_cache_flush(net);
 }
 
 /*
@@ -999,11 +999,11 @@ static void nl_fib_lookup_exit(struct net *net)
 	net->ipv4.fibnl = NULL;
 }
 
-static void fib_disable_ip(struct net_device *dev, int force, int delay)
+static void fib_disable_ip(struct net_device *dev, int force)
 {
 	if (fib_sync_down_dev(dev, force))
 		fib_flush(dev_net(dev));
-	rt_cache_flush(dev_net(dev), delay);
+	rt_cache_flush(dev_net(dev));
 	arp_ifdown(dev);
 }
 
@@ -1020,7 +1020,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev), -1);
+		rt_cache_flush(dev_net(dev));
 		break;
 	case NETDEV_DOWN:
 		fib_del_ifaddr(ifa, NULL);
@@ -1029,9 +1029,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 			/* Last address was deleted from this interface.
 			 * Disable IP.
 			 */
-			fib_disable_ip(dev, 1, 0);
+			fib_disable_ip(dev, 1);
 		} else {
-			rt_cache_flush(dev_net(dev), -1);
+			rt_cache_flush(dev_net(dev));
 		}
 		break;
 	}
@@ -1045,7 +1045,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_UNREGISTER) {
-		fib_disable_ip(dev, 2, -1);
+		fib_disable_ip(dev, 2);
 		rt_flush_dev(dev);
 		return NOTIFY_DONE;
 	}
@@ -1062,14 +1062,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev), -1);
+		rt_cache_flush(dev_net(dev));
 		break;
 	case NETDEV_DOWN:
-		fib_disable_ip(dev, 0, 0);
+		fib_disable_ip(dev, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
-		rt_cache_flush(dev_net(dev), 0);
+		rt_cache_flush(dev_net(dev));
 		break;
 	case NETDEV_UNREGISTER_BATCH:
 		break;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index a83d74e498d2..274309d3aded 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -259,7 +259,7 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
 
 static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
 {
-	rt_cache_flush(ops->fro_net, -1);
+	rt_cache_flush(ops->fro_net);
 }
 
 static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 57bd978483e1..d1b93595b4a7 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1286,7 +1286,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 
 			fib_release_info(fi_drop);
 			if (state & FA_S_ACCESSED)
-				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+				rt_cache_flush(cfg->fc_nlinfo.nl_net);
 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
 
@@ -1333,7 +1333,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	list_add_tail_rcu(&new_fa->fa_list,
 			  (fa ? &fa->fa_list : fa_head));
 
-	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+	rt_cache_flush(cfg->fc_nlinfo.nl_net);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 succeeded:
@@ -1708,7 +1708,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 		trie_leaf_remove(t, l);
 
 	if (fa->fa_state & FA_S_ACCESSED)
-		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+		rt_cache_flush(cfg->fc_nlinfo.nl_net);
 
 	fib_release_info(fa->fa_info);
 	alias_free_mem_rcu(fa);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 82cf2a722b23..f6436d3b207a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -461,11 +461,7 @@ static void rt_cache_invalidate(struct net *net)
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 }
 
-/*
- * delay < 0  : invalidate cache (fast : entries will be deleted later)
- * delay >= 0 : invalidate & flush cache (can be long)
- */
-void rt_cache_flush(struct net *net, int delay)
+void rt_cache_flush(struct net *net)
 {
 	rt_cache_invalidate(net);
 }
@@ -2345,7 +2341,7 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 
 void ip_rt_multicast_event(struct in_device *in_dev)
 {
-	rt_cache_flush(dev_net(in_dev->dev), 0);
+	rt_cache_flush(dev_net(in_dev->dev));
 }
 
 #ifdef CONFIG_SYSCTL
@@ -2354,16 +2350,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 					size_t *lenp, loff_t *ppos)
 {
 	if (write) {
-		int flush_delay;
-		ctl_table ctl;
-		struct net *net;
-
-		memcpy(&ctl, __ctl, sizeof(ctl));
-		ctl.data = &flush_delay;
-		proc_dointvec(&ctl, write, buffer, lenp, ppos);
-
-		net = (struct net *)__ctl->extra1;
-		rt_cache_flush(net, flush_delay);
+		rt_cache_flush((struct net *)__ctl->extra1);
 		return 0;
 	}
 
-- 
cgit v1.2.1


From 2885da72966fcb89f48d554339d347fb02b5ea78 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 7 Sep 2012 22:27:11 +0200
Subject: net: rt_cache_flush() cleanup

We dont use jhash anymore since route cache removal,
so we can get rid of get_random_bytes() calls for rt_genid
changes.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f6436d3b207a..be27cfa96e88 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -447,23 +447,9 @@ static inline bool rt_is_expired(const struct rtable *rth)
 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
-/*
- * Perturbation of rt_genid by a small quantity [1..256]
- * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
- * many times (2^24) without giving recent rt_genid.
- * Jenkins hash is strong enough that litle changes of rt_genid are OK.
- */
-static void rt_cache_invalidate(struct net *net)
-{
-	unsigned char shuffle;
-
-	get_random_bytes(&shuffle, sizeof(shuffle));
-	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-}
-
 void rt_cache_flush(struct net *net)
 {
-	rt_cache_invalidate(net);
+	atomic_inc(&net->ipv4.rt_genid);
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -2520,8 +2506,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
 
 static __net_init int rt_genid_init(struct net *net)
 {
-	get_random_bytes(&net->ipv4.rt_genid,
-			 sizeof(net->ipv4.rt_genid));
+	atomic_set(&net->ipv4.rt_genid, 0);
 	get_random_bytes(&net->ipv4.dev_addr_genid,
 			 sizeof(net->ipv4.dev_addr_genid));
 	return 0;
-- 
cgit v1.2.1


From b42664f898c976247f7f609b8bb9c94d7475ca10 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 10 Sep 2012 22:09:44 +0000
Subject: netns: move net->ipv4.rt_genid to net->rt_genid

This commit prepares the use of rt_genid by both IPv4 and IPv6.
Initialization is left in IPv4 part.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index be27cfa96e88..fd9af60397b5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -202,11 +202,6 @@ EXPORT_SYMBOL(ip_tos2prio);
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline int rt_genid(struct net *net)
-{
-	return atomic_read(&net->ipv4.rt_genid);
-}
-
 #ifdef CONFIG_PROC_FS
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
@@ -449,7 +444,7 @@ static inline bool rt_is_expired(const struct rtable *rth)
 
 void rt_cache_flush(struct net *net)
 {
-	atomic_inc(&net->ipv4.rt_genid);
+	rt_genid_bump(net);
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -2506,7 +2501,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
 
 static __net_init int rt_genid_init(struct net *net)
 {
-	atomic_set(&net->ipv4.rt_genid, 0);
+	atomic_set(&net->rt_genid, 0);
 	get_random_bytes(&net->ipv4.dev_addr_genid,
 			 sizeof(net->ipv4.dev_addr_genid));
 	return 0;
-- 
cgit v1.2.1


From 1d57f19539c074105791da6384a8ad674bba8037 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 17 Sep 2012 12:51:39 +0000
Subject: tcp: fix regression in urgent data handling

Stephan Springl found that commit 1402d366019fed "tcp: introduce
tcp_try_coalesce" introduced a regression for rlogin

It turns out problem comes from TCP urgent data handling and
a change in behavior in input path.

rlogin sends two one-byte packets with URG ptr set, and when next data
frame is coalesced, we lack sk_data_ready() calls to wakeup consumer.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Stephan Springl <springl-k@lar.bfw.de>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6e38c6c23caa..d377f4854cb8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4661,7 +4661,7 @@ queue_and_out:
 
 		if (eaten > 0)
 			kfree_skb_partial(skb, fragstolen);
-		else if (!sock_flag(sk, SOCK_DEAD))
+		if (!sock_flag(sk, SOCK_DEAD))
 			sk->sk_data_ready(sk, 0);
 		return;
 	}
@@ -5556,8 +5556,7 @@ no_ack:
 #endif
 			if (eaten)
 				kfree_skb_partial(skb, fragstolen);
-			else
-				sk->sk_data_ready(sk, 0);
+			sk->sk_data_ready(sk, 0);
 			return 0;
 		}
 	}
-- 
cgit v1.2.1


From 6b78f16e4bdde3936b11690bdb970be423e07a07 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 13 Sep 2012 21:25:33 +0000
Subject: gre: add GSO support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add GSO support to GRE tunnels.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b062a98574f2..f233c1da2077 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -745,6 +745,10 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	__be32 dst;
 	int    mtu;
 
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    skb_checksum_help(skb))
+		goto tx_error;
+
 	if (dev->type == ARPHRD_ETHER)
 		IPCB(skb)->flags = 0;
 
@@ -1296,6 +1300,11 @@ static void ipgre_dev_free(struct net_device *dev)
 	free_netdev(dev);
 }
 
+#define GRE_FEATURES (NETIF_F_SG |		\
+		      NETIF_F_FRAGLIST |	\
+		      NETIF_F_HIGHDMA |		\
+		      NETIF_F_HW_CSUM)
+
 static void ipgre_tunnel_setup(struct net_device *dev)
 {
 	dev->netdev_ops		= &ipgre_netdev_ops;
@@ -1309,6 +1318,9 @@ static void ipgre_tunnel_setup(struct net_device *dev)
 	dev->addr_len		= 4;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
+
+	dev->features		|= GRE_FEATURES;
+	dev->hw_features	|= GRE_FEATURES;
 }
 
 static int ipgre_tunnel_init(struct net_device *dev)
-- 
cgit v1.2.1


From 15c041759bfcd9ab0a4e43f1c16e2644977d0467 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= <mkubecek@suse.cz>
Date: Fri, 14 Sep 2012 04:59:52 +0000
Subject: tcp: flush DMA queue before sk_wait_data if rcv_wnd is zero

If recv() syscall is called for a TCP socket so that
  - IOAT DMA is used
  - MSG_WAITALL flag is used
  - requested length is bigger than sk_rcvbuf
  - enough data has already arrived to bring rcv_wnd to zero
then when tcp_recvmsg() gets to calling sk_wait_data(), receive
window can be still zero while sk_async_wait_queue exhausts
enough space to keep it zero. As this queue isn't cleaned until
the tcp_service_net_dma() call, sk_wait_data() cannot receive
any data and blocks forever.

If zero receive window and non-empty sk_async_wait_queue is
detected before calling sk_wait_data(), process the queue first.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2109ff4a1daf..bf9a8ab29459 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1762,8 +1762,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		}
 
 #ifdef CONFIG_NET_DMA
-		if (tp->ucopy.dma_chan)
-			dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+		if (tp->ucopy.dma_chan) {
+			if (tp->rcv_wnd == 0 &&
+			    !skb_queue_empty(&sk->sk_async_wait_queue)) {
+				tcp_service_net_dma(sk, true);
+				tcp_cleanup_rbuf(sk, copied);
+			} else
+				dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+		}
 #endif
 		if (copied >= target) {
 			/* Do not sleep, just process backlog. */
-- 
cgit v1.2.1


From 6b102865e7ba9ff1e3c49c32c7187bb427d91798 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 18 Sep 2012 16:50:11 +0000
Subject: ipv6: unify fragment thresh handling code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Michal Kubeček <mkubecek@suse.cz>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 9 +++++++--
 net/ipv4/ip_fragment.c   | 5 ++---
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 85190e69297b..4750d2b74d79 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -89,7 +89,7 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 	nf->low_thresh = 0;
 
 	local_bh_disable();
-	inet_frag_evictor(nf, f);
+	inet_frag_evictor(nf, f, true);
 	local_bh_enable();
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
@@ -158,11 +158,16 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f)
+int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 {
 	struct inet_frag_queue *q;
 	int work, evicted = 0;
 
+	if (!force) {
+		if (atomic_read(&nf->mem) <= nf->high_thresh)
+			return 0;
+	}
+
 	work = atomic_read(&nf->mem) - nf->low_thresh;
 	while (work > 0) {
 		read_lock(&f->lock);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fa6a12c51066..448e68546827 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -219,7 +219,7 @@ static void ip_evictor(struct net *net)
 {
 	int evicted;
 
-	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
+	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
 	if (evicted)
 		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
 }
@@ -684,8 +684,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
 
 	/* Start by cleaning up the memory. */
-	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
-		ip_evictor(net);
+	ip_evictor(net);
 
 	/* Lookup (or create) queue header */
 	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
-- 
cgit v1.2.1


From bb68b64724a4fd6b93d83b39aeffa4aadb2562fc Mon Sep 17 00:00:00 2001
From: Christoph Paasch <christoph.paasch@uclouvain.be>
Date: Tue, 18 Sep 2012 14:19:23 +0000
Subject: ipv4: Don't add TCP-code in inet_sock_destruct

Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Acked-by: H.K. Jerry Chu <hkchu@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 2 --
 net/ipv4/tcp.c     | 7 +++++++
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 845372b025f6..766c59658563 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -149,8 +149,6 @@ void inet_sock_destruct(struct sock *sk)
 		pr_err("Attempt to release alive inet socket %p\n", sk);
 		return;
 	}
-	if (sk->sk_protocol == IPPROTO_TCP)
-		kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
 
 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index df83d744e380..7b1e940393cf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2325,6 +2325,13 @@ int tcp_disconnect(struct sock *sk, int flags)
 }
 EXPORT_SYMBOL(tcp_disconnect);
 
+void tcp_sock_destruct(struct sock *sk)
+{
+	inet_sock_destruct(sk);
+
+	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
+}
+
 static inline bool tcp_can_repair_sock(const struct sock *sk)
 {
 	return capable(CAP_NET_ADMIN) &&
-- 
cgit v1.2.1


From 4308fc58dceda40fac8a8d9c05b7cfba0a6bbed3 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 19 Sep 2012 15:46:06 +0100
Subject: tcp: Document use of undefined variable.

Both tcp_timewait_state_process and tcp_check_req use the same basic
construct of

	struct tcp_options received tmp_opt;
	tmp_opt.saw_tstamp = 0;

then call

	tcp_parse_options

However if they are fed a frame containing a TCP_SACK then tbe code
behaviour is undefined because opt_rx->sack_ok is undefined data.

This ought to be documented if it is intentional.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_minisocks.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e965319d610b..5792577b5bc5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -85,6 +85,8 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
  * spinlock it. I do not want! Well, probability of misbehaviour
  * is ridiculously low and, seems, we could use some mb() tricks
  * to avoid misread sequence numbers, states etc.  --ANK
+ *
+ * We don't need to initialize tmp_out.sack_ok as we don't use the results
  */
 enum tcp_tw_status
 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
@@ -522,6 +524,8 @@ EXPORT_SYMBOL(tcp_create_openreq_child);
  *
  * XXX (TFO) - The current impl contains a special check for ack
  * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
+ *
+ * We don't need to initialize tmp_opt.sack_ok as we don't use the results
  */
 
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
-- 
cgit v1.2.1


From bc26ccd8fc756749de95606d28314efd0ce5aec3 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Wed, 19 Sep 2012 09:40:00 +0000
Subject: tcp: restore rcv_wscale in a repair mode (v2)

rcv_wscale is a symetric parameter with snd_wscale.

Both this parameters are set on a connection handshake.

Without this value a remote window size can not be interpreted correctly,
because a value from a packet should be shifted on rcv_wscale.

And one more thing is that wscale_ok should be set too.

This patch doesn't break a backward compatibility.
If someone uses it in a old scheme, a rcv window
will be restored with the same bug (rcv_wscale = 0).

v2: Save backward compatibility on big-endian system. Before
    the first two bytes were snd_wscale and the second two bytes were
    rcv_wscale. Now snd_wscale is opt_val & 0xFFFF and rcv_wscale >> 16.
    This approach is independent on byte ordering.

Cc: David S. Miller <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
CC: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Vagin <avagin@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bf9a8ab29459..5f6419341821 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2331,10 +2331,17 @@ static int tcp_repair_options_est(struct tcp_sock *tp,
 			tp->rx_opt.mss_clamp = opt.opt_val;
 			break;
 		case TCPOPT_WINDOW:
-			if (opt.opt_val > 14)
-				return -EFBIG;
+			{
+				u16 snd_wscale = opt.opt_val & 0xFFFF;
+				u16 rcv_wscale = opt.opt_val >> 16;
+
+				if (snd_wscale > 14 || rcv_wscale > 14)
+					return -EFBIG;
 
-			tp->rx_opt.snd_wscale = opt.opt_val;
+				tp->rx_opt.snd_wscale = snd_wscale;
+				tp->rx_opt.rcv_wscale = rcv_wscale;
+				tp->rx_opt.wscale_ok = 1;
+			}
 			break;
 		case TCPOPT_SACK_PERM:
 			if (opt.opt_val != 0)
-- 
cgit v1.2.1


From b3d54b3e406b5d6ac391590bf7524e887e8e13c3 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Fri, 21 Sep 2012 11:37:59 +0200
Subject: netfilter: combine ipt_NETMAP and ip6t_NETMAP

Combine more modules since the actual code is so small anyway that the
kmod metadata and the module in its loaded state totally outweighs the
combined actual code size.

IP_NF_TARGET_NETMAP becomes a compat option; IP6_NF_TARGET_NETMAP
is completely eliminated since it has not see a release yet.

Signed-off-by: Jan Engelhardt <jengelh@inai.de>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig      |  11 ++---
 net/ipv4/netfilter/Makefile     |   1 -
 net/ipv4/netfilter/ipt_NETMAP.c | 101 ----------------------------------------
 3 files changed, 5 insertions(+), 108 deletions(-)
 delete mode 100644 net/ipv4/netfilter/ipt_NETMAP.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 131e53702e77..6f140084004f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -172,12 +172,11 @@ config IP_NF_TARGET_MASQUERADE
 config IP_NF_TARGET_NETMAP
 	tristate "NETMAP target support"
 	depends on NETFILTER_ADVANCED
-	help
-	  NETMAP is an implementation of static 1:1 NAT mapping of network
-	  addresses. It maps the network address part, while keeping the host
-	  address part intact.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
+	select NETFILTER_XT_TARGET_NETMAP
+	---help---
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_TARGET_NETMAP.
 
 config IP_NF_TARGET_REDIRECT
 	tristate "REDIRECT target support"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index b7dd18987237..f4446c5d6595 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -45,7 +45,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
 obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
 obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
 obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
-obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
 obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
 obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
deleted file mode 100644
index 85028dc0425d..000000000000
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/* NETMAP - static NAT mapping of IP network addresses (1:1).
- * The mapping can be applied to source (POSTROUTING),
- * destination (PREROUTING), or both (with separate rules).
- */
-
-/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
-MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
-
-static int netmap_tg_check(const struct xt_tgchk_param *par)
-{
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-
-	if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) {
-		pr_debug("bad MAP_IPS.\n");
-		return -EINVAL;
-	}
-	if (mr->rangesize != 1) {
-		pr_debug("bad rangesize %u.\n", mr->rangesize);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static unsigned int
-netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	__be32 new_ip, netmask;
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_range newrange;
-
-	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
-		     par->hooknum == NF_INET_POST_ROUTING ||
-		     par->hooknum == NF_INET_LOCAL_OUT ||
-		     par->hooknum == NF_INET_LOCAL_IN);
-	ct = nf_ct_get(skb, &ctinfo);
-
-	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
-
-	if (par->hooknum == NF_INET_PRE_ROUTING ||
-	    par->hooknum == NF_INET_LOCAL_OUT)
-		new_ip = ip_hdr(skb)->daddr & ~netmask;
-	else
-		new_ip = ip_hdr(skb)->saddr & ~netmask;
-	new_ip |= mr->range[0].min_ip & netmask;
-
-	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
-	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
-	newrange.flags	     = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
-	newrange.min_addr.ip = new_ip;
-	newrange.max_addr.ip = new_ip;
-	newrange.min_proto   = mr->range[0].min;
-	newrange.max_proto   = mr->range[0].max;
-
-	/* Hand modified range to generic setup. */
-	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
-}
-
-static struct xt_target netmap_tg_reg __read_mostly = {
-	.name 		= "NETMAP",
-	.family		= NFPROTO_IPV4,
-	.target 	= netmap_tg,
-	.targetsize	= sizeof(struct nf_nat_ipv4_multi_range_compat),
-	.table		= "nat",
-	.hooks		= (1 << NF_INET_PRE_ROUTING) |
-			  (1 << NF_INET_POST_ROUTING) |
-			  (1 << NF_INET_LOCAL_OUT) |
-			  (1 << NF_INET_LOCAL_IN),
-	.checkentry 	= netmap_tg_check,
-	.me 		= THIS_MODULE
-};
-
-static int __init netmap_tg_init(void)
-{
-	return xt_register_target(&netmap_tg_reg);
-}
-
-static void __exit netmap_tg_exit(void)
-{
-	xt_unregister_target(&netmap_tg_reg);
-}
-
-module_init(netmap_tg_init);
-module_exit(netmap_tg_exit);
-- 
cgit v1.2.1


From 2cbc78a29e76a2e92c172651204f3117491877d2 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Fri, 21 Sep 2012 11:41:34 +0200
Subject: netfilter: combine ipt_REDIRECT and ip6t_REDIRECT

Combine more modules since the actual code is so small anyway that the
kmod metadata and the module in its loaded state totally outweighs the
combined actual code size.

IP_NF_TARGET_REDIRECT becomes a compat option; IP6_NF_TARGET_REDIRECT
is completely eliminated since it has not see a release yet.

Signed-off-by: Jan Engelhardt <jengelh@inai.de>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig        |  12 ++--
 net/ipv4/netfilter/Makefile       |   1 -
 net/ipv4/netfilter/ipt_REDIRECT.c | 113 --------------------------------------
 3 files changed, 5 insertions(+), 121 deletions(-)
 delete mode 100644 net/ipv4/netfilter/ipt_REDIRECT.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 6f140084004f..d8d6f2a5bf12 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -181,13 +181,11 @@ config IP_NF_TARGET_NETMAP
 config IP_NF_TARGET_REDIRECT
 	tristate "REDIRECT target support"
 	depends on NETFILTER_ADVANCED
-	help
-	  REDIRECT is a special case of NAT: all incoming connections are
-	  mapped onto the incoming interface's address, causing the packets to
-	  come to the local machine instead of passing through.  This is
-	  useful for transparent proxies.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
+	select NETFILTER_XT_TARGET_REDIRECT
+	---help---
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_TARGET_REDIRECT.
 
 endif
 
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index f4446c5d6595..007b128eecc9 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -45,7 +45,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
 obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
 obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
 obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
-obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
 obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
deleted file mode 100644
index 11407d7d2472..000000000000
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Redirect.  Simple mapping which alters dst to a local IP address. */
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/netdevice.h>
-#include <linux/if.h>
-#include <linux/inetdevice.h>
-#include <net/protocol.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
-
-/* FIXME: Take multiple ranges --RR */
-static int redirect_tg_check(const struct xt_tgchk_param *par)
-{
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-
-	if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
-		pr_debug("bad MAP_IPS.\n");
-		return -EINVAL;
-	}
-	if (mr->rangesize != 1) {
-		pr_debug("bad rangesize %u.\n", mr->rangesize);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static unsigned int
-redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	__be32 newdst;
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_range newrange;
-
-	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
-		     par->hooknum == NF_INET_LOCAL_OUT);
-
-	ct = nf_ct_get(skb, &ctinfo);
-	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
-
-	/* Local packets: make them go to loopback */
-	if (par->hooknum == NF_INET_LOCAL_OUT)
-		newdst = htonl(0x7F000001);
-	else {
-		struct in_device *indev;
-		struct in_ifaddr *ifa;
-
-		newdst = 0;
-
-		rcu_read_lock();
-		indev = __in_dev_get_rcu(skb->dev);
-		if (indev && (ifa = indev->ifa_list))
-			newdst = ifa->ifa_local;
-		rcu_read_unlock();
-
-		if (!newdst)
-			return NF_DROP;
-	}
-
-	/* Transfer from original range. */
-	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
-	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
-	newrange.flags	     = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
-	newrange.min_addr.ip = newdst;
-	newrange.max_addr.ip = newdst;
-	newrange.min_proto   = mr->range[0].min;
-	newrange.max_proto   = mr->range[0].max;
-
-	/* Hand modified range to generic setup. */
-	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
-}
-
-static struct xt_target redirect_tg_reg __read_mostly = {
-	.name		= "REDIRECT",
-	.family		= NFPROTO_IPV4,
-	.target		= redirect_tg,
-	.targetsize	= sizeof(struct nf_nat_ipv4_multi_range_compat),
-	.table		= "nat",
-	.hooks		= (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
-	.checkentry	= redirect_tg_check,
-	.me		= THIS_MODULE,
-};
-
-static int __init redirect_tg_init(void)
-{
-	return xt_register_target(&redirect_tg_reg);
-}
-
-static void __exit redirect_tg_exit(void)
-{
-	xt_unregister_target(&redirect_tg_reg);
-}
-
-module_init(redirect_tg_init);
-module_exit(redirect_tg_exit);
-- 
cgit v1.2.1


From bf5b30b8a4416de04f1ac1196281ddb318669464 Mon Sep 17 00:00:00 2001
From: Zhao Hongjiang <zhaohongjiang@huawei.com>
Date: Thu, 20 Sep 2012 22:37:25 +0000
Subject: net: change return values from -EACCES to -EPERM

Change return value from -EACCES to -EPERM when the permission check fails.

Signed-off-by: Zhao Hongjiang <zhaohongjiang@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 9b55b6f5a585..e12fad773852 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -725,7 +725,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		break;
 
 	case SIOCSIFFLAGS:
-		ret = -EACCES;
+		ret = -EPERM;
 		if (!capable(CAP_NET_ADMIN))
 			goto out;
 		break;
@@ -733,7 +733,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCSIFBRDADDR:	/* Set the broadcast address */
 	case SIOCSIFDSTADDR:	/* Set the destination address */
 	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
-		ret = -EACCES;
+		ret = -EPERM;
 		if (!capable(CAP_NET_ADMIN))
 			goto out;
 		ret = -EINVAL;
-- 
cgit v1.2.1


From 5e953778a2aab04929a5e7b69f53dc26e39b079e Mon Sep 17 00:00:00 2001
From: Christoph Fritz <chf.fritz@googlemail.com>
Date: Fri, 21 Sep 2012 08:31:19 +0000
Subject: ipconfig: add nameserver IPs to kernel-parameter ip=

On small systems (e.g. embedded ones) IP addresses are often configured
by bootloaders and get assigned to kernel via parameter "ip=".  If set to
"ip=dhcp", even nameserver entries from DHCP daemons are handled. These
entries exported in /proc/net/pnp are commonly linked by /etc/resolv.conf.

To configure nameservers for networks without DHCP, this patch adds option
<dns0-ip> and <dns1-ip> to kernel-parameter 'ip='.

Signed-off-by: Christoph Fritz <chf.fritz@googlemail.com>
Tested-by: Jan Weitzel <j.weitzel@phytec.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 67e8a6b086ea..1c0e7e051044 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -743,14 +743,22 @@ static void __init ic_bootp_init_ext(u8 *e)
 
 
 /*
- *  Initialize the DHCP/BOOTP mechanism.
+ *  Predefine Nameservers
  */
-static inline void __init ic_bootp_init(void)
+static inline void __init ic_nameservers_predef(void)
 {
 	int i;
 
 	for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
 		ic_nameservers[i] = NONE;
+}
+
+/*
+ *  Initialize the DHCP/BOOTP mechanism.
+ */
+static inline void __init ic_bootp_init(void)
+{
+	ic_nameservers_predef();
 
 	dev_add_pack(&bootp_packet_type);
 }
@@ -1379,6 +1387,7 @@ static int __init ip_auto_config(void)
 	int retries = CONF_OPEN_RETRIES;
 #endif
 	int err;
+	unsigned int i;
 
 #ifdef CONFIG_PROC_FS
 	proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1499,7 +1508,15 @@ static int __init ip_auto_config(void)
 		&ic_servaddr, &root_server_addr, root_server_path);
 	if (ic_dev_mtu)
 		pr_cont(", mtu=%d", ic_dev_mtu);
-	pr_cont("\n");
+	for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+		if (ic_nameservers[i] != NONE) {
+			pr_info("     nameserver%u=%pI4",
+				i, &ic_nameservers[i]);
+			break;
+		}
+	for (i++; i < CONF_NAMESERVERS_MAX; i++)
+		if (ic_nameservers[i] != NONE)
+			pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]);
 #endif /* !SILENT */
 
 	return 0;
@@ -1570,6 +1587,8 @@ static int __init ip_auto_config_setup(char *addrs)
 		return 1;
 	}
 
+	ic_nameservers_predef();
+
 	/* Parse string for static IP assignment.  */
 	ip = addrs;
 	while (ip && *ip) {
@@ -1613,6 +1632,20 @@ static int __init ip_auto_config_setup(char *addrs)
 					ic_enable = 0;
 				}
 				break;
+			case 7:
+				if (CONF_NAMESERVERS_MAX >= 1) {
+					ic_nameservers[0] = in_aton(ip);
+					if (ic_nameservers[0] == ANY)
+						ic_nameservers[0] = NONE;
+				}
+				break;
+			case 8:
+				if (CONF_NAMESERVERS_MAX >= 2) {
+					ic_nameservers[1] = in_aton(ip);
+					if (ic_nameservers[1] == ANY)
+						ic_nameservers[1] = NONE;
+				}
+				break;
 			}
 		}
 		ip = cp;
-- 
cgit v1.2.1


From ab43ed8b7490cb387782423ecf74aeee7237e591 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 22 Sep 2012 00:08:29 +0000
Subject: ipv4: raw: fix icmp_filter()

icmp_filter() should not modify its input, or else its caller
would need to recompute ip_hdr() if skb->head is reallocated.

Use skb_header_pointer() instead of pskb_may_pull() and
change the prototype to make clear both sk and skb are const.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ff0f071969ea..d23c6571ba1c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,18 +131,20 @@ found:
  *	0 - deliver
  *	1 - block
  */
-static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
 {
-	int type;
+	struct icmphdr _hdr;
+	const struct icmphdr *hdr;
 
-	if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
+	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+				 sizeof(_hdr), &_hdr);
+	if (!hdr)
 		return 1;
 
-	type = icmp_hdr(skb)->type;
-	if (type < 32) {
+	if (hdr->type < 32) {
 		__u32 data = raw_sk(sk)->filter.data;
 
-		return ((1 << type) & data) != 0;
+		return ((1U << hdr->type) & data) != 0;
 	}
 
 	/* Do not block unknown ICMP types */
-- 
cgit v1.2.1


From 623df484a777f3c00c1ea3d6a7565b8d8ac688a1 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 22 Sep 2012 04:18:54 +0000
Subject: tcp: extract code to compute SYNACK RTT

In preparation for adding another spot where we compute the SYNACK
RTT, extract this code so that it can be shared.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e64abed249cc..1e66f7fb4fe6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1733,9 +1733,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(newsk);
-	if (tcp_rsk(req)->snt_synack)
-		tcp_valid_rtt_meas(newsk,
-		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
+	tcp_synack_rtt_meas(newsk, req);
 	newtp->total_retrans = req->retrans;
 
 #ifdef CONFIG_TCP_MD5SIG
-- 
cgit v1.2.1


From 016818d076871c4ee34db1e8d74dc17ac1de626a Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 22 Sep 2012 04:18:55 +0000
Subject: tcp: TCP Fast Open Server - take SYNACK RTT after completing 3WHS

When taking SYNACK RTT samples for servers using TCP Fast Open, fix
the code to ensure that we only call tcp_valid_rtt_meas() after we
receive the ACK that completes the 3-way handshake.

Previously we were always taking an RTT sample in
tcp_v4_syn_recv_sock(). However, for TCP Fast Open connections
tcp_v4_conn_req_fastopen() calls tcp_v4_syn_recv_sock() at the time we
receive the SYN. So for TFO we must wait until tcp_rcv_state_process()
to take the RTT sample.

To fix this, we wait until after TFO calls tcp_v4_syn_recv_sock()
before we set the snt_synack timestamp, since tcp_synack_rtt_meas()
already ensures that we only take a SYNACK RTT sample if snt_synack is
non-zero. To be careful, we only take a snt_synack timestamp when
a SYNACK transmit or retransmit succeeds.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c |  1 +
 net/ipv4/tcp_ipv4.c  | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e2bec815ff23..bb326684495b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5983,6 +5983,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 * need req so release it.
 				 */
 				if (req) {
+					tcp_synack_rtt_meas(sk, req);
 					reqsk_fastopen_remove(sk, req, false);
 				} else {
 					/* Make sure socket is routed, for
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1e66f7fb4fe6..0a7e020f16b5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -868,6 +868,8 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 					    ireq->rmt_addr,
 					    ireq->opt);
 		err = net_xmit_eval(err);
+		if (!tcp_rsk(req)->snt_synack && !err)
+			tcp_rsk(req)->snt_synack = tcp_time_stamp;
 	}
 
 	return err;
@@ -1382,6 +1384,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
 	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct sock *child;
+	int err;
 
 	req->retrans = 0;
 	req->sk = NULL;
@@ -1393,8 +1396,11 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
 		kfree_skb(skb_synack);
 		return -1;
 	}
-	ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
-			ireq->rmt_addr, ireq->opt);
+	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
+				    ireq->rmt_addr, ireq->opt);
+	err = net_xmit_eval(err);
+	if (!err)
+		tcp_rsk(req)->snt_synack = tcp_time_stamp;
 	/* XXX (TFO) - is it ok to ignore error and continue? */
 
 	spin_lock(&queue->fastopenq->lock);
@@ -1612,7 +1618,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		isn = tcp_v4_init_sequence(skb);
 	}
 	tcp_rsk(req)->snt_isn = isn;
-	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
 	if (dst == NULL) {
 		dst = inet_csk_route_req(sk, &fl4, req);
@@ -1650,6 +1655,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		if (err || want_cookie)
 			goto drop_and_free;
 
+		tcp_rsk(req)->snt_synack = tcp_time_stamp;
 		tcp_rsk(req)->listener = NULL;
 		/* Add the request_sock to the SYN table */
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
-- 
cgit v1.2.1


From 072539880156fafd86c753070e598c23e8e74586 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 22 Sep 2012 04:18:56 +0000
Subject: tcp: TCP Fast Open Server - note timestamps and retransmits for
 SYNACK RTT

Previously, when using TCP Fast Open a server would return from
tcp_check_req() before updating snt_synack based on TCP timestamp echo
replies and whether or not we've retransmitted the SYNACK. The result
was that (a) for TFO connections using timestamps we used an incorrect
baseline SYNACK send time (tcp_time_stamp of SYNACK send instead of
rcv_tsecr), and (b) for TFO connections that do not have TCP
timestamps but retransmit the SYNACK we took a SYNACK RTT sample when
we should not take a sample.

This fix merely moves the snt_synack update logic a bit earlier in the
function, so that connections using TCP Fast Open will properly do
these updates when the ACK for the SYNACK arrives.

Moving this snt_synack update logic means that with TCP_DEFER_ACCEPT
enabled we do a few instructions of wasted work on each bare ACK, but
that seems OK.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_minisocks.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 5792577b5bc5..27536ba16c9d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -692,6 +692,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	if (!(flg & TCP_FLAG_ACK))
 		return NULL;
 
+	/* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
+	if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
+		tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
+	else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
+		tcp_rsk(req)->snt_synack = 0;
+
 	/* For Fast Open no more processing is needed (sk is the
 	 * child socket).
 	 */
@@ -705,10 +711,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
 		return NULL;
 	}
-	if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
-		tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
-	else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
-		tcp_rsk(req)->snt_synack = 0;
 
 	/* OK, ACK is valid, create big socket and
 	 * feed this segment to it. It will repeat all
-- 
cgit v1.2.1


From e69bebde46b5ddfb7d8f17cae66031e282ebc464 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 22 Sep 2012 04:18:57 +0000
Subject: tcp: TCP Fast Open Server - call tcp_validate_incoming() for all
 packets

A TCP Fast Open (TFO) passive connection must call both
tcp_check_req() and tcp_validate_incoming() for all incoming ACKs that
are attempting to complete the 3WHS.

This is needed to parallel all the action that happens for a non-TFO
connection, where for an ACK that is attempting to complete the 3WHS
we call both tcp_check_req() and tcp_validate_incoming().

For example, upon receiving the ACK that completes the 3WHS, we need
to call tcp_fast_parse_options() and update ts_recent based on the
incoming timestamp value in the ACK.

One symptom of the problem with the previous code was that for passive
TFO connections using TCP timestamps, the outgoing TS ecr values
ignored the incoming TS val value on the ACK that completed the 3WHS.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bb326684495b..36e069a1f47b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5969,7 +5969,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 		if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
 			goto discard;
-	} else if (!tcp_validate_incoming(sk, skb, th, 0))
+	}
+	if (!tcp_validate_incoming(sk, skb, th, 0))
 		return 0;
 
 	/* step 5: check the ACK field */
-- 
cgit v1.2.1


From 30099b2e9b3a1f88250d31472edf2b6d6f0a7d8b Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 22 Sep 2012 17:03:47 +0000
Subject: tcp: TCP Fast Open Server - record retransmits after 3WHS

When recording the number of SYNACK retransmits for servers using TCP
Fast Open, fix the code to ensure that we copy over the retransmit
count from the request_sock after we receive the ACK that completes
the 3-way handshake.

The story here is similar to that of SYNACK RTT
measurements. Previously we were always doing this in
tcp_v4_syn_recv_sock(). However, for TCP Fast Open connections
tcp_v4_conn_req_fastopen() calls tcp_v4_syn_recv_sock() at the time we
receive the SYN. So for TFO we must copy the final SYNACK retransmit
count in tcp_rcv_state_process().

Note that copying over the SYNACK retransmit count will give us the
correct count since, as is mentioned in a comment in
tcp_retransmit_timer(), before we receive an ACK for our SYN-ACK a TFO
passive connection does not retransmit anything else (e.g., data or
FIN segments).

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 36e069a1f47b..e037697c3b77 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5985,6 +5985,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 */
 				if (req) {
 					tcp_synack_rtt_meas(sk, req);
+					tp->total_retrans = req->retrans;
+
 					reqsk_fastopen_remove(sk, req, false);
 				} else {
 					/* Make sure socket is routed, for
-- 
cgit v1.2.1


From 5640f7685831e088fe6c2e1f863a6805962f8e81 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 23 Sep 2012 23:04:42 +0000
Subject: net: use a per task frag allocator

We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.

This page is used to build fragments for skbs.

Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)

But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page

Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.

This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.

(up to 32768 bytes per frag, thats order-3 pages on x86)

This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.

Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536

Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 70 +++++++++++++++++++---------------------------
 net/ipv4/raw.c       | 19 ++++++++-----
 net/ipv4/tcp.c       | 79 +++++++++++++++-------------------------------------
 net/ipv4/tcp_ipv4.c  |  8 ------
 4 files changed, 62 insertions(+), 114 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a5beab1dc958..24a29a39e9a8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -793,6 +793,7 @@ static int __ip_append_data(struct sock *sk,
 			    struct flowi4 *fl4,
 			    struct sk_buff_head *queue,
 			    struct inet_cork *cork,
+			    struct page_frag *pfrag,
 			    int getfrag(void *from, char *to, int offset,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length, int transhdrlen,
@@ -987,47 +988,30 @@ alloc_new_skb:
 			}
 		} else {
 			int i = skb_shinfo(skb)->nr_frags;
-			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
-			struct page *page = cork->page;
-			int off = cork->off;
-			unsigned int left;
-
-			if (page && (left = PAGE_SIZE - off) > 0) {
-				if (copy >= left)
-					copy = left;
-				if (page != skb_frag_page(frag)) {
-					if (i == MAX_SKB_FRAGS) {
-						err = -EMSGSIZE;
-						goto error;
-					}
-					skb_fill_page_desc(skb, i, page, off, 0);
-					skb_frag_ref(skb, i);
-					frag = &skb_shinfo(skb)->frags[i];
-				}
-			} else if (i < MAX_SKB_FRAGS) {
-				if (copy > PAGE_SIZE)
-					copy = PAGE_SIZE;
-				page = alloc_pages(sk->sk_allocation, 0);
-				if (page == NULL)  {
-					err = -ENOMEM;
-					goto error;
-				}
-				cork->page = page;
-				cork->off = 0;
 
-				skb_fill_page_desc(skb, i, page, 0, 0);
-				frag = &skb_shinfo(skb)->frags[i];
-			} else {
-				err = -EMSGSIZE;
-				goto error;
-			}
-			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
-				    offset, copy, skb->len, skb) < 0) {
-				err = -EFAULT;
+			err = -ENOMEM;
+			if (!sk_page_frag_refill(sk, pfrag))
 				goto error;
+
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				err = -EMSGSIZE;
+				if (i == MAX_SKB_FRAGS)
+					goto error;
+
+				__skb_fill_page_desc(skb, i, pfrag->page,
+						     pfrag->offset, 0);
+				skb_shinfo(skb)->nr_frags = ++i;
+				get_page(pfrag->page);
 			}
-			cork->off += copy;
-			skb_frag_size_add(frag, copy);
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
+			if (getfrag(from,
+				    page_address(pfrag->page) + pfrag->offset,
+				    offset, copy, skb->len, skb) < 0)
+				goto error_efault;
+
+			pfrag->offset += copy;
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 			skb->len += copy;
 			skb->data_len += copy;
 			skb->truesize += copy;
@@ -1039,6 +1023,8 @@ alloc_new_skb:
 
 	return 0;
 
+error_efault:
+	err = -EFAULT;
 error:
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1079,8 +1065,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->tx_flags = ipc->tx_flags;
-	cork->page = NULL;
-	cork->off = 0;
 
 	return 0;
 }
@@ -1117,7 +1101,8 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
 		transhdrlen = 0;
 	}
 
-	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
+	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
+				sk_page_frag(sk), getfrag,
 				from, length, transhdrlen, flags);
 }
 
@@ -1439,7 +1424,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 	if (err)
 		return ERR_PTR(err);
 
-	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
+	err = __ip_append_data(sk, fl4, &queue, &cork,
+			       &current->task_frag, getfrag,
 			       from, length, transhdrlen, flags);
 	if (err) {
 		__ip_flush_pending_frames(sk, &queue, &cork);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index f2425785d40a..a80740ba4248 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,18 +131,23 @@ found:
  *	0 - deliver
  *	1 - block
  */
-static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
 {
-	int type;
-
-	if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
+	struct icmphdr _hdr;
+	const struct icmphdr *hdr;
+
+	pr_err("icmp_filter skb_transport_offset %d data-head %ld len %d/%d\n",
+		skb_transport_offset(skb), skb->data - skb->head, skb->len, skb->data_len);
+	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+				 sizeof(_hdr), &_hdr);
+	pr_err("head %p data %p hdr %p type %d\n", skb->head, skb->data, hdr, hdr ? hdr->type : -1);
+	if (!hdr)
 		return 1;
 
-	type = icmp_hdr(skb)->type;
-	if (type < 32) {
+	if (hdr->type < 32) {
 		__u32 data = raw_sk(sk)->filter.data;
 
-		return ((1 << type) & data) != 0;
+		return ((1U << hdr->type) & data) != 0;
 	}
 
 	/* Do not block unknown ICMP types */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7b1e940393cf..72ea4752f21b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1150,78 +1150,43 @@ new_segment:
 				if (err)
 					goto do_fault;
 			} else {
-				bool merge = false;
+				bool merge = true;
 				int i = skb_shinfo(skb)->nr_frags;
-				struct page *page = sk->sk_sndmsg_page;
-				int off;
-
-				if (page && page_count(page) == 1)
-					sk->sk_sndmsg_off = 0;
-
-				off = sk->sk_sndmsg_off;
-
-				if (skb_can_coalesce(skb, i, page, off) &&
-				    off != PAGE_SIZE) {
-					/* We can extend the last page
-					 * fragment. */
-					merge = true;
-				} else if (i == MAX_SKB_FRAGS || !sg) {
-					/* Need to add new fragment and cannot
-					 * do this because interface is non-SG,
-					 * or because all the page slots are
-					 * busy. */
-					tcp_mark_push(tp, skb);
-					goto new_segment;
-				} else if (page) {
-					if (off == PAGE_SIZE) {
-						put_page(page);
-						sk->sk_sndmsg_page = page = NULL;
-						off = 0;
+				struct page_frag *pfrag = sk_page_frag(sk);
+
+				if (!sk_page_frag_refill(sk, pfrag))
+					goto wait_for_memory;
+
+				if (!skb_can_coalesce(skb, i, pfrag->page,
+						      pfrag->offset)) {
+					if (i == MAX_SKB_FRAGS || !sg) {
+						tcp_mark_push(tp, skb);
+						goto new_segment;
 					}
-				} else
-					off = 0;
+					merge = false;
+				}
 
-				if (copy > PAGE_SIZE - off)
-					copy = PAGE_SIZE - off;
+				copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
 				if (!sk_wmem_schedule(sk, copy))
 					goto wait_for_memory;
 
-				if (!page) {
-					/* Allocate new cache page. */
-					if (!(page = sk_stream_alloc_page(sk)))
-						goto wait_for_memory;
-				}
-
-				/* Time to copy data. We are close to
-				 * the end! */
 				err = skb_copy_to_page_nocache(sk, from, skb,
-							       page, off, copy);
-				if (err) {
-					/* If this page was new, give it to the
-					 * socket so it does not get leaked.
-					 */
-					if (!sk->sk_sndmsg_page) {
-						sk->sk_sndmsg_page = page;
-						sk->sk_sndmsg_off = 0;
-					}
+							       pfrag->page,
+							       pfrag->offset,
+							       copy);
+				if (err)
 					goto do_error;
-				}
 
 				/* Update the skb. */
 				if (merge) {
 					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 				} else {
-					skb_fill_page_desc(skb, i, page, off, copy);
-					if (sk->sk_sndmsg_page) {
-						get_page(page);
-					} else if (off + copy < PAGE_SIZE) {
-						get_page(page);
-						sk->sk_sndmsg_page = page;
-					}
+					skb_fill_page_desc(skb, i, pfrag->page,
+							   pfrag->offset, copy);
+					get_page(pfrag->page);
 				}
-
-				sk->sk_sndmsg_off = off + copy;
+				pfrag->offset += copy;
 			}
 
 			if (!copied)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0a7e020f16b5..93406c583f43 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2200,14 +2200,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	if (inet_csk(sk)->icsk_bind_hash)
 		inet_put_port(sk);
 
-	/*
-	 * If sendmsg cached page exists, toss it.
-	 */
-	if (sk->sk_sndmsg_page) {
-		__free_page(sk->sk_sndmsg_page);
-		sk->sk_sndmsg_page = NULL;
-	}
-
 	/* TCP Cookie Transactions */
 	if (tp->cookie_values != NULL) {
 		kref_put(&tp->cookie_values->kref,
-- 
cgit v1.2.1


From 8489c1d9a80e6263fb71de5bc7f39773712cfa7e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 25 Sep 2012 08:21:16 +0200
Subject: net: raw: revert unrelated change

Commit 5640f7685831 ("net: use a per task frag allocator")
accidentally contained an unrelated change to net/ipv4/raw.c,
later committed (without the pr_err() debugging bits) in
net tree as commit ab43ed8b749 (ipv4: raw: fix icmp_filter())

This patch reverts this glitch, noticed by Stephen Rothwell.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a80740ba4248..f2425785d40a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,23 +131,18 @@ found:
  *	0 - deliver
  *	1 - block
  */
-static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
+static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
 {
-	struct icmphdr _hdr;
-	const struct icmphdr *hdr;
-
-	pr_err("icmp_filter skb_transport_offset %d data-head %ld len %d/%d\n",
-		skb_transport_offset(skb), skb->data - skb->head, skb->len, skb->data_len);
-	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
-				 sizeof(_hdr), &_hdr);
-	pr_err("head %p data %p hdr %p type %d\n", skb->head, skb->data, hdr, hdr ? hdr->type : -1);
-	if (!hdr)
+	int type;
+
+	if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
 		return 1;
 
-	if (hdr->type < 32) {
+	type = icmp_hdr(skb)->type;
+	if (type < 32) {
 		__u32 data = raw_sk(sk)->filter.data;
 
-		return ((1U << hdr->type) & data) != 0;
+		return ((1 << type) & data) != 0;
 	}
 
 	/* Do not block unknown ICMP types */
-- 
cgit v1.2.1


From 842b08bbee448b2069aaebb7f18b40942ad2a1bd Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 24 Sep 2012 22:09:58 +0000
Subject: ipconfig: fix trivial build error

The commit 5e953778a2aab04929a5e7b69f53dc26e39b079e ("ipconfig: add nameserver
IPs to kernel-parameter ip=") introduces ic_nameservers_predef() that defined
only for BOOTP. However it is used by ip_auto_config_setup() as well. This
patch moves it outside of #ifdef BOOTP.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Fritz <chf.fritz@googlemail.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 1c0e7e051044..798358b10717 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -582,6 +582,17 @@ static void __init ic_rarp_send_if(struct ic_device *d)
 }
 #endif
 
+/*
+ *  Predefine Nameservers
+ */
+static inline void __init ic_nameservers_predef(void)
+{
+	int i;
+
+	for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+		ic_nameservers[i] = NONE;
+}
+
 /*
  *	DHCP/BOOTP support.
  */
@@ -742,17 +753,6 @@ static void __init ic_bootp_init_ext(u8 *e)
 }
 
 
-/*
- *  Predefine Nameservers
- */
-static inline void __init ic_nameservers_predef(void)
-{
-	int i;
-
-	for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
-		ic_nameservers[i] = NONE;
-}
-
 /*
  *  Initialize the DHCP/BOOTP mechanism.
  */
-- 
cgit v1.2.1


From d208328765a0ab425e36b5b828285f3337a85451 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Mon, 24 Sep 2012 18:12:23 +0000
Subject: gre: fix handling of key 0

GRE driver incorrectly uses zero as a flag value. Zero is a perfectly
valid value for key, and the tunnel should match packets with no key only
with tunnels created without key, and vice versa.

This is a slightly visible  change since previously it might be possible to
construct a working tunnel that sent key 0 and received only because
of the key wildcard of zero.  I.e the sender sent key of zero, but tunnel
was defined without key.

Note: using gre key 0 requires iproute2 utilities v3.2 or later.
The original utility code was broken as well.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f233c1da2077..0d4c3832d490 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -214,11 +214,25 @@ static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
 	return tot;
 }
 
+/* Does key in tunnel parameters match packet */
+static bool ipgre_key_match(const struct ip_tunnel_parm *p,
+			    __u32 flags, __be32 key)
+{
+	if (p->i_flags & GRE_KEY) {
+		if (flags & GRE_KEY)
+			return key == p->i_key;
+		else
+			return false;	/* key expected, none present */
+	} else
+		return !(flags & GRE_KEY);
+}
+
 /* Given src, dst and key, find appropriate for input tunnel. */
 
 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
 					     __be32 remote, __be32 local,
-					     __be32 key, __be16 gre_proto)
+					     __u32 flags, __be32 key,
+					     __be16 gre_proto)
 {
 	struct net *net = dev_net(dev);
 	int link = dev->ifindex;
@@ -233,10 +247,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
 	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
 		if (local != t->parms.iph.saddr ||
 		    remote != t->parms.iph.daddr ||
-		    key != t->parms.i_key ||
 		    !(t->dev->flags & IFF_UP))
 			continue;
 
+		if (!ipgre_key_match(&t->parms, flags, key))
+			continue;
+
 		if (t->dev->type != ARPHRD_IPGRE &&
 		    t->dev->type != dev_type)
 			continue;
@@ -257,10 +273,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
 
 	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
 		if (remote != t->parms.iph.daddr ||
-		    key != t->parms.i_key ||
 		    !(t->dev->flags & IFF_UP))
 			continue;
 
+		if (!ipgre_key_match(&t->parms, flags, key))
+			continue;
+
 		if (t->dev->type != ARPHRD_IPGRE &&
 		    t->dev->type != dev_type)
 			continue;
@@ -283,10 +301,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
 		if ((local != t->parms.iph.saddr &&
 		     (local != t->parms.iph.daddr ||
 		      !ipv4_is_multicast(local))) ||
-		    key != t->parms.i_key ||
 		    !(t->dev->flags & IFF_UP))
 			continue;
 
+		if (!ipgre_key_match(&t->parms, flags, key))
+			continue;
+
 		if (t->dev->type != ARPHRD_IPGRE &&
 		    t->dev->type != dev_type)
 			continue;
@@ -489,6 +509,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 	const int code = icmp_hdr(skb)->code;
 	struct ip_tunnel *t;
 	__be16 flags;
+	__be32 key = 0;
 
 	flags = p[0];
 	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
@@ -505,6 +526,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 	if (skb_headlen(skb) < grehlen)
 		return;
 
+	if (flags & GRE_KEY)
+		key = *(((__be32 *)p) + (grehlen / 4) - 1);
+
 	switch (type) {
 	default:
 	case ICMP_PARAMETERPROB:
@@ -535,9 +559,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 
 	rcu_read_lock();
 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
-				flags & GRE_KEY ?
-				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
-				p[1]);
+				flags, key, p[1]);
+
 	if (t == NULL)
 		goto out;
 
@@ -642,9 +665,10 @@ static int ipgre_rcv(struct sk_buff *skb)
 	gre_proto = *(__be16 *)(h + 2);
 
 	rcu_read_lock();
-	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
-					  iph->saddr, iph->daddr, key,
-					  gre_proto))) {
+	tunnel = ipgre_tunnel_lookup(skb->dev,
+				     iph->saddr, iph->daddr, flags, key,
+				     gre_proto);
+	if (tunnel) {
 		struct pcpu_tstats *tstats;
 
 		secpath_reset(skb);
-- 
cgit v1.2.1


From 0c5794a66c7c6443759e6098841767958a031187 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Mon, 24 Sep 2012 18:12:24 +0000
Subject: gre: remove unnecessary rcu_read_lock/unlock

The gre function pointers for receive and error handling are
always called (from gre.c) with rcu_read_lock already held.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0d4c3832d490..1c012cb2cb94 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -557,37 +557,34 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 		break;
 	}
 
-	rcu_read_lock();
 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
 				flags, key, p[1]);
 
 	if (t == NULL)
-		goto out;
+		return;
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 				 t->parms.link, 0, IPPROTO_GRE, 0);
-		goto out;
+		return;
 	}
 	if (type == ICMP_REDIRECT) {
 		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
 			      IPPROTO_GRE, 0);
-		goto out;
+		return;
 	}
 	if (t->parms.iph.daddr == 0 ||
 	    ipv4_is_multicast(t->parms.iph.daddr))
-		goto out;
+		return;
 
 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
-		goto out;
+		return;
 
 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 		t->err_count++;
 	else
 		t->err_count = 1;
 	t->err_time = jiffies;
-out:
-	rcu_read_unlock();
 }
 
 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
@@ -625,7 +622,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 	__be16 gre_proto;
 
 	if (!pskb_may_pull(skb, 16))
-		goto drop_nolock;
+		goto drop;
 
 	iph = ip_hdr(skb);
 	h = skb->data;
@@ -636,7 +633,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 		   - We do not support routing headers.
 		 */
 		if (flags&(GRE_VERSION|GRE_ROUTING))
-			goto drop_nolock;
+			goto drop;
 
 		if (flags&GRE_CSUM) {
 			switch (skb->ip_summed) {
@@ -664,7 +661,6 @@ static int ipgre_rcv(struct sk_buff *skb)
 
 	gre_proto = *(__be16 *)(h + 2);
 
-	rcu_read_lock();
 	tunnel = ipgre_tunnel_lookup(skb->dev,
 				     iph->saddr, iph->daddr, flags, key,
 				     gre_proto);
@@ -740,14 +736,11 @@ static int ipgre_rcv(struct sk_buff *skb)
 
 		netif_rx(skb);
 
-		rcu_read_unlock();
 		return 0;
 	}
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
 drop:
-	rcu_read_unlock();
-drop_nolock:
 	kfree_skb(skb);
 	return 0;
 }
-- 
cgit v1.2.1


From b0558ef24a792906914fcad277f3befe2420e618 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Mon, 24 Sep 2012 18:12:25 +0000
Subject: xfrm: remove extranous rcu_read_lock

The handlers for xfrm_tunnel are always invoked with rcu read lock
already.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c | 5 -----
 net/ipv4/ipip.c   | 9 +--------
 2 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3511ffba7bd4..978bca4818ae 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -304,7 +304,6 @@ static int vti_err(struct sk_buff *skb, u32 info)
 
 	err = -ENOENT;
 
-	rcu_read_lock();
 	t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
 	if (t == NULL)
 		goto out;
@@ -326,7 +325,6 @@ static int vti_err(struct sk_buff *skb, u32 info)
 		t->err_count = 1;
 	t->err_time = jiffies;
 out:
-	rcu_read_unlock();
 	return err;
 }
 
@@ -336,7 +334,6 @@ static int vti_rcv(struct sk_buff *skb)
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
 
-	rcu_read_lock();
 	tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
@@ -348,10 +345,8 @@ static int vti_rcv(struct sk_buff *skb)
 		u64_stats_update_end(&tstats->syncp);
 
 		skb->dev = tunnel->dev;
-		rcu_read_unlock();
 		return 1;
 	}
-	rcu_read_unlock();
 
 	return -1;
 }
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 99af1f0cc658..618bde867ac1 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -365,8 +365,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	}
 
 	err = -ENOENT;
-
-	rcu_read_lock();
 	t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
 	if (t == NULL)
 		goto out;
@@ -398,7 +396,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 		t->err_count = 1;
 	t->err_time = jiffies;
 out:
-	rcu_read_unlock();
+
 	return err;
 }
 
@@ -416,13 +414,11 @@ static int ipip_rcv(struct sk_buff *skb)
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
 
-	rcu_read_lock();
 	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
 
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-			rcu_read_unlock();
 			kfree_skb(skb);
 			return 0;
 		}
@@ -445,11 +441,8 @@ static int ipip_rcv(struct sk_buff *skb)
 		ipip_ecn_decapsulate(iph, skb);
 
 		netif_rx(skb);
-
-		rcu_read_unlock();
 		return 0;
 	}
-	rcu_read_unlock();
 
 	return -1;
 }
-- 
cgit v1.2.1


From eccc1bb8d4b4cf68d3c9becb083fa94ada7d495c Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Tue, 25 Sep 2012 11:02:48 +0000
Subject: tunnel: drop packet if ECN present with not-ECT

Linux tunnels were written before RFC6040 and therefore never
implemented the corner case of ECN getting set in the outer header
and the inner header not being ready for it.

Section 4.2.  Default Tunnel Egress Behaviour.
 o If the inner ECN field is Not-ECT, the decapsulator MUST NOT
      propagate any other ECN codepoint onwards.  This is because the
      inner Not-ECT marking is set by transports that rely on dropped
      packets as an indication of congestion and would not understand or
      respond to any other ECN codepoint [RFC4774].  Specifically:

      *  If the inner ECN field is Not-ECT and the outer ECN field is
         CE, the decapsulator MUST drop the packet.

      *  If the inner ECN field is Not-ECT and the outer ECN field is
         Not-ECT, ECT(0), or ECT(1), the decapsulator MUST forward the
         outgoing packet with the ECN field cleared to Not-ECT.

This patch moves the ECN decap logic out of the individual tunnels
into a common place.

It also adds logging to allow detecting broken systems that
set ECN bits incorrectly when tunneling (or an intermediate
router might be changing the header).

Overloads rx_frame_error to keep track of ECN related error.

Thanks to Chris Wright who caught this while reviewing the new VXLAN
tunnel.

This code was tested by injecting faulty logic in other end GRE
to send incorrectly encapsulated packets.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 38 ++++++++++++++++++++++----------------
 net/ipv4/ipip.c   | 42 +++++++++++++++++++++++++-----------------
 2 files changed, 47 insertions(+), 33 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1c012cb2cb94..ef0b861ce044 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -120,6 +120,10 @@
    Alexey Kuznetsov.
  */
 
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
 static void ipgre_tunnel_setup(struct net_device *dev);
@@ -204,7 +208,9 @@ static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
 	tot->rx_crc_errors = dev->stats.rx_crc_errors;
 	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
 	tot->rx_length_errors = dev->stats.rx_length_errors;
+	tot->rx_frame_errors = dev->stats.rx_frame_errors;
 	tot->rx_errors = dev->stats.rx_errors;
+
 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
 	tot->tx_dropped = dev->stats.tx_dropped;
@@ -587,17 +593,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 	t->err_time = jiffies;
 }
 
-static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
-{
-	if (INET_ECN_is_ce(iph->tos)) {
-		if (skb->protocol == htons(ETH_P_IP)) {
-			IP_ECN_set_ce(ip_hdr(skb));
-		} else if (skb->protocol == htons(ETH_P_IPV6)) {
-			IP6_ECN_set_ce(ipv6_hdr(skb));
-		}
-	}
-}
-
 static inline u8
 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
 {
@@ -620,6 +615,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 	struct ip_tunnel *tunnel;
 	int    offset = 4;
 	__be16 gre_proto;
+	int    err;
 
 	if (!pskb_may_pull(skb, 16))
 		goto drop;
@@ -723,17 +719,27 @@ static int ipgre_rcv(struct sk_buff *skb)
 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 		}
 
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		skb_reset_network_header(skb);
+		err = IP_ECN_decapsulate(iph, skb);
+		if (unlikely(err)) {
+			if (log_ecn_error)
+				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+						     &iph->saddr, iph->tos);
+			if (err > 1) {
+				++tunnel->dev->stats.rx_frame_errors;
+				++tunnel->dev->stats.rx_errors;
+				goto drop;
+			}
+		}
+
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
 		u64_stats_update_begin(&tstats->syncp);
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		__skb_tunnel_rx(skb, tunnel->dev);
-
-		skb_reset_network_header(skb);
-		ipgre_ecn_decapsulate(iph, skb);
-
 		netif_rx(skb);
 
 		return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 618bde867ac1..e15b45297c09 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -120,6 +120,10 @@
 #define HASH_SIZE  16
 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
 static int ipip_net_id __read_mostly;
 struct ipip_net {
 	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
@@ -400,28 +404,18 @@ out:
 	return err;
 }
 
-static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
-					struct sk_buff *skb)
-{
-	struct iphdr *inner_iph = ip_hdr(skb);
-
-	if (INET_ECN_is_ce(outer_iph->tos))
-		IP_ECN_set_ce(inner_iph);
-}
-
 static int ipip_rcv(struct sk_buff *skb)
 {
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
+	int err;
 
 	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
 
-		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-			kfree_skb(skb);
-			return 0;
-		}
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
 
 		secpath_reset(skb);
 
@@ -430,21 +424,35 @@ static int ipip_rcv(struct sk_buff *skb)
 		skb->protocol = htons(ETH_P_IP);
 		skb->pkt_type = PACKET_HOST;
 
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		err = IP_ECN_decapsulate(iph, skb);
+		if (unlikely(err)) {
+			if (log_ecn_error)
+				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+						     &iph->saddr, iph->tos);
+			if (err > 1) {
+				++tunnel->dev->stats.rx_frame_errors;
+				++tunnel->dev->stats.rx_errors;
+				goto drop;
+			}
+		}
+
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
 		u64_stats_update_begin(&tstats->syncp);
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		__skb_tunnel_rx(skb, tunnel->dev);
-
-		ipip_ecn_decapsulate(iph, skb);
-
 		netif_rx(skb);
 		return 0;
 	}
 
 	return -1;
+
+drop:
+	kfree_skb(skb);
+	return 0;
 }
 
 /*
-- 
cgit v1.2.1


From 5dff747b7038d10f9c174a1245263fd1c36a644d Mon Sep 17 00:00:00 2001
From: Christoph Paasch <christoph.paasch@uclouvain.be>
Date: Wed, 26 Sep 2012 11:59:09 +0000
Subject: tcp: Remove unused parameter from tcp_v4_save_options

struct sock *sk is not used inside tcp_v4_save_options. Thus it can be
removed.

Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 93406c583f43..385eb79cf6aa 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -925,8 +925,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
 /*
  * Save and compile IPv4 options into the request_sock if needed.
  */
-static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
-						  struct sk_buff *skb)
+static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 {
 	const struct ip_options *opt = &(IPCB(skb)->opt);
 	struct ip_options_rcu *dopt = NULL;
@@ -1568,7 +1567,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	ireq->loc_addr = daddr;
 	ireq->rmt_addr = saddr;
 	ireq->no_srccheck = inet_sk(sk)->transparent;
-	ireq->opt = tcp_v4_save_options(sk, skb);
+	ireq->opt = tcp_v4_save_options(skb);
 
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
-- 
cgit v1.2.1


From bc9259a8bae9e814fc1f775a1b3effa13e6ad330 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 27 Sep 2012 04:11:00 +0000
Subject: inetpeer: fix token initialization

When jiffies wraps around (for example, 5 minutes after the boot, see
INITIAL_JIFFIES) and peer has just been created, now - peer->rate_last can be
< XRLIM_BURST_FACTOR * timeout, so token is not set to the maximum value, thus
some icmp packets can be unexpectedly dropped.

Fix this case by initializing last_rate to 60 seconds in the past.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inetpeer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index e1e0a4e8fd34..c7527f6b9ad9 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -510,7 +510,10 @@ relookup:
 					secure_ipv6_id(daddr->addr.a6));
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
-		p->rate_last = 0;
+		/* 60*HZ is arbitrary, but chosen enough high so that the first
+		 * calculation of tokens is at its maximum.
+		 */
+		p->rate_last = jiffies - 60*HZ;
 		INIT_LIST_HEAD(&p->gc_list);
 
 		/* Link the node. */
-- 
cgit v1.2.1


From 861b650101eb0c627d171eb18de81dddb93d395e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 27 Sep 2012 02:14:53 +0000
Subject: tcp: gro: add checksuming helpers

skb with CHECKSUM_NONE cant currently be handled by GRO, and
we notice this deep in GRO stack in tcp[46]_gro_receive()

But there are cases where GRO can be a benefit, even with a lack
of checksums.

This preliminary work is needed to add GRO support
to tunnels.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 385eb79cf6aa..75735c9a6a9d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2803,6 +2803,8 @@ void tcp4_proc_exit(void)
 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 {
 	const struct iphdr *iph = skb_gro_network_header(skb);
+	__wsum wsum;
+	__sum16 sum;
 
 	switch (skb->ip_summed) {
 	case CHECKSUM_COMPLETE:
@@ -2811,11 +2813,22 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 			break;
 		}
-
-		/* fall through */
-	case CHECKSUM_NONE:
+flush:
 		NAPI_GRO_CB(skb)->flush = 1;
 		return NULL;
+
+	case CHECKSUM_NONE:
+		wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+					  skb_gro_len(skb), IPPROTO_TCP, 0);
+		sum = csum_fold(skb_checksum(skb,
+					     skb_gro_offset(skb),
+					     skb_gro_len(skb),
+					     wsum));
+		if (sum)
+			goto flush;
+
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		break;
 	}
 
 	return tcp_gro_receive(head, skb);
-- 
cgit v1.2.1


From 60769a5dcd8755715c7143b4571d5c44f01796f1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 27 Sep 2012 02:48:50 +0000
Subject: ipv4: gre: add GRO capability

Add GRO capability to IPv4 GRE tunnels, using the gro_cells
infrastructure.

Tested using IPv4 and IPv6 TCP traffic inside this tunnel, and
checking GRO is building large packets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index ef0b861ce044..b1871d993d22 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -740,8 +740,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		netif_rx(skb);
-
+		gro_cells_receive(&tunnel->gro_cells, skb);
 		return 0;
 	}
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -1319,6 +1318,9 @@ static const struct net_device_ops ipgre_netdev_ops = {
 
 static void ipgre_dev_free(struct net_device *dev)
 {
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	gro_cells_destroy(&tunnel->gro_cells);
 	free_percpu(dev->tstats);
 	free_netdev(dev);
 }
@@ -1350,6 +1352,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel;
 	struct iphdr *iph;
+	int err;
 
 	tunnel = netdev_priv(dev);
 	iph = &tunnel->parms.iph;
@@ -1376,6 +1379,12 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
+	err = gro_cells_init(&tunnel->gro_cells, dev);
+	if (err) {
+		free_percpu(dev->tstats);
+		return err;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.1


From 9fbef059d63cbc3249a3ddfd9a12cb511c8be55e Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Mon, 1 Oct 2012 05:21:14 +0000
Subject: gre: fix sparse warning

Use be16 consistently when looking at flags.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b1871d993d22..7240f8e2dd45 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -222,7 +222,7 @@ static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
 
 /* Does key in tunnel parameters match packet */
 static bool ipgre_key_match(const struct ip_tunnel_parm *p,
-			    __u32 flags, __be32 key)
+			    __be16 flags, __be32 key)
 {
 	if (p->i_flags & GRE_KEY) {
 		if (flags & GRE_KEY)
@@ -237,7 +237,7 @@ static bool ipgre_key_match(const struct ip_tunnel_parm *p,
 
 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
 					     __be32 remote, __be32 local,
-					     __u32 flags, __be32 key,
+					     __be16 flags, __be32 key,
 					     __be16 gre_proto)
 {
 	struct net *net = dev_net(dev);
-- 
cgit v1.2.1


From 193ba924524e6afe192217982b2c2d67e4715d33 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Mon, 1 Oct 2012 12:32:34 +0000
Subject: igmp: export symbol ip_mc_leave_group

Needed for VXLAN.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 3479b98a00a7..736ab70fd179 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1904,6 +1904,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 	rtnl_unlock();
 	return ret;
 }
+EXPORT_SYMBOL(ip_mc_leave_group);
 
 int ip_mc_source(int add, int omode, struct sock *sk, struct
 	ip_mreq_source *mreqs, int ifindex)
-- 
cgit v1.2.1


From f4ef85bbda96324785097356336bc79cdd37db0a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 4 Oct 2012 01:25:26 +0000
Subject: ipv4: add a fib_type to fib_info

commit d2d68ba9fe8 (ipv4: Cache input routes in fib_info nexthops.)
introduced a regression for forwarding.

This was hard to reproduce but the symptom was that packets were
delivered to local host instead of being forwarded.

David suggested to add fib_type to fib_info so that we dont
inadvertently share same fib_info for different purposes.

With help from Julian Anastasov who provided very helpful
hints, reproduced here :

<quote>
        Can it be a problem related to fib_info reuse
from different routes. For example, when local IP address
is created for subnet we have:

broadcast 192.168.0.255 dev DEV  proto kernel  scope link  src
192.168.0.1
192.168.0.0/24 dev DEV  proto kernel  scope link  src 192.168.0.1
local 192.168.0.1 dev DEV  proto kernel  scope host  src 192.168.0.1

        The "dev DEV  proto kernel  scope link  src 192.168.0.1" is
a reused fib_info structure where we put cached routes.
The result can be same fib_info for 192.168.0.255 and
192.168.0.0/24. RTN_BROADCAST is cached only for input
routes. Incoming broadcast to 192.168.0.255 can be cached
and can cause problems for traffic forwarded to 192.168.0.0/24.
So, this patch should solve the problem because it
separates the broadcast from unicast traffic.

        And the ip_route_input_slow caching will work for
local and broadcast input routes (above routes 1 and 3) just
because they differ in scope and use different fib_info.

</quote>

Many thanks to Chris Clayton for his patience and help.

Reported-by: Chris Clayton <chris2553@googlemail.com>
Bisected-by: Chris Clayton <chris2553@googlemail.com>
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Tested-by: Chris Clayton <chris2553@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3509065e409a..267753060ffc 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -314,6 +314,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
 		    nfi->fib_scope == fi->fib_scope &&
 		    nfi->fib_prefsrc == fi->fib_prefsrc &&
 		    nfi->fib_priority == fi->fib_priority &&
+		    nfi->fib_type == fi->fib_type &&
 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
 			   sizeof(u32) * RTAX_MAX) == 0 &&
 		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
@@ -833,6 +834,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	fi->fib_flags = cfg->fc_flags;
 	fi->fib_priority = cfg->fc_priority;
 	fi->fib_prefsrc = cfg->fc_prefsrc;
+	fi->fib_type = cfg->fc_type;
 
 	fi->fib_nhs = nhs;
 	change_nexthops(fi) {
-- 
cgit v1.2.1