Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1699 commits)
  bnx2/bnx2x: Unsupported Ethtool operations should return -EINVAL.
  vlan: Calling vlan_hwaccel_do_receive() is always valid.
  tproxy: use the interface primary IP address as a default value for --on-ip
  tproxy: added IPv6 support to the socket match
  cxgb3: function namespace cleanup
  tproxy: added IPv6 support to the TPROXY target
  tproxy: added IPv6 socket lookup function to nf_tproxy_core
  be2net: Changes to use only priority codes allowed by f/w
  tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled
  tproxy: added tproxy sockopt interface in the IPV6 layer
  tproxy: added udp6_lib_lookup function
  tproxy: added const specifiers to udp lookup functions
  tproxy: split off ipv6 defragmentation to a separate module
  l2tp: small cleanup
  nf_nat: restrict ICMP translation for embedded header
  can: mcp251x: fix generation of error frames
  can: mcp251x: fix endless loop in interrupt handler if CANINTF_MERRF is set
  can-raw: add msg_flags to distinguish local traffic
  9p: client code cleanup
  rds: make local functions/variables static
  ...

Fix up conflicts in net/core/dev.c, drivers/net/pcmcia/smc91c92_cs.c and
drivers/net/wireless/ath/ath9k/debug.c as per David
author: Linus Torvalds <torvalds@linux-foundation.org> 2010-10-23 11:47:02 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-10-23 11:47:02 -0700
commit: 5f05647dd81c11a6a165ccc8f0c1370b16f3bcb0 (patch)
tree: 7851ef1c93aa1aba7ef327ca4b75fd35e6d10f29 /net/netfilter
parent: 02f36038c568111ad4fc433f6fa760ff5e38fab4 (diff)
parent: ec37a48d1d16c30b655ac5280209edf52a6775d4 (diff)
download: talos-op-linux-5f05647dd81c11a6a165ccc8f0c1370b16f3bcb0.tar.gz
talos-op-linux-5f05647dd81c11a6a165ccc8f0c1370b16f3bcb0.zip
29 files changed, 2993 insertions, 1288 deletions
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index fdaec7daff1d..85dabb86be6f 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -105,10 +105,8 @@ EXPORT_SYMBOL(nf_register_hooks);
 
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
-	unsigned int i;
-
-	for (i = 0; i < n; i++)
-		nf_unregister_hook(&reg[i]);
+	while (n-- > 0)
+		nf_unregister_hook(&reg[n]);
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 46a77d5c3887..a22dac227055 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -3,7 +3,7 @@
 #
 menuconfig IP_VS
 	tristate "IP virtual server support"
-	depends on NET && INET && NETFILTER && NF_CONNTRACK
+	depends on NET && INET && NETFILTER
 	---help---
 	  IP Virtual Server support will let you build a high-performance
 	  virtual server based on cluster of two or more real servers. This
@@ -235,7 +235,8 @@ comment 'IPVS application helper'
 
 config	IP_VS_FTP
   	tristate "FTP protocol helper"
-        depends on IP_VS_PROTO_TCP && NF_NAT
+        depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+	select IP_VS_NFCT
 	---help---
 	  FTP is a protocol that transfers IP address and/or port number in
 	  the payload. In the virtual server via Network Address Translation,
@@ -247,4 +248,19 @@ config	IP_VS_FTP
 	  If you want to compile it in kernel, say Y. To compile it as a
 	  module, choose M here. If unsure, say N.
 
+config	IP_VS_NFCT
+	bool "Netfilter connection tracking"
+	depends on NF_CONNTRACK
+	---help---
+	  The Netfilter connection tracking support allows the IPVS
+	  connection state to be exported to the Netfilter framework
+	  for filtering purposes.
+
+config	IP_VS_PE_SIP
+	tristate "SIP persistence engine"
+        depends on IP_VS_PROTO_UDP
+	depends on NF_CONNTRACK_SIP
+	---help---
+	  Allow persistence based on the SIP Call-ID
+
 endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index e3baefd7066e..34ee602ddb66 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
 
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
 ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
 		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
-		ip_vs_est.o ip_vs_proto.o 				   \
-		$(ip_vs_proto-objs-y)
+		ip_vs_est.o ip_vs_proto.o ip_vs_pe.o			   \
+		$(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
 
 
 # IPVS core
@@ -32,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index e76f87f4aca8..a475edee0912 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
 		goto out;
 
 	list_add(&inc->a_list, &app->incs_list);
-	IP_VS_DBG(9, "%s application %s:%u registered\n",
-		  pp->name, inc->name, inc->port);
+	IP_VS_DBG(9, "%s App %s:%u registered\n",
+		  pp->name, inc->name, ntohs(inc->port));
 
 	return 0;
 
@@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
 		pp->unregister_app(inc);
 
 	IP_VS_DBG(9, "%s App %s:%u unregistered\n",
-		  pp->name, inc->name, inc->port);
+		  pp->name, inc->name, ntohs(inc->port));
 
 	list_del(&inc->a_list);
 
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index b71c69a2db13..e9adecdc8ca4 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
 		& ip_vs_conn_tab_mask;
 }
 
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+					     bool inverse)
+{
+	const union nf_inet_addr *addr;
+	__be16 port;
+
+	if (p->pe_data && p->pe->hashkey_raw)
+		return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+			ip_vs_conn_tab_mask;
+
+	if (likely(!inverse)) {
+		addr = p->caddr;
+		port = p->cport;
+	} else {
+		addr = p->vaddr;
+		port = p->vport;
+	}
+
+	return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+	struct ip_vs_conn_param p;
+
+	ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
+			      NULL, 0, &p);
+
+	if (cp->dest && cp->dest->svc->pe) {
+		p.pe = cp->dest->svc->pe;
+		p.pe_data = cp->pe_data;
+		p.pe_data_len = cp->pe_data_len;
+	}
+
+	return ip_vs_conn_hashkey_param(&p, false);
+}
 
 /*
  *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
@@ -162,7 +198,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 		return 0;
 
 	/* Hash by protocol, client address and port */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey_conn(cp);
 
 	ct_write_lock(hash);
 	spin_lock(&cp->lock);
@@ -195,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	int ret;
 
 	/* unhash it and decrease its reference counter */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey_conn(cp);
 
 	ct_write_lock(hash);
 	spin_lock(&cp->lock);
@@ -218,27 +254,26 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 /*
  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
  *  Called for pkts coming from OUTside-to-INside.
- *	s_addr, s_port: pkt source address (foreign host)
- *	d_addr, d_port: pkt dest address (load balancer)
+ *	p->caddr, p->cport: pkt source address (foreign host)
+ *	p->vaddr, p->vport: pkt dest address (load balancer)
  */
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey_param(p, false);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
-		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
-		    protocol == cp->protocol) {
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
+		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			ct_read_unlock(hash);
@@ -251,99 +286,111 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
 	return NULL;
 }
 
-struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	struct ip_vs_conn *cp;
 
-	cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
-	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
-		cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
-					 d_port);
+	cp = __ip_vs_conn_in_get(p);
+	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+		struct ip_vs_conn_param cport_zero_p = *p;
+		cport_zero_p.cport = 0;
+		cp = __ip_vs_conn_in_get(&cport_zero_p);
+	}
 
 	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      cp ? "hit" : "not hit");
 
 	return cp;
 }
 
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+			    const struct ip_vs_iphdr *iph,
+			    unsigned int proto_off, int inverse,
+			    struct ip_vs_conn_param *p)
+{
+	__be16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return 1;
+
+	if (likely(!inverse))
+		ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
+				      &iph->daddr, pptr[1], p);
+	else
+		ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
+				      &iph->saddr, pptr[0], p);
+	return 0;
+}
+
 struct ip_vs_conn *
 ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
 			struct ip_vs_protocol *pp,
 			const struct ip_vs_iphdr *iph,
 			unsigned int proto_off, int inverse)
 {
-	__be16 _ports[2], *pptr;
+	struct ip_vs_conn_param p;
 
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
+	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
 		return NULL;
 
-	if (likely(!inverse))
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->saddr, pptr[0],
-					 &iph->daddr, pptr[1]);
-	else
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->daddr, pptr[1],
-					 &iph->saddr, pptr[0]);
+	return ip_vs_conn_in_get(&p);
 }
 EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
 
 /* Get reference to connection template */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey_param(p, false);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+		if (p->pe_data && p->pe->ct_match) {
+			if (p->pe->ct_match(p, cp))
+				goto out;
+			continue;
+		}
+
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 		    /* protocol should only be IPPROTO_IP if
-		     * d_addr is a fwmark */
-		    ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af,
-		                     d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
+		     * p->vaddr is a fwmark */
+		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+				     p->af, p->vaddr, &cp->vaddr) &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    protocol == cp->protocol) {
-			/* HIT */
-			atomic_inc(&cp->refcnt);
+		    p->protocol == cp->protocol)
 			goto out;
-		}
 	}
 	cp = NULL;
 
   out:
+	if (cp)
+		atomic_inc(&cp->refcnt);
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      cp ? "hit" : "not hit");
 
 	return cp;
 }
 
-/*
- *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- *  Called for pkts coming from inside-to-OUTside.
- *	s_addr, s_port: pkt source address (inside host)
- *	d_addr, d_port: pkt dest address (foreign host)
- */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ *	p->caddr, p->cport: pkt source address (inside host)
+ *	p->vaddr, p->vport: pkt dest address (foreign host) */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp, *ret=NULL;
@@ -351,16 +398,16 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	/*
 	 *	Check for "full" addressed entries
 	 */
-	hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
+	hash = ip_vs_conn_hashkey_param(p, true);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
-		    d_port == cp->cport && s_port == cp->dport &&
-		    protocol == cp->protocol) {
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+		    p->vport == cp->cport && p->cport == cp->dport &&
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			ret = cp;
@@ -371,9 +418,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      ret ? "hit" : "not hit");
 
 	return ret;
@@ -385,20 +432,12 @@ ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
 			 const struct ip_vs_iphdr *iph,
 			 unsigned int proto_off, int inverse)
 {
-	__be16 _ports[2], *pptr;
+	struct ip_vs_conn_param p;
 
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
+	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
 		return NULL;
 
-	if (likely(!inverse))
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->saddr, pptr[0],
-					  &iph->daddr, pptr[1]);
-	else
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->daddr, pptr[1],
-					  &iph->saddr, pptr[0]);
+	return ip_vs_conn_out_get(&p);
 }
 EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
 
@@ -505,6 +544,8 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
 static inline void
 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 {
+	unsigned int conn_flags;
+
 	/* if dest is NULL, then return directly */
 	if (!dest)
 		return;
@@ -512,16 +553,20 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 	/* Increase the refcnt counter of the dest */
 	atomic_inc(&dest->refcnt);
 
+	conn_flags = atomic_read(&dest->conn_flags);
+	if (cp->protocol != IPPROTO_UDP)
+		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
 	/* Bind with the destination and its corresponding transmitter */
-	if ((cp->flags & IP_VS_CONN_F_SYNC) &&
-	    (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+	if (cp->flags & IP_VS_CONN_F_SYNC) {
 		/* if the connection is not template and is created
 		 * by sync, preserve the activity flag.
 		 */
-		cp->flags |= atomic_read(&dest->conn_flags) &
-			     (~IP_VS_CONN_F_INACTIVE);
-	else
-		cp->flags |= atomic_read(&dest->conn_flags);
+		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+		/* connections inherit forwarding method from dest */
+		cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
+	}
+	cp->flags |= conn_flags;
 	cp->dest = dest;
 
 	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
@@ -717,6 +762,10 @@ static void ip_vs_conn_expire(unsigned long data)
 		if (cp->control)
 			ip_vs_control_del(cp);
 
+		if (cp->flags & IP_VS_CONN_F_NFCT)
+			ip_vs_conn_drop_conntrack(cp);
+
+		kfree(cp->pe_data);
 		if (unlikely(cp->app != NULL))
 			ip_vs_unbind_app(cp);
 		ip_vs_unbind_dest(cp);
@@ -751,13 +800,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
  *	Create a new connection entry and hash it into the ip_vs_conn_tab
  */
 struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
-	       const union nf_inet_addr *vaddr, __be16 vport,
+ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
 	       struct ip_vs_dest *dest)
 {
 	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+	struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
 
 	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
 	if (cp == NULL) {
@@ -767,17 +815,21 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 
 	INIT_LIST_HEAD(&cp->c_list);
 	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
-	cp->af		   = af;
-	cp->protocol	   = proto;
-	ip_vs_addr_copy(af, &cp->caddr, caddr);
-	cp->cport	   = cport;
-	ip_vs_addr_copy(af, &cp->vaddr, vaddr);
-	cp->vport	   = vport;
+	cp->af		   = p->af;
+	cp->protocol	   = p->protocol;
+	ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+	cp->cport	   = p->cport;
+	ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+	cp->vport	   = p->vport;
 	/* proto should only be IPPROTO_IP if d_addr is a fwmark */
-	ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
+	ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
 			&cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
+	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+		cp->pe_data = p->pe_data;
+		cp->pe_data_len = p->pe_data_len;
+	}
 	spin_lock_init(&cp->lock);
 
 	/*
@@ -803,7 +855,7 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 
 	/* Bind its packet transmitter */
 #ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
+	if (p->af == AF_INET6)
 		ip_vs_bind_xmit_v6(cp);
 	else
 #endif
@@ -812,13 +864,22 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 	if (unlikely(pp && atomic_read(&pp->appcnt)))
 		ip_vs_bind_app(cp, pp);
 
+	/*
+	 * Allow conntrack to be preserved. By default, conntrack
+	 * is created and destroyed for every packet.
+	 * Sometimes keeping conntrack can be useful for
+	 * IP_VS_CONN_F_ONE_PACKET too.
+	 */
+
+	if (ip_vs_conntrack_enabled())
+		cp->flags |= IP_VS_CONN_F_NFCT;
+
 	/* Hash it in the ip_vs_conn_tab finally */
 	ip_vs_conn_hash(cp);
 
 	return cp;
 }
 
-
 /*
  *	/proc/net/ip_vs_conn entries
  */
@@ -834,7 +895,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
 			if (pos-- == 0) {
 				seq->private = &ip_vs_conn_tab[idx];
-				return cp;
+			return cp;
 			}
 		}
 		ct_read_unlock_bh(idx);
@@ -891,30 +952,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 
 	if (v == SEQ_START_TOKEN)
 		seq_puts(seq,
-   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
 	else {
 		const struct ip_vs_conn *cp = v;
+		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+		size_t len = 0;
+
+		if (cp->dest && cp->pe_data &&
+		    cp->dest->svc->pe->show_pe_data) {
+			pe_data[0] = ' ';
+			len = strlen(cp->dest->svc->pe->name);
+			memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+			pe_data[len + 1] = ' ';
+			len += 2;
+			len += cp->dest->svc->pe->show_pe_data(cp,
+							       pe_data + len);
+		}
+		pe_data[len] = '\0';
 
 #ifdef CONFIG_IP_VS_IPV6
 		if (cp->af == AF_INET6)
-			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
+			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+				"%pI6 %04X %-11s %7lu%s\n",
 				ip_vs_proto_name(cp->protocol),
 				&cp->caddr.in6, ntohs(cp->cport),
 				&cp->vaddr.in6, ntohs(cp->vport),
 				&cp->daddr.in6, ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
+				(cp->timer.expires-jiffies)/HZ, pe_data);
 		else
 #endif
 			seq_printf(seq,
 				"%-3s %08X %04X %08X %04X"
-				" %08X %04X %-11s %7lu\n",
+				" %08X %04X %-11s %7lu%s\n",
 				ip_vs_proto_name(cp->protocol),
 				ntohl(cp->caddr.ip), ntohs(cp->cport),
 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
 				ntohl(cp->daddr.ip), ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
+				(cp->timer.expires-jiffies)/HZ, pe_data);
 	}
 	return 0;
 }
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 4c2f89df5cce..b4e51e9c5a04 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -40,6 +40,7 @@
 #include <net/udp.h>
 #include <net/icmp.h>                   /* for icmp_send */
 #include <net/route.h>
+#include <net/ip6_checksum.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -47,6 +48,7 @@
 #ifdef CONFIG_IP_VS_IPV6
 #include <net/ipv6.h>
 #include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
 #endif
 
 #include <net/ip_vs.h>
@@ -175,6 +177,18 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 	return pp->state_transition(cp, direction, skb, pp);
 }
 
+static inline void
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+			      struct sk_buff *skb, int protocol,
+			      const union nf_inet_addr *caddr, __be16 cport,
+			      const union nf_inet_addr *vaddr, __be16 vport,
+			      struct ip_vs_conn_param *p)
+{
+	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+	p->pe = svc->pe;
+	if (p->pe && p->pe->fill_param)
+		p->pe->fill_param(p, skb);
+}
 
 /*
  *  IPVS persistent scheduling function
@@ -185,15 +199,16 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
  */
 static struct ip_vs_conn *
 ip_vs_sched_persist(struct ip_vs_service *svc,
-		    const struct sk_buff *skb,
+		    struct sk_buff *skb,
 		    __be16 ports[2])
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
-	__be16  dport;			/* destination port to forward */
-	__be16  flags;
+	__be16 dport = 0;		/* destination port to forward */
+	unsigned int flags;
+	struct ip_vs_conn_param param;
 	union nf_inet_addr snet;	/* source network of the client,
 					   after masking */
 
@@ -226,120 +241,75 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 	 * is created for other persistent services.
 	 */
-	if (ports[1] == svc->port) {
-		/* Check if a template already exists */
-		if (svc->port != FTPPORT)
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, ports[1]);
-		else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
-
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * No template found or the dest of the connection
-			 * template is not available.
-			 */
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
-			}
-
-			/*
-			 * Create a template like <protocol,caddr,0,
-			 * vaddr,vport,daddr,dport> for non-ftp service,
-			 * and <protocol,caddr,0,vaddr,0,daddr,0>
-			 * for ftp service.
+	{
+		int protocol = iph.protocol;
+		const union nf_inet_addr *vaddr = &iph.daddr;
+		const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+		__be16 vport = 0;
+
+		if (ports[1] == svc->port) {
+			/* non-FTP template:
+			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+			 * FTP template:
+			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
 			 */
 			if (svc->port != FTPPORT)
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr,
-						    ports[1],
-						    &dest->addr, dest->port,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
-
-			ct->timeout = svc->timeout;
+				vport = ports[1];
 		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
-		}
-		dport = dest->port;
-	} else {
-		/*
-		 * Note: persistent fwmark-based services and persistent
-		 * port zero service are handled here.
-		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
-		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
-		 */
-		if (svc->fwmark) {
-			union nf_inet_addr fwmark = {
-				.ip = htonl(svc->fwmark)
-			};
-
-			ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
-					     &fwmark, 0);
-		} else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
-
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * If it is not persistent port zero, return NULL,
-			 * otherwise create a connection template.
+			/* Note: persistent fwmark-based services and
+			 * persistent port zero service are handled here.
+			 * fwmark template:
+			 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+			 * port zero template:
+			 * <protocol,caddr,0,vaddr,0,daddr,0>
 			 */
-			if (svc->port)
-				return NULL;
-
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
+			if (svc->fwmark) {
+				protocol = IPPROTO_IP;
+				vaddr = &fwmark;
 			}
+		}
+		ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+					      vaddr, vport, &param);
+	}
 
-			/*
-			 * Create a template according to the service
-			 */
-			if (svc->fwmark) {
-				union nf_inet_addr fwmark = {
-					.ip = htonl(svc->fwmark)
-				};
-
-				ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
-						    &snet, 0,
-						    &fwmark, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			} else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
-
-			ct->timeout = svc->timeout;
-		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
+	/* Check if a template already exists */
+	ct = ip_vs_ct_in_get(&param);
+	if (!ct || !ip_vs_check_template(ct)) {
+		/* No template found or the dest of the connection
+		 * template is not available.
+		 */
+		dest = svc->scheduler->schedule(svc, skb);
+		if (!dest) {
+			IP_VS_DBG(1, "p-schedule: no dest found.\n");
+			kfree(param.pe_data);
+			return NULL;
 		}
-		dport = ports[1];
+
+		if (ports[1] == svc->port && svc->port != FTPPORT)
+			dport = dest->port;
+
+		/* Create a template
+		 * This adds param.pe_data to the template,
+		 * and thus param.pe_data will be destroyed
+		 * when the template expires */
+		ct = ip_vs_conn_new(&param, &dest->addr, dport,
+				    IP_VS_CONN_F_TEMPLATE, dest);
+		if (ct == NULL) {
+			kfree(param.pe_data);
+			return NULL;
+		}
+
+		ct->timeout = svc->timeout;
+	} else {
+		/* set destination with the found template */
+		dest = ct->dest;
+		kfree(param.pe_data);
 	}
 
+	dport = ports[1];
+	if (dport == svc->port && dest->port)
+		dport = dest->port;
+
 	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 		 && iph.protocol == IPPROTO_UDP)?
 		IP_VS_CONN_F_ONE_PACKET : 0;
@@ -347,12 +317,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	/*
 	 *    Create a new connection according to the template
 	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, ports[0],
-			    &iph.daddr, ports[1],
-			    &dest->addr, dport,
-			    flags,
-			    dest);
+	ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
+			      &iph.daddr, ports[1], &param);
+	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
 		return NULL;
@@ -376,23 +343,53 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  *  Protocols supported: TCP, UDP
  */
 struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+	       struct ip_vs_protocol *pp, int *ignored)
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
-	__be16 _ports[2], *pptr, flags;
+	__be16 _ports[2], *pptr;
+	unsigned int flags;
 
+	*ignored = 1;
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 	if (pptr == NULL)
 		return NULL;
 
 	/*
+	 * FTPDATA needs this check when using local real server.
+	 * Never schedule Active FTPDATA connections from real server.
+	 * For LVS-NAT they must be already created. For other methods
+	 * with persistence the connection is created on SYN+ACK.
+	 */
+	if (pptr[0] == FTPDATA) {
+		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+			      "Not scheduling FTPDATA");
+		return NULL;
+	}
+
+	/*
+	 * Do not schedule replies from local real server. It is risky
+	 * for fwmark services but mostly for persistent services.
+	 */
+	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+	    (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
+	    (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+			      "Not scheduling reply for existing connection");
+		__ip_vs_conn_put(cp);
+		return NULL;
+	}
+
+	/*
 	 *    Persistent service
 	 */
-	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+	if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+		*ignored = 0;
 		return ip_vs_sched_persist(svc, skb, pptr);
+	}
 
 	/*
 	 *    Non-persistent service
@@ -405,6 +402,8 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		return NULL;
 	}
 
+	*ignored = 0;
+
 	dest = svc->scheduler->schedule(svc, skb);
 	if (dest == NULL) {
 		IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -418,14 +417,16 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	/*
 	 *    Create a connection entry.
 	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, pptr[0],
-			    &iph.daddr, pptr[1],
-			    &dest->addr, dest->port ? dest->port : pptr[1],
-			    flags,
-			    dest);
-	if (cp == NULL)
-		return NULL;
+	{
+		struct ip_vs_conn_param p;
+		ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
+				      pptr[0], &iph.daddr, pptr[1], &p);
+		cp = ip_vs_conn_new(&p, &dest->addr,
+				    dest->port ? dest->port : pptr[1],
+				    flags, dest);
+		if (!cp)
+			return NULL;
+	}
 
 	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
@@ -472,23 +473,26 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
 		int ret, cs;
 		struct ip_vs_conn *cp;
-		__u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
-				iph.protocol == IPPROTO_UDP)?
-				IP_VS_CONN_F_ONE_PACKET : 0;
+		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+				      iph.protocol == IPPROTO_UDP)?
+				      IP_VS_CONN_F_ONE_PACKET : 0;
 		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 
 		ip_vs_service_put(svc);
 
 		/* create a new connection entry */
 		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
-		cp = ip_vs_conn_new(svc->af, iph.protocol,
-				    &iph.saddr, pptr[0],
-				    &iph.daddr, pptr[1],
-				    &daddr, 0,
-				    IP_VS_CONN_F_BYPASS | flags,
-				    NULL);
-		if (cp == NULL)
-			return NF_DROP;
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(svc->af, iph.protocol,
+					      &iph.saddr, pptr[0],
+					      &iph.daddr, pptr[1], &p);
+			cp = ip_vs_conn_new(&p, &daddr, 0,
+					    IP_VS_CONN_F_BYPASS | flags,
+					    NULL);
+			if (!cp)
+				return NF_DROP;
+		}
 
 		/* statistics */
 		ip_vs_in_stats(cp, skb);
@@ -526,9 +530,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 	 */
 #ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6)
+	if (svc->af == AF_INET6) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
-	else
+	} else
 #endif
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
@@ -540,6 +549,15 @@ __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 }
 
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
+{
+	/* LOCAL_IN and FORWARD have their own defrag user; the rest use OUT */
+	if (hooknum == NF_INET_LOCAL_IN)
+		return IP_DEFRAG_VS_IN;
+	return hooknum == NF_INET_FORWARD ? IP_DEFRAG_VS_FWD :
+					    IP_DEFRAG_VS_OUT;
+}
+
 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
 	int err = ip_defrag(skb, user);
@@ -600,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 	if (inout)
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 			"Forwarding altered outgoing ICMP");
 	else
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 			"Forwarding altered incoming ICMP");
 }
 
@@ -637,17 +655,21 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 	}
 
 	/* And finally the ICMP checksum */
-	icmph->icmp6_cksum = 0;
-	/* TODO IPv6: is this correct for ICMPv6? */
-	ip_vs_checksum_complete(skb, icmp_offset);
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
+					      skb->len - icmp_offset,
+					      IPPROTO_ICMPV6, 0);
+	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
+	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+	skb->ip_summed = CHECKSUM_PARTIAL;
 
 	if (inout)
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered outgoing ICMPv6");
+		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+			      (void *)ciph - (void *)iph,
+			      "Forwarding altered outgoing ICMPv6");
 	else
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered incoming ICMPv6");
+		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+			      (void *)ciph - (void *)iph,
+			      "Forwarding altered incoming ICMPv6");
 }
 #endif
 
@@ -688,10 +710,25 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 #endif
 		ip_vs_nat_icmp(skb, pp, cp, 1);
 
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+			goto out;
+	} else
+#endif
+		if ((sysctl_ip_vs_snat_reroute ||
+		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
+			goto out;
+
 	/* do the statistics and put it back */
 	ip_vs_out_stats(cp, skb);
 
 	skb->ipvs_property = 1;
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		ip_vs_notrack(skb);
+	else
+		ip_vs_update_conntrack(skb, cp, 0);
 	verdict = NF_ACCEPT;
 
 out:
@@ -705,7 +742,8 @@ out:
  *	Find any that might be relevant, check against existing connections.
  *	Currently handles error types - unreachable, quench, ttl exceeded.
  */
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+			  unsigned int hooknum)
 {
 	struct iphdr *iph;
 	struct icmphdr	_icmph, *ic;
@@ -720,7 +758,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 
 	/* reassemble IP fragments */
 	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -763,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 		     pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+		      "Checking outgoing ICMP for");
 
 	offset += cih->ihl * 4;
 
@@ -779,7 +818,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 }
 
 #ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+			     unsigned int hooknum)
 {
 	struct ipv6hdr *iph;
 	struct icmp6hdr	_icmph, *ic;
@@ -795,7 +835,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 
 	/* reassemble IP fragments */
 	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-		if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -838,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+		      "Checking outgoing ICMPv6 for");
 
 	offset += sizeof(struct ipv6hdr);
 
@@ -886,7 +927,7 @@ static unsigned int
 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		struct ip_vs_conn *cp, int ihl)
 {
-	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
 
 	if (!skb_make_writable(skb, ihl))
 		goto drop;
@@ -905,6 +946,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		ip_send_check(ip_hdr(skb));
 	}
 
+	/*
+	 * nf_iterate does not expect change in the skb->dst->dev.
+	 * It looks like it is not fatal to enable this code for hooks
+	 * where our handlers are at the end of the chain list and
+	 * when all next handlers use skb->dst->dev and not outdev.
+	 * It will definitely route properly the inout NAT traffic
+	 * when multiple paths are used.
+	 */
+
 	/* For policy routing, packets originating from this
 	 * machine itself may be routed differently to packets
 	 * passing through.  We want this packet to be routed as
@@ -913,21 +963,25 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	 */
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
-		if (ip6_route_me_harder(skb) != 0)
+		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
 			goto drop;
 	} else
 #endif
-		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+		if ((sysctl_ip_vs_snat_reroute ||
+		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
 			goto drop;
 
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
 
 	ip_vs_out_stats(cp, skb);
 	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
-	ip_vs_update_conntrack(skb, cp, 0);
-	ip_vs_conn_put(cp);
-
 	skb->ipvs_property = 1;
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		ip_vs_notrack(skb);
+	else
+		ip_vs_update_conntrack(skb, cp, 0);
+	ip_vs_conn_put(cp);
 
 	LeaveFunction(11);
 	return NF_ACCEPT;
@@ -935,35 +989,46 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 drop:
 	ip_vs_conn_put(cp);
 	kfree_skb(skb);
+	LeaveFunction(11);
 	return NF_STOLEN;
 }
 
 /*
- *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
  *	Check if outgoing packet belongs to the established ip_vs_conn.
  */
 static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
-	  const struct net_device *in, const struct net_device *out,
-	  int (*okfn)(struct sk_buff *))
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 {
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
-	int af;
 
 	EnterFunction(11);
 
-	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
+	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
 		return NF_ACCEPT;
 
+	/* Bad... Do not break raw sockets */
+	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+		     af == AF_INET)) {
+		struct sock *sk = skb->sk;
+		struct inet_sock *inet = inet_sk(skb->sk);
+
+		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+			return NF_ACCEPT;
+	}
+
+	if (unlikely(!skb_dst(skb)))
+		return NF_ACCEPT;
+
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+			int related;
+			int verdict = ip_vs_out_icmp_v6(skb, &related,
+							hooknum);
 
 			if (related)
 				return verdict;
@@ -972,7 +1037,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	} else
 #endif
 		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-			int related, verdict = ip_vs_out_icmp(skb, &related);
+			int related;
+			int verdict = ip_vs_out_icmp(skb, &related, hooknum);
 
 			if (related)
 				return verdict;
@@ -986,19 +1052,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	/* reassemble IP fragments */
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
-		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
-			if (related)
-				return verdict;
-
-			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+			if (ip_vs_gather_frags_v6(skb,
+						  ip_vs_defrag_user(hooknum)))
+				return NF_STOLEN;
 		}
+
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 	} else
 #endif
 		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
 			     !pp->dont_defrag)) {
-			if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+			if (ip_vs_gather_frags(skb,
+					       ip_vs_defrag_user(hooknum)))
 				return NF_STOLEN;
 
 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1009,55 +1075,123 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	 */
 	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
 
-	if (unlikely(!cp)) {
-		if (sysctl_ip_vs_nat_icmp_send &&
-		    (pp->protocol == IPPROTO_TCP ||
-		     pp->protocol == IPPROTO_UDP ||
-		     pp->protocol == IPPROTO_SCTP)) {
-			__be16 _ports[2], *pptr;
-
-			pptr = skb_header_pointer(skb, iph.len,
-						  sizeof(_ports), _ports);
-			if (pptr == NULL)
-				return NF_ACCEPT;	/* Not for me */
-			if (ip_vs_lookup_real_service(af, iph.protocol,
-						      &iph.saddr,
-						      pptr[0])) {
-				/*
-				 * Notify the real server: there is no
-				 * existing entry if it is not RST
-				 * packet or not TCP packet.
-				 */
-				if ((iph.protocol != IPPROTO_TCP &&
-				     iph.protocol != IPPROTO_SCTP)
-				     || ((iph.protocol == IPPROTO_TCP
-					  && !is_tcp_reset(skb, iph.len))
-					 || (iph.protocol == IPPROTO_SCTP
-						&& !is_sctp_abort(skb,
-							iph.len)))) {
+	if (likely(cp))
+		return handle_response(af, skb, pp, cp, iph.len);
+	if (sysctl_ip_vs_nat_icmp_send &&
+	    (pp->protocol == IPPROTO_TCP ||
+	     pp->protocol == IPPROTO_UDP ||
+	     pp->protocol == IPPROTO_SCTP)) {
+		__be16 _ports[2], *pptr;
+
+		pptr = skb_header_pointer(skb, iph.len,
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			return NF_ACCEPT;	/* Not for me */
+		if (ip_vs_lookup_real_service(af, iph.protocol,
+					      &iph.saddr,
+					      pptr[0])) {
+			/*
+			 * Notify the real server: there is no
+			 * existing entry if it is not RST
+			 * packet or not TCP packet.
+			 */
+			if ((iph.protocol != IPPROTO_TCP &&
+			     iph.protocol != IPPROTO_SCTP)
+			     || ((iph.protocol == IPPROTO_TCP
+				  && !is_tcp_reset(skb, iph.len))
+				 || (iph.protocol == IPPROTO_SCTP
+					&& !is_sctp_abort(skb,
+						iph.len)))) {
 #ifdef CONFIG_IP_VS_IPV6
-					if (af == AF_INET6)
-						icmpv6_send(skb,
-							    ICMPV6_DEST_UNREACH,
-							    ICMPV6_PORT_UNREACH,
-							    0);
-					else
+				if (af == AF_INET6) {
+					struct net *net =
+						dev_net(skb_dst(skb)->dev);
+
+					if (!skb->dev)
+						skb->dev = net->loopback_dev;
+					icmpv6_send(skb,
+						    ICMPV6_DEST_UNREACH,
+						    ICMPV6_PORT_UNREACH,
+						    0);
+				} else
 #endif
-						icmp_send(skb,
-							  ICMP_DEST_UNREACH,
-							  ICMP_PORT_UNREACH, 0);
-					return NF_DROP;
-				}
+					icmp_send(skb,
+						  ICMP_DEST_UNREACH,
+						  ICMP_PORT_UNREACH, 0);
+				return NF_DROP;
 			}
 		}
-		IP_VS_DBG_PKT(12, pp, skb, 0,
-			      "packet continues traversal as normal");
-		return NF_ACCEPT;
 	}
+	IP_VS_DBG_PKT(12, af, pp, skb, 0,
+		      "ip_vs_out: packet continues traversal as normal");
+	return NF_ACCEPT;
+}
+
+/*
+ *	AF_INET handler in the NF_INET_FORWARD and NF_INET_LOCAL_IN chains,
+ *	used only for VS/NAT. Passes the packet to ip_vs_out() to check if
+ *	it is a reply for an established ip_vs_conn; in/out/okfn are unused.
+ */
+static unsigned int
+ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_out(hooknum, skb, AF_INET);
+}
+
+/*
+ *	AF_INET handler in the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *	Calls ip_vs_out() with BH disabled to match replies to an ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+
+	/* Keep BH disabled in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	ret = ip_vs_out(hooknum, skb, AF_INET);
+	local_bh_enable();
+	return ret;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	AF_INET6 handler in the NF_INET_FORWARD and NF_INET_LOCAL_IN chains,
+ *	used only for VS/NAT. Passes the packet to ip_vs_out() to check if
+ *	it is a reply for an established ip_vs_conn; in/out/okfn are unused.
+ */
+static unsigned int
+ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_out(hooknum, skb, AF_INET6);
+}
 
-	return handle_response(af, skb, pp, cp, iph.len);
+/*
+ *	AF_INET6 handler in the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *	Calls ip_vs_out() with BH disabled to match replies to an ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+
+	/* Keep BH disabled in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	ret = ip_vs_out(hooknum, skb, AF_INET6);
+	local_bh_enable();
+	return ret;
 }
 
+#endif
 
 /*
  *	Handle ICMP messages in the outside-to-inside direction (incoming).
@@ -1081,8 +1215,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 
 	/* reassemble IP fragments */
 	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-		if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
-					    IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
+		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -1125,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 		     pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+		      "Checking incoming ICMP for");
 
 	offset += cih->ihl * 4;
 
@@ -1159,7 +1293,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
 		offset += 2 * sizeof(__u16);
 	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
-	/* do not touch skb anymore */
+	/* LOCALNODE from FORWARD hook is not supported */
+	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+	    skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
+		IP_VS_DBG(1, "%s(): "
+			  "local delivery to %pI4 but in FORWARD\n",
+			  __func__, &skb_rtable(skb)->rt_dst);
+		verdict = NF_DROP;
+	}
 
   out:
 	__ip_vs_conn_put(cp);
@@ -1180,14 +1321,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	struct ip_vs_protocol *pp;
 	unsigned int offset, verdict;
 	union nf_inet_addr snet;
+	struct rt6_info *rt;
 
 	*related = 1;
 
 	/* reassemble IP fragments */
 	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-		if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
-					       IP_DEFRAG_VS_IN :
-					       IP_DEFRAG_VS_FWD))
+		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -1230,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+		      "Checking incoming ICMPv6 for");
 
 	offset += sizeof(struct ipv6hdr);
 
@@ -1258,7 +1399,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	    IPPROTO_SCTP == cih->nexthdr)
 		offset += 2 * sizeof(__u16);
 	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
-	/* do not touch skb anymore */
+	/* LOCALNODE from FORWARD hook is not supported */
+	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+	    (rt = (struct rt6_info *) skb_dst(skb)) &&
+	    rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
+		IP_VS_DBG(1, "%s(): "
+			  "local delivery to %pI6 but in FORWARD\n",
+			  __func__, &rt->rt6i_dst);
+		verdict = NF_DROP;
+	}
 
 	__ip_vs_conn_put(cp);
 
@@ -1272,35 +1421,49 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
  *	and send it on its way...
  */
 static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
-	 const struct net_device *in, const struct net_device *out,
-	 int (*okfn)(struct sk_buff *))
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 {
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
-	int ret, restart, af, pkts;
+	int ret, restart, pkts;
 
-	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
-	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+	/* Already marked as IPVS request or reply? */
+	if (skb->ipvs_property)
+		return NF_ACCEPT;
 
 	/*
-	 *	Big tappo: only PACKET_HOST, including loopback for local client
-	 *	Don't handle local packets on IPv6 for now
+	 *	Big tappo:
+	 *	- remote client: only PACKET_HOST
+	 *	- route: used for struct net when skb->dev is unset
 	 */
-	if (unlikely(skb->pkt_type != PACKET_HOST)) {
-		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
-			      skb->pkt_type,
-			      iph.protocol,
-			      IP_VS_DBG_ADDR(af, &iph.daddr));
+	if (unlikely((skb->pkt_type != PACKET_HOST &&
+		      hooknum != NF_INET_LOCAL_OUT) ||
+		     !skb_dst(skb))) {
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+			      " ignored in hook %u\n",
+			      skb->pkt_type, iph.protocol,
+			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
 		return NF_ACCEPT;
 	}
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	/* Bad... Do not break raw sockets */
+	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+		     af == AF_INET)) {
+		struct sock *sk = skb->sk;
+		struct inet_sock *inet = inet_sk(skb->sk);
+
+		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+			return NF_ACCEPT;
+	}
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+			int related;
+			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
 
 			if (related)
 				return verdict;
@@ -1309,7 +1472,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	} else
 #endif
 		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-			int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+			int related;
+			int verdict = ip_vs_in_icmp(skb, &related, hooknum);
 
 			if (related)
 				return verdict;
@@ -1329,23 +1493,18 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	if (unlikely(!cp)) {
 		int v;
 
-		/* For local client packets, it could be a response */
-		cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-		if (cp)
-			return handle_response(af, skb, pp, cp, iph.len);
-
 		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
 			return v;
 	}
 
 	if (unlikely(!cp)) {
 		/* sorry, all this trouble for a no-hit :) */
-		IP_VS_DBG_PKT(12, pp, skb, 0,
-			      "packet continues traversal as normal");
+		IP_VS_DBG_PKT(12, af, pp, skb, 0,
+			      "ip_vs_in: packet continues traversal as normal");
 		return NF_ACCEPT;
 	}
 
-	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
 
 	/* Check the server status */
 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -1381,8 +1540,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    cp->protocol == IPPROTO_SCTP) {
 		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
-			(atomic_read(&cp->in_pkts) %
-			 sysctl_ip_vs_sync_threshold[1]
+			(pkts % sysctl_ip_vs_sync_threshold[1]
 			 == sysctl_ip_vs_sync_threshold[0])) ||
 				(cp->old_state != cp->state &&
 				 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
@@ -1393,7 +1551,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 		}
 	}
 
-	if (af == AF_INET &&
+	/* Keep this block last: TCP and others with pp->num_states <= 1 */
+	else if (af == AF_INET &&
 	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    (((cp->protocol != IPPROTO_TCP ||
 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
@@ -1412,6 +1571,72 @@ out:
 	return ret;
 }
 
+/*
+ *	AF_INET handler in the NF_INET_LOCAL_IN chain. Calls ip_vs_in() to
+ *	schedule and forward packets from remote clients; in/out/okfn unused.
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ *	AF_INET handler in the NF_INET_LOCAL_OUT chain.
+ *	Calls ip_vs_in() with BH disabled to schedule/forward local clients.
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+
+	/* Keep BH disabled in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	ret = ip_vs_in(hooknum, skb, AF_INET);
+	local_bh_enable();
+	return ret;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	AF_INET6 handler in the NF_INET_LOCAL_IN chain. Calls ip_vs_in() to
+ *	schedule and forward packets from remote clients; in/out/okfn unused.
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ *	AF_INET6 handler in the NF_INET_LOCAL_OUT chain.
+ *	Calls ip_vs_in() with BH disabled to schedule/forward local clients.
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+
+	/* Keep BH disabled in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	ret = ip_vs_in(hooknum, skb, AF_INET6);
+	local_bh_enable();
+	return ret;
+}
+
+#endif
+
 
 /*
  *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1452,23 +1677,39 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 
 
 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 99,
+	},
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
 	 * applied to IPVS. */
 	{
-		.hook		= ip_vs_in,
+		.hook		= ip_vs_remote_request4,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_LOCAL_IN,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 101,
 	},
-	/* After packet filtering, change source only for VS/NAT */
+	/* Before ip_vs_in, change source only for VS/NAT */
 	{
-		.hook		= ip_vs_out,
+		.hook		= ip_vs_local_reply4,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -99,
+	},
+	/* After mangle, schedule and forward local requests */
+	{
+		.hook		= ip_vs_local_request4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -98,
 	},
 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1476,27 +1717,51 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hook		= ip_vs_forward_icmp,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 99,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
 	},
 #ifdef CONFIG_IP_VS_IPV6
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 99,
+	},
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
 	 * applied to IPVS. */
 	{
-		.hook		= ip_vs_in,
+		.hook		= ip_vs_remote_request6,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
-		.hooknum        = NF_INET_LOCAL_IN,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 101,
 	},
-	/* After packet filtering, change source only for VS/NAT */
+	/* Before ip_vs_in, change source only for VS/NAT (IPv6 family) */
+	{
+		.hook		= ip_vs_local_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -99,
+	},
+	/* After mangle, schedule and forward local requests */
 	{
-		.hook		= ip_vs_out,
+		.hook		= ip_vs_local_request6,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -98,
 	},
 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1504,8 +1769,16 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hook		= ip_vs_forward_icmp_v6,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 99,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
 	},
 #endif
 };
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 0f0c079c422a..5f5daa30b0af 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -61,7 +61,7 @@ static DEFINE_RWLOCK(__ip_vs_svc_lock);
 static DEFINE_RWLOCK(__ip_vs_rs_lock);
 
 /* lock for state and timeout tables */
-static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
+static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
 
 /* lock for drop entry handling */
 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
@@ -88,6 +88,10 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
 int sysctl_ip_vs_expire_quiescent_template = 0;
 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
 int sysctl_ip_vs_nat_icmp_send = 0;
+#ifdef CONFIG_IP_VS_NFCT
+int sysctl_ip_vs_conntrack;
+#endif
+int sysctl_ip_vs_snat_reroute = 1;
 
 
 #ifdef CONFIG_IP_VS_DEBUG
@@ -204,7 +208,7 @@ static void update_defense_level(void)
 	spin_unlock(&__ip_vs_droppacket_lock);
 
 	/* secure_tcp */
-	write_lock(&__ip_vs_securetcp_lock);
+	spin_lock(&ip_vs_securetcp_lock);
 	switch (sysctl_ip_vs_secure_tcp) {
 	case 0:
 		if (old_secure_tcp >= 2)
@@ -238,7 +242,7 @@ static void update_defense_level(void)
 	old_secure_tcp = sysctl_ip_vs_secure_tcp;
 	if (to_change >= 0)
 		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
-	write_unlock(&__ip_vs_securetcp_lock);
+	spin_unlock(&ip_vs_securetcp_lock);
 
 	local_bh_enable();
 }
@@ -401,7 +405,7 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
  *	Get service by {proto,addr,port} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
 		    __be16 vport)
 {
 	unsigned hash;
@@ -416,7 +420,6 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
 		    && (svc->port == vport)
 		    && (svc->protocol == protocol)) {
 			/* HIT */
-			atomic_inc(&svc->usecnt);
 			return svc;
 		}
 	}
@@ -429,7 +432,7 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
  *	Get service by {fwmark} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(int af, __u32 fwmark)
 {
 	unsigned hash;
 	struct ip_vs_service *svc;
@@ -440,7 +443,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark)
 	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
 		if (svc->fwmark == fwmark && svc->af == af) {
 			/* HIT */
-			atomic_inc(&svc->usecnt);
 			return svc;
 		}
 	}
@@ -459,14 +461,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 	/*
 	 *	Check the table hashed by fwmark first
 	 */
-	if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
+	if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
 		goto out;
 
 	/*
 	 *	Check the table hashed by <protocol,addr,port>
 	 *	for "full" addressed entries
 	 */
-	svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+	svc = __ip_vs_service_find(af, protocol, vaddr, vport);
 
 	if (svc == NULL
 	    && protocol == IPPROTO_TCP
@@ -476,7 +478,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 		 * Check if ftp service entry exists, the packet
 		 * might belong to FTP data connections.
 		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
+		svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
 	}
 
 	if (svc == NULL
@@ -484,10 +486,12 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 		/*
 		 * Check if the catch-all port (port zero) exists
 		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+		svc = __ip_vs_service_find(af, protocol, vaddr, 0);
 	}
 
   out:
+	if (svc)
+		atomic_inc(&svc->usecnt);
 	read_unlock(&__ip_vs_svc_lock);
 
 	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
@@ -506,14 +510,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 	dest->svc = svc;
 }
 
-static inline void
+static void
 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
 {
 	struct ip_vs_service *svc = dest->svc;
 
 	dest->svc = NULL;
-	if (atomic_dec_and_test(&svc->refcnt))
+	if (atomic_dec_and_test(&svc->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+			      svc->fwmark,
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port), atomic_read(&svc->usecnt));
 		kfree(svc);
+	}
 }
 
 
@@ -758,31 +767,18 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
  *	Update a destination in the given service
  */
 static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
-		    struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+		    struct ip_vs_dest_user_kern *udest, int add)
 {
 	int conn_flags;
 
 	/* set the weight and the flags */
 	atomic_set(&dest->weight, udest->weight);
-	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
-
-	/* check if local node and update the flags */
-#ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6) {
-		if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
-			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-				| IP_VS_CONN_F_LOCALNODE;
-		}
-	} else
-#endif
-		if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
-			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-				| IP_VS_CONN_F_LOCALNODE;
-		}
+	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+	conn_flags |= IP_VS_CONN_F_INACTIVE;
 
 	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
-	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
 		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
 	} else {
 		/*
@@ -813,6 +809,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
 		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 	dest->u_threshold = udest->u_threshold;
 	dest->l_threshold = udest->l_threshold;
+
+	spin_lock(&dest->dst_lock);
+	ip_vs_dst_reset(dest);
+	spin_unlock(&dest->dst_lock);
+
+	if (add)
+		ip_vs_new_estimator(&dest->stats);
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/* Wait until all other svc users go away */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+	if (add) {
+		list_add(&dest->n_list, &svc->destinations);
+		svc->num_dests++;
+	}
+
+	/* call the update_service, because server weight may be changed */
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
 }
 
 
@@ -843,7 +862,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 			return -EINVAL;
 	}
 
-	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
+	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
 	if (dest == NULL) {
 		pr_err("%s(): no memory.\n", __func__);
 		return -ENOMEM;
@@ -860,13 +879,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 	atomic_set(&dest->activeconns, 0);
 	atomic_set(&dest->inactconns, 0);
 	atomic_set(&dest->persistconns, 0);
-	atomic_set(&dest->refcnt, 0);
+	atomic_set(&dest->refcnt, 1);
 
 	INIT_LIST_HEAD(&dest->d_list);
 	spin_lock_init(&dest->dst_lock);
 	spin_lock_init(&dest->stats.lock);
-	__ip_vs_update_dest(svc, dest, udest);
-	ip_vs_new_estimator(&dest->stats);
+	__ip_vs_update_dest(svc, dest, udest, 1);
 
 	*dest_p = dest;
 
@@ -926,65 +944,22 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
 			      ntohs(dest->vport));
 
-		__ip_vs_update_dest(svc, dest, udest);
-
 		/*
 		 * Get the destination from the trash
 		 */
 		list_del(&dest->n_list);
 
-		ip_vs_new_estimator(&dest->stats);
-
-		write_lock_bh(&__ip_vs_svc_lock);
-
+		__ip_vs_update_dest(svc, dest, udest, 1);
+		ret = 0;
+	} else {
 		/*
-		 * Wait until all other svc users go away.
+		 * Allocate and initialize the dest structure
 		 */
-		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-		list_add(&dest->n_list, &svc->destinations);
-		svc->num_dests++;
-
-		/* call the update_service function of its scheduler */
-		if (svc->scheduler->update_service)
-			svc->scheduler->update_service(svc);
-
-		write_unlock_bh(&__ip_vs_svc_lock);
-		return 0;
-	}
-
-	/*
-	 * Allocate and initialize the dest structure
-	 */
-	ret = ip_vs_new_dest(svc, udest, &dest);
-	if (ret) {
-		return ret;
+		ret = ip_vs_new_dest(svc, udest, &dest);
 	}
-
-	/*
-	 * Add the dest entry into the list
-	 */
-	atomic_inc(&dest->refcnt);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 * Wait until all other svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	list_add(&dest->n_list, &svc->destinations);
-	svc->num_dests++;
-
-	/* call the update_service function of its scheduler */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
 	LeaveFunction(2);
 
-	return 0;
+	return ret;
 }
 
 
@@ -1023,19 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 		return -ENOENT;
 	}
 
-	__ip_vs_update_dest(svc, dest, udest);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/* Wait until all other svc users go away */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	/* call the update_service, because server weight may be changed */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
+	__ip_vs_update_dest(svc, dest, udest, 0);
 	LeaveFunction(2);
 
 	return 0;
@@ -1062,6 +1025,10 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
 	 *  the destination into the trash.
 	 */
 	if (atomic_dec_and_test(&dest->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port));
 		ip_vs_dst_reset(dest);
 		/* simply decrease svc->refcnt here, let the caller check
 		   and release the service if nobody refers to it.
@@ -1128,7 +1095,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	/*
 	 *	Wait until all other svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	/*
 	 *	Unlink dest from the service
@@ -1157,6 +1124,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 {
 	int ret = 0;
 	struct ip_vs_scheduler *sched = NULL;
+	struct ip_vs_pe *pe = NULL;
 	struct ip_vs_service *svc = NULL;
 
 	/* increase the module use count */
@@ -1167,7 +1135,17 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	if (sched == NULL) {
 		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
 		ret = -ENOENT;
-		goto out_mod_dec;
+		goto out_err;
+	}
+
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_get(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out_err;
+		}
 	}
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1177,7 +1155,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	}
 #endif
 
-	svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
+	svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
 	if (svc == NULL) {
 		IP_VS_DBG(1, "%s(): no memory\n", __func__);
 		ret = -ENOMEM;
@@ -1185,7 +1163,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	}
 
 	/* I'm the first user of the service */
-	atomic_set(&svc->usecnt, 1);
+	atomic_set(&svc->usecnt, 0);
 	atomic_set(&svc->refcnt, 0);
 
 	svc->af = u->af;
@@ -1207,6 +1185,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 		goto out_err;
 	sched = NULL;
 
+	/* Bind the ct retriever */
+	ip_vs_bind_pe(svc, pe);
+	pe = NULL;
+
 	/* Update the virtual service counters */
 	if (svc->port == FTPPORT)
 		atomic_inc(&ip_vs_ftpsvc_counter);
@@ -1227,10 +1209,9 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	*svc_p = svc;
 	return 0;
 
-  out_err:
+ out_err:
 	if (svc != NULL) {
-		if (svc->scheduler)
-			ip_vs_unbind_scheduler(svc);
+		ip_vs_unbind_scheduler(svc);
 		if (svc->inc) {
 			local_bh_disable();
 			ip_vs_app_inc_put(svc->inc);
@@ -1239,8 +1220,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 		kfree(svc);
 	}
 	ip_vs_scheduler_put(sched);
+	ip_vs_pe_put(pe);
 
-  out_mod_dec:
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
 
@@ -1255,6 +1236,7 @@ static int
 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 {
 	struct ip_vs_scheduler *sched, *old_sched;
+	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
 	int ret = 0;
 
 	/*
@@ -1267,6 +1249,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	}
 	old_sched = sched;
 
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_get(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out;
+		}
+		old_pe = pe;
+	}
+
 #ifdef CONFIG_IP_VS_IPV6
 	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
 		ret = -EINVAL;
@@ -1279,7 +1272,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	/*
 	 * Wait until all other svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	/*
 	 * Set the flags and timeout value
@@ -1318,15 +1311,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 		}
 	}
 
+	old_pe = svc->pe;
+	if (pe != old_pe) {
+		ip_vs_unbind_pe(svc);
+		ip_vs_bind_pe(svc, pe);
+	}
+
   out_unlock:
 	write_unlock_bh(&__ip_vs_svc_lock);
-#ifdef CONFIG_IP_VS_IPV6
   out:
-#endif
-
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
-
+	ip_vs_scheduler_put(old_sched);
+	ip_vs_pe_put(old_pe);
 	return ret;
 }
 
@@ -1340,6 +1335,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 {
 	struct ip_vs_dest *dest, *nxt;
 	struct ip_vs_scheduler *old_sched;
+	struct ip_vs_pe *old_pe;
+
+	pr_info("%s: enter\n", __func__);
 
 	/* Count only IPv4 services for old get/setsockopt interface */
 	if (svc->af == AF_INET)
@@ -1350,8 +1348,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	/* Unbind scheduler */
 	old_sched = svc->scheduler;
 	ip_vs_unbind_scheduler(svc);
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
+	ip_vs_scheduler_put(old_sched);
+
+	/* Unbind persistence engine */
+	old_pe = svc->pe;
+	ip_vs_unbind_pe(svc);
+	ip_vs_pe_put(old_pe);
 
 	/* Unbind app inc */
 	if (svc->inc) {
@@ -1378,21 +1380,23 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	/*
 	 *    Free the service if nobody refers to it
 	 */
-	if (atomic_read(&svc->refcnt) == 0)
+	if (atomic_read(&svc->refcnt) == 0) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+			      svc->fwmark,
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port), atomic_read(&svc->usecnt));
 		kfree(svc);
+	}
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
 }
 
 /*
- *	Delete a service from the service list
+ * Unlink a service from list and try to delete it if its refcnt reached 0
  */
-static int ip_vs_del_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc)
 {
-	if (svc == NULL)
-		return -EEXIST;
-
 	/*
 	 * Unhash it from the service table
 	 */
@@ -1403,11 +1407,21 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
 	/*
 	 * Wait until all the svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	__ip_vs_del_service(svc);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ *	Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+	if (svc == NULL)
+		return -EEXIST;
+	ip_vs_unlink_service(svc);
 
 	return 0;
 }
@@ -1426,14 +1440,7 @@ static int ip_vs_flush(void)
 	 */
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
+			ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -1443,14 +1450,7 @@ static int ip_vs_flush(void)
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry_safe(svc, nxt,
 					 &ip_vs_svc_fwm_table[idx], f_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
+			ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -1579,6 +1579,15 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_do_defense_mode,
 	},
+#ifdef CONFIG_IP_VS_NFCT
+	{
+		.procname	= "conntrack",
+		.data		= &sysctl_ip_vs_conntrack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.procname	= "secure_tcp",
 		.data		= &sysctl_ip_vs_secure_tcp,
@@ -1586,6 +1595,13 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_do_defense_mode,
 	},
+	{
+		.procname	= "snat_reroute",
+		.data		= &sysctl_ip_vs_snat_reroute,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #if 0
 	{
 		.procname	= "timeout_established",
@@ -2041,6 +2057,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
 				  struct ip_vs_service_user *usvc_compat)
 {
+	memset(usvc, 0, sizeof(*usvc));
+
 	usvc->af		= AF_INET;
 	usvc->protocol		= usvc_compat->protocol;
 	usvc->addr.ip		= usvc_compat->addr;
@@ -2058,6 +2076,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
 				   struct ip_vs_dest_user *udest_compat)
 {
+	memset(udest, 0, sizeof(*udest));
+
 	udest->addr.ip		= udest_compat->addr;
 	udest->port		= udest_compat->port;
 	udest->conn_flags	= udest_compat->conn_flags;
@@ -2147,10 +2167,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
 	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
+		svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+					   &usvc.addr, usvc.port);
 	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+		svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
 
 	if (cmd != IP_VS_SO_SET_ADD
 	    && (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2189,9 +2209,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		ret = -EINVAL;
 	}
 
-	if (svc)
-		ip_vs_service_put(svc);
-
   out_unlock:
 	mutex_unlock(&__ip_vs_mutex);
   out_dec:
@@ -2284,10 +2301,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
 	int ret = 0;
 
 	if (get->fwmark)
-		svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+		svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
 	else
-		svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
-					  get->port);
+		svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+					   get->port);
 
 	if (svc) {
 		int count = 0;
@@ -2315,7 +2332,6 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
 			}
 			count++;
 		}
-		ip_vs_service_put(svc);
 	} else
 		ret = -ESRCH;
 	return ret;
@@ -2436,15 +2452,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		entry = (struct ip_vs_service_entry *)arg;
 		addr.ip = entry->addr;
 		if (entry->fwmark)
-			svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+			svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
 		else
-			svc = __ip_vs_service_get(AF_INET, entry->protocol,
-						  &addr, entry->port);
+			svc = __ip_vs_service_find(AF_INET, entry->protocol,
+						   &addr, entry->port);
 		if (svc) {
 			ip_vs_copy_service(entry, svc);
 			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
 				ret = -EFAULT;
-			ip_vs_service_put(svc);
 		} else
 			ret = -ESRCH;
 	}
@@ -2559,6 +2574,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
 	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
 	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
 					    .len = IP_VS_SCHEDNAME_MAXLEN },
+	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_PENAME_MAXLEN },
 	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
 					    .len = sizeof(struct ip_vs_flags) },
 	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
@@ -2635,6 +2652,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
 	}
 
 	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+	if (svc->pe)
+		NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
 	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
 	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
 	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
@@ -2711,10 +2730,12 @@ nla_put_failure:
 }
 
 static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
-				    struct nlattr *nla, int full_entry)
+				    struct nlattr *nla, int full_entry,
+				    struct ip_vs_service **ret_svc)
 {
 	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
 	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+	struct ip_vs_service *svc;
 
 	/* Parse mandatory identifying service fields first */
 	if (nla == NULL ||
@@ -2750,14 +2771,21 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 		usvc->fwmark = 0;
 	}
 
+	if (usvc->fwmark)
+		svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+	else
+		svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+					   &usvc->addr, usvc->port);
+	*ret_svc = svc;
+
 	/* If a full entry was requested, check for the additional fields */
 	if (full_entry) {
-		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
 			      *nla_netmask;
 		struct ip_vs_flags flags;
-		struct ip_vs_service *svc;
 
 		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
 		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
 		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
 		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
@@ -2768,21 +2796,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 		nla_memcpy(&flags, nla_flags, sizeof(flags));
 
 		/* prefill flags from service if it already exists */
-		if (usvc->fwmark)
-			svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
-		else
-			svc = __ip_vs_service_get(usvc->af, usvc->protocol,
-						  &usvc->addr, usvc->port);
-		if (svc) {
+		if (svc)
 			usvc->flags = svc->flags;
-			ip_vs_service_put(svc);
-		} else
-			usvc->flags = 0;
 
 		/* set new flags from userland */
 		usvc->flags = (usvc->flags & ~flags.mask) |
 			      (flags.flags & flags.mask);
 		usvc->sched_name = nla_data(nla_sched);
+		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
 		usvc->timeout = nla_get_u32(nla_timeout);
 		usvc->netmask = nla_get_u32(nla_netmask);
 	}
@@ -2793,17 +2814,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
 {
 	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_service *svc;
 	int ret;
 
-	ret = ip_vs_genl_parse_service(&usvc, nla, 0);
-	if (ret)
-		return ERR_PTR(ret);
-
-	if (usvc.fwmark)
-		return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-	else
-		return __ip_vs_service_get(usvc.af, usvc.protocol,
-					   &usvc.addr, usvc.port);
+	ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+	return ret ? ERR_PTR(ret) : svc;
 }
 
 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
@@ -2894,7 +2909,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 
 nla_put_failure:
 	cb->args[0] = idx;
-	ip_vs_service_put(svc);
 
 out_err:
 	mutex_unlock(&__ip_vs_mutex);
@@ -3107,17 +3121,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 
 	ret = ip_vs_genl_parse_service(&usvc,
 				       info->attrs[IPVS_CMD_ATTR_SERVICE],
-				       need_full_svc);
+				       need_full_svc, &svc);
 	if (ret)
 		goto out;
 
-	/* Lookup the exact service by <protocol, addr, port> or fwmark */
-	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
-	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
 	/* Unless we're adding a new service, the service must already exist */
 	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
 		ret = -ESRCH;
@@ -3151,6 +3158,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 		break;
 	case IPVS_CMD_DEL_SERVICE:
 		ret = ip_vs_del_service(svc);
+		/* do not use svc, it can be freed */
 		break;
 	case IPVS_CMD_NEW_DEST:
 		ret = ip_vs_add_dest(svc, &udest);
@@ -3169,8 +3177,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 	}
 
 out:
-	if (svc)
-		ip_vs_service_put(svc);
 	mutex_unlock(&__ip_vs_mutex);
 
 	return ret;
@@ -3216,7 +3222,6 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 			goto out_err;
 		} else if (svc) {
 			ret = ip_vs_genl_fill_service(msg, svc);
-			ip_vs_service_put(svc);
 			if (ret)
 				goto nla_put_failure;
 		} else {
@@ -3385,6 +3390,16 @@ int __init ip_vs_control_init(void)
 
 	EnterFunction(2);
 
+	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+	}
+	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+	}
+	smp_wmb();
+
 	ret = nf_register_sockopt(&ip_vs_sockopts);
 	if (ret) {
 		pr_err("cannot register sockopt.\n");
@@ -3403,15 +3418,6 @@ int __init ip_vs_control_init(void)
 
 	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
 
-	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
-	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
-		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
-	}
-	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
-	}
-
 	ip_vs_new_estimator(&ip_vs_stats);
 
 	/* Hook the defense timer */
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 7e9af5b76d9e..75455000ad1c 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -20,17 +20,6 @@
  *
  * Author:	Wouter Gadeyne
  *
- *
- * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from
- * http://www.ssi.bg/~ja/nfct/:
- *
- * ip_vs_nfct.c:	Netfilter connection tracking support for IPVS
- *
- * Portions Copyright (C) 2001-2002
- * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
- *
- * Portions Copyright (C) 2003-2008
- * Julian Anastasov
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -58,16 +47,6 @@
 #define SERVER_STRING "227 Entering Passive Mode ("
 #define CLIENT_STRING "PORT "
 
-#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
-			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
-			(T)->dst.protonum
-
-#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
-			&((C)->vaddr.ip), ntohs((C)->vport), \
-			&((C)->daddr.ip), ntohs((C)->dport), \
-			(C)->protocol, (C)->state
 
 /*
  * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -85,6 +64,8 @@ static int ip_vs_ftp_pasv;
 static int
 ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
 {
+	/* We use connection tracking for the command connection */
+	cp->flags |= IP_VS_CONN_F_NFCT;
 	return 0;
 }
 
@@ -149,120 +130,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 }
 
 /*
- * Called from init_conntrack() as expectfn handler.
- */
-static void
-ip_vs_expect_callback(struct nf_conn *ct,
-		      struct nf_conntrack_expect *exp)
-{
-	struct nf_conntrack_tuple *orig, new_reply;
-	struct ip_vs_conn *cp;
-
-	if (exp->tuple.src.l3num != PF_INET)
-		return;
-
-	/*
-	 * We assume that no NF locks are held before this callback.
-	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
-	 * expectations even if they use wildcard values, now we provide the
-	 * actual values from the newly created original conntrack direction.
-	 * The conntrack is confirmed when packet reaches IPVS hooks.
-	 */
-
-	/* RS->CLIENT */
-	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-	cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
-				&orig->src.u3, orig->src.u.tcp.port,
-				&orig->dst.u3, orig->dst.u.tcp.port);
-	if (cp) {
-		/* Change reply CLIENT->RS to CLIENT->VS */
-		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-		IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
-			  __func__, ct, ct->status,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
-		new_reply.dst.u3 = cp->vaddr;
-		new_reply.dst.u.tcp.port = cp->vport;
-		IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
-			  ", inout cp=" FMT_CONN "\n",
-			  __func__, ct,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
-		goto alter;
-	}
-
-	/* CLIENT->VS */
-	cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
-			       &orig->src.u3, orig->src.u.tcp.port,
-			       &orig->dst.u3, orig->dst.u.tcp.port);
-	if (cp) {
-		/* Change reply VS->CLIENT to RS->CLIENT */
-		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-		IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
-			  __func__, ct, ct->status,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
-		new_reply.src.u3 = cp->daddr;
-		new_reply.src.u.tcp.port = cp->dport;
-		IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
-			  __func__, ct,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
-		goto alter;
-	}
-
-	IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE
-		  " - unknown expect\n",
-		  __func__, ct, ct->status, ARG_TUPLE(orig));
-	return;
-
-alter:
-	/* Never alter conntrack for non-NAT conns */
-	if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
-		nf_conntrack_alter_reply(ct, &new_reply);
-	ip_vs_conn_put(cp);
-	return;
-}
-
-/*
- * Create NF conntrack expectation with wildcard (optional) source port.
- * Then the default callback function will alter the reply and will confirm
- * the conntrack entry when the first packet comes.
- */
-static void
-ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct,
-		     struct ip_vs_conn *cp, u_int8_t proto,
-		     const __be16 *port, int from_rs)
-{
-	struct nf_conntrack_expect *exp;
-
-	BUG_ON(!ct || ct == &nf_conntrack_untracked);
-
-	exp = nf_ct_expect_alloc(ct);
-	if (!exp)
-		return;
-
-	if (from_rs)
-		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
-				  nf_ct_l3num(ct), &cp->daddr, &cp->caddr,
-				  proto, port, &cp->cport);
-	else
-		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
-				  nf_ct_l3num(ct), &cp->caddr, &cp->vaddr,
-				  proto, port, &cp->vport);
-
-	exp->expectfn = ip_vs_expect_callback;
-
-	IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n",
-		  __func__, ct, ARG_TUPLE(&exp->tuple));
-	nf_ct_expect_related(exp);
-	nf_ct_expect_put(exp);
-}
-
-/*
  * Look at outgoing ftp packets to catch the response to a PASV command
  * from the server (inside-to-outside).
  * When we see one, we build a connection entry with the client address,
@@ -328,14 +195,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		/*
 		 * Now update or create an connection entry for it
 		 */
-		n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
-					  &cp->caddr, 0);
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(AF_INET, iph->protocol,
+					      &from, port, &cp->caddr, 0, &p);
+			n_cp = ip_vs_conn_out_get(&p);
+		}
 		if (!n_cp) {
-			n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-					      &cp->caddr, 0,
-					      &cp->vaddr, port,
-					      &from, port,
-					      IP_VS_CONN_F_NO_CPORT,
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+					      0, &cp->vaddr, port, &p);
+			n_cp = ip_vs_conn_new(&p, &from, port,
+					      IP_VS_CONN_F_NO_CPORT |
+					      IP_VS_CONN_F_NFCT,
 					      cp->dest);
 			if (!n_cp)
 				return 0;
@@ -370,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 			ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
 						       start-data, end-start,
 						       buf, buf_len);
-			if (ret)
-				ip_vs_expect_related(skb, ct, n_cp,
-						     IPPROTO_TCP, NULL, 0);
+			if (ret) {
+				ip_vs_nfct_expect_related(skb, ct, n_cp,
+							  IPPROTO_TCP, 0, 0);
+				if (skb->ip_summed == CHECKSUM_COMPLETE)
+					skb->ip_summed = CHECKSUM_UNNECESSARY;
+				/* csum is updated */
+				ret = 1;
+			}
 		}
 
 		/*
@@ -479,21 +356,22 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		  ip_vs_proto_name(iph->protocol),
 		  &to.ip, ntohs(port), &cp->vaddr.ip, 0);
 
-	n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
-				 &to, port,
-				 &cp->vaddr, htons(ntohs(cp->vport)-1));
-	if (!n_cp) {
-		n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-				      &to, port,
+	{
+		struct ip_vs_conn_param p;
+		ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
 				      &cp->vaddr, htons(ntohs(cp->vport)-1),
-				      &cp->daddr, htons(ntohs(cp->dport)-1),
-				      0,
-				      cp->dest);
-		if (!n_cp)
-			return 0;
+				      &p);
+		n_cp = ip_vs_conn_in_get(&p);
+		if (!n_cp) {
+			n_cp = ip_vs_conn_new(&p, &cp->daddr,
+					      htons(ntohs(cp->dport)-1),
+					      IP_VS_CONN_F_NFCT, cp->dest);
+			if (!n_cp)
+				return 0;
 
-		/* add its controller */
-		ip_vs_control_add(n_cp, cp);
+			/* add its controller */
+			ip_vs_control_add(n_cp, cp);
+		}
 	}
 
 	/*
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 000000000000..4680647cd450
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,292 @@
+/*
+ * ip_vs_nfct.c:	Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg>		Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com>	Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, so
+ * that we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"	/* src:port->dst:port/protonum */
+#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
+			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+			(T)->dst.protonum	/* args for FMT_TUPLE, IPv4 (.ip) only */
+
+#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"	/* c->v->d/proto:state */
+#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
+			&((C)->vaddr.ip), ntohs((C)->vport), \
+			&((C)->daddr.ip), ntohs((C)->dport), \
+			(C)->protocol, (C)->state	/* args for FMT_CONN, IPv4 only */
+
+/* Rewrite the conntrack reply tuple so that it matches the NAT-ed reply. */
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conntrack_tuple new_tuple;
+
+	/* Only a tracked, live and still unconfirmed conntrack is altered */
+	if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+	    nf_ct_is_dying(ct))
+		return;
+
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return;
+
+	/* Alter reply only in original direction */
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		return;
+
+	/*
+	 * The connection is not yet in the hashtable, so we update it.
+	 * CIP->VIP will remain the same, so leave the tuple in
+	 * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
+	 * real-server we will see RIP->DIP.
+	 */
+	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	/* This will also take care of UDP and other protocols. */
+	if (outin) {	/* reply source becomes cp's daddr:dport */
+		new_tuple.src.u3 = cp->daddr;
+		if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
+			new_tuple.src.u.tcp.port = cp->dport;
+	} else {	/* reply destination becomes cp's vaddr:vport */
+		new_tuple.dst.u3 = cp->vaddr;
+		if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
+			new_tuple.dst.u.tcp.port = cp->vport;
+	}
+	IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+		  "ctinfo=%d, old reply=" FMT_TUPLE
+		  ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+		  __func__, ct, ct->status, ctinfo,
+		  ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+		  ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+	nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	return nf_conntrack_confirm(skb);	/* cp is not used here */
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler; alters the reply tuple.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+	struct nf_conntrack_expect *exp)
+{
+	struct nf_conntrack_tuple *orig, new_reply;
+	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
+
+	if (exp->tuple.src.l3num != PF_INET)	/* IPv4 expectations only */
+		return;
+
+	/*
+	 * We assume that no NF locks are held before this callback.
+	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+	 * expectations even if they use wildcard values, now we provide the
+	 * actual values from the newly created original conntrack direction.
+	 * The conntrack is confirmed when packet reaches IPVS hooks.
+	 */
+
+	/* RS->CLIENT: try the original tuple as an outgoing (inout) packet */
+	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+			      &orig->src.u3, orig->src.u.tcp.port,
+			      &orig->dst.u3, orig->dst.u.tcp.port, &p);
+	cp = ip_vs_conn_out_get(&p);
+	if (cp) {
+		/* Change reply CLIENT->RS to CLIENT->VS */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.dst.u3 = cp->vaddr;
+		new_reply.dst.u.tcp.port = cp->vport;
+		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+			  ", inout cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	/* CLIENT->VS: same parameters, now tried as an incoming (outin) packet */
+	cp = ip_vs_conn_in_get(&p);
+	if (cp) {
+		/* Change reply VS->CLIENT to RS->CLIENT */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.src.u3 = cp->daddr;
+		new_reply.src.u.tcp.port = cp->dport;
+		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+		  " - unknown expect\n",
+		  __func__, ct, ct->status, ARG_TUPLE(orig));
+	return;
+
+alter:
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+		nf_conntrack_alter_reply(ct, &new_reply);
+	ip_vs_conn_put(cp);	/* pairs with the ip_vs_conn_*_get() above */
+	return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port;
+ * use port 0 to expect connection from any port.  With from_rs set the
+ * expected tuple is daddr -> caddr:cport, otherwise caddr -> vaddr:vport.
+ * ip_vs_nfct_expect_callback() is installed to alter the reply later.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+			       struct ip_vs_conn *cp, u_int8_t proto,
+			       const __be16 port, int from_rs)
+{
+	struct nf_conntrack_expect *exp;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return;
+
+	exp = nf_ct_expect_alloc(ct);
+	if (!exp)
+		return;
+
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			from_rs ? &cp->daddr : &cp->caddr,
+			from_rs ? &cp->caddr : &cp->vaddr,
+			proto, port ? &port : NULL,
+			from_rs ? &cp->cport : &cp->vport);
+
+	exp->expectfn = ip_vs_nfct_expect_callback;
+
+	IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+		__func__, ct, ARG_TUPLE(&exp->tuple));
+	nf_ct_expect_related(exp);	/* result is not checked */
+	nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * The IPVS conn is terminated: find its conntrack and expire it at once
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+	struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conn *ct;
+
+	if (!cp->cport)
+		return;
+
+	/* Original direction tuple: caddr:cport -> vaddr:vport */
+	tuple = (struct nf_conntrack_tuple) {
+		.dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+	tuple.src.l3num = cp->af;
+	tuple.src.u3 = cp->caddr;
+	tuple.src.u.all = cp->cport;
+	tuple.dst.u3 = cp->vaddr;
+	tuple.dst.u.all = cp->vport;
+	IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+		" for conn " FMT_CONN "\n",
+		__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+	hash = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+	if (!hash) {
+		IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+			__func__, ARG_TUPLE(&tuple));
+		return;
+	}
+
+	ct = nf_ct_tuplehash_to_ctrack(hash);
+	/* Show what happens instead of calling nf_ct_kill() */
+	if (!del_timer(&ct->timeout)) {
+		IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+			FMT_TUPLE "\n",
+			__func__, ct, ARG_TUPLE(&tuple));
+	} else {
+		IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+			FMT_TUPLE "\n",
+			__func__, ct, ARG_TUPLE(&tuple));
+		if (ct->timeout.function)
+			ct->timeout.function(ct->timeout.data);
+	}
+	nf_ct_put(ct);
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 000000000000..3414af70ee12
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,147 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* lock protecting the pe list above */
+static DEFINE_SPINLOCK(ip_vs_pe_lock);
+
+/* Bind a service with a pe: stores the pointer only, takes no reference */
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
+{
+	svc->pe = pe;
+}
+
+/* Unbind a service from its pe: clears svc->pe, drops no reference */
+void ip_vs_unbind_pe(struct ip_vs_service *svc)
+{
+	svc->pe = NULL;
+}
+
+/* Get pe in the pe list by name; a hit keeps its module reference (if any) */
+static struct ip_vs_pe *
+ip_vs_pe_getbyname(const char *pe_name)
+{
+	struct ip_vs_pe *pe;
+
+	IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+		  pe_name);
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	list_for_each_entry(pe, &ip_vs_pe, n_list) {
+		/* Test and get the modules atomically */
+		if (pe->module &&
+		    !try_module_get(pe->module)) {
+			/* This pe is just deleted */
+			continue;
+		}
+		if (strcmp(pe_name, pe->name)==0) {
+			/* HIT: return with the module reference still held */
+			spin_unlock_bh(&ip_vs_pe_lock);
+			return pe;
+		}
+		if (pe->module)	/* not the wanted pe: give the reference back */
+			module_put(pe->module);
+	}
+
+	spin_unlock_bh(&ip_vs_pe_lock);
+	return NULL;
+}
+
+/*
+ * Find a registered pe by name.  On a miss, ask for the ip_vs_pe_<name>
+ * module to be loaded and look once more; NULL if it is still not there.
+ */
+struct ip_vs_pe *ip_vs_pe_get(const char *name)
+{
+	struct ip_vs_pe *found = ip_vs_pe_getbyname(name);
+
+	if (found)
+		return found;
+
+	/* Not registered yet: the module may provide it once loaded */
+	request_module("ip_vs_pe_%s", name);
+
+	return ip_vs_pe_getbyname(name);
+}
+
+void ip_vs_pe_put(struct ip_vs_pe *pe)
+{
+	if (pe && pe->module)	/* no-op for NULL pe or pe without a module */
+		module_put(pe->module);
+}
+
+/* Register a pe in the pe list; returns 0, or -EINVAL if linked/duplicate */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	struct ip_vs_pe *tmp;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	if (!list_empty(&pe->n_list)) {
+		spin_unlock_bh(&ip_vs_pe_lock);
+		ip_vs_use_count_dec();	/* undo the increment above */
+		pr_err("%s(): [%s] pe already linked\n",
+		       __func__, pe->name);
+		return -EINVAL;
+	}
+
+	/* Make sure that the pe with this name doesn't exist
+	 * in the pe list.
+	 */
+	list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+		if (strcmp(tmp->name, pe->name) == 0) {
+			spin_unlock_bh(&ip_vs_pe_lock);
+			ip_vs_use_count_dec();	/* undo the increment above */
+			pr_err("%s(): [%s] pe already existed "
+			       "in the system\n", __func__, pe->name);
+			return -EINVAL;
+		}
+	}
+	/* Add it into the d-linked pe list */
+	list_add(&pe->n_list, &ip_vs_pe);
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	pr_info("[%s] pe registered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/* Unregister a pe from the pe list; returns 0, or -EINVAL if not linked */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	spin_lock_bh(&ip_vs_pe_lock);
+	if (list_empty(&pe->n_list)) {
+		spin_unlock_bh(&ip_vs_pe_lock);
+		pr_err("%s(): [%s] pe is not in the list. failed\n",
+		       __func__, pe->name);
+		return -EINVAL;
+	}
+
+	/* Remove it from the d-linked pe list */
+	list_del(&pe->n_list);
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	/* decrease the module use count (taken in register_ip_vs_pe()) */
+	ip_vs_use_count_dec();
+
+	pr_info("[%s] pe unregistered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 000000000000..b8b4e9620f3e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,169 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+#ifdef CONFIG_IP_VS_DEBUG	/* helpers used only for debug output */
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+				    const char *callid, size_t callid_len,
+				    int *idx)
+{
+	size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+	memcpy(buf + *idx, callid, len);	/* append at most 64 bytes */
+	buf[*idx+len] = '\0';
+	*idx += len + 1;			/* step over copy and its NUL */
+	return buf + *idx - len - 1;		/* start of the copy */
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len)					\
+	ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf),		\
+			 callid, len, &ip_vs_dbg_idx)
+#endif
+
+/*
+ * Locate the Call-ID header in the SIP message at dptr (datalen bytes).
+ * Returns 0 with *matchoff and *matchlen (relative to dptr) describing
+ * its value, or -EINVAL if there is no usable Call-ID.
+ */
+static int get_callid(const char *dptr, unsigned int dataoff,
+		      unsigned int datalen,
+		      unsigned int *matchoff, unsigned int *matchlen)
+{
+	while (1) {
+		int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+					    SIP_HDR_CALL_ID, matchoff,
+					    matchlen);
+		if (ret > 0)
+			break;
+		if (!ret)	/* no Call-ID header: nothing was matched */
+			return -EINVAL;
+		dataoff += *matchoff;
+	}
+
+	/* Empty callid is useless, so is a too large one */
+	if (!*matchlen || *matchlen > IP_VS_PEDATA_MAXLEN)
+		return -EINVAL;
+
+	/* SIP headers are always followed by a line terminator */
+	if (*matchoff + *matchlen == datalen)
+		return -EINVAL;
+
+	/* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+	 * RFC 3261 allows only CRLF, we support both. */
+	if (*(dptr + *matchoff + *matchlen) != '\r' &&
+	    *(dptr + *matchoff + *matchlen) != '\n')
+		return -EINVAL;
+
+	IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+		      IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+		      *matchlen);
+	return 0;
+}
+
+/*
+ * Fill p->pe_data / p->pe_data_len with a kmalloc'ed copy of the SIP
+ * Call-ID carried by a UDP packet; returns 0, -EINVAL or -ENOMEM.
+ */
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+	struct ip_vs_iphdr iph;
+	unsigned int dataoff, datalen, matchoff, matchlen;
+	const char *dptr;
+
+	ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+	/* Only useful with UDP */
+	if (iph.protocol != IPPROTO_UDP)
+		return -EINVAL;
+
+	/* No Data ? */
+	dataoff = iph.len + sizeof(struct udphdr);
+	if (dataoff >= skb->len)
+		return -EINVAL;
+
+	/* dptr starts at the SIP payload, so the search offset is 0 */
+	dptr = skb->data + dataoff;
+	datalen = skb->len - dataoff;
+	if (get_callid(dptr, 0, datalen, &matchoff, &matchlen))
+		return -EINVAL;
+
+	p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
+	if (!p->pe_data)
+		return -ENOMEM;
+	/* N.B: pe_data is only set on success,
+	 * this allows fallback to the default persistence logic on failure */
+	memcpy(p->pe_data, dptr + matchoff, matchlen);
+	p->pe_data_len = matchlen;
+	return 0;
+}
+
+/*
+ * Does the template ct match p (addresses, vport, protocol, pe_data)?
+ */
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+				  struct ip_vs_conn *ct)
+{
+	/* protocol should only be IPPROTO_IP if d_addr is a fwmark */
+	int vaddr_af = p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af;
+	bool hit;
+
+	hit = ct->af == p->af &&
+	      ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+	      ip_vs_addr_equal(vaddr_af, p->vaddr, &ct->vaddr) &&
+	      ct->vport == p->vport &&
+	      (ct->flags & IP_VS_CONN_F_TEMPLATE) &&
+	      ct->protocol == p->protocol &&
+	      ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+	      !memcmp(ct->pe_data, p->pe_data, p->pe_data_len);
+
+	IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+		      hit ? "hit" : "not hit");
+
+	return hit;
+}
+
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+				 u32 initval, bool inverse)
+{
+	return jhash(p->pe_data, p->pe_data_len, initval); /* inverse unused */
+}
+
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+	memcpy(buf, cp->pe_data, cp->pe_data_len);	/* no NUL is appended */
+	return cp->pe_data_len;				/* bytes copied */
+}
+
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+	.name =			"sip",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_sip_pe.n_list), /* unlinked */
+	.fill_param =		ip_vs_sip_fill_param,
+	.ct_match =		ip_vs_sip_ct_match,
+	.hashkey_raw =		ip_vs_sip_hashkey_raw,
+	.show_pe_data =		ip_vs_sip_show_pe_data,
+};
+
+static int __init ip_vs_sip_init(void)
+{
+	return register_ip_vs_pe(&ip_vs_sip_pe);	/* result is the init status */
+}
+
+static void __exit ip_vs_sip_cleanup(void)
+{
+	unregister_ip_vs_pe(&ip_vs_sip_pe);	/* return value is ignored */
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 027f654799fe..c53998390877 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
 	else if (ih->frag_off & htons(IP_OFFSET))
 		sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
 	else {
-		__be16 _ports[2], *pptr
-;
+		__be16 _ports[2], *pptr;
+
 		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
 					  sizeof(_ports), _ports);
 		if (pptr == NULL)
@@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 
 
 void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
 			  const struct sk_buff *skb,
 			  int offset,
 			  const char *msg)
 {
 #ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == htons(ETH_P_IPV6))
+	if (af == AF_INET6)
 		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
 	else
 #endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 1892dfc12fdd..3a0461117d3f 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -40,6 +40,19 @@ struct isakmp_hdr {
 
 #define PORT_ISAKMP	500
 
+static void
+ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
+			     int inverse, struct ip_vs_conn_param *p)
+{
+	if (likely(!inverse))	/* params use UDP and PORT_ISAKMP on both ends */
+		ip_vs_conn_fill_param(af, IPPROTO_UDP,
+				      &iph->saddr, htons(PORT_ISAKMP),
+				      &iph->daddr, htons(PORT_ISAKMP), p);
+	else			/* inverse: source and destination swapped */
+		ip_vs_conn_fill_param(af, IPPROTO_UDP,
+				      &iph->daddr, htons(PORT_ISAKMP),
+				      &iph->saddr, htons(PORT_ISAKMP), p);
+}
 
 static struct ip_vs_conn *
 ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
@@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 		   int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->saddr,
-				       htons(PORT_ISAKMP),
-				       &iph->daddr,
-				       htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->daddr,
-				       htons(PORT_ISAKMP),
-				       &iph->saddr,
-				       htons(PORT_ISAKMP));
-	}
-
+	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	cp = ip_vs_conn_in_get(&p);
 	if (!cp) {
 		/*
 		 * We are not sure if the packet is from our
@@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
 		    int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->saddr,
-					htons(PORT_ISAKMP),
-					&iph->daddr,
-					htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->daddr,
-					htons(PORT_ISAKMP),
-					&iph->saddr,
-					htons(PORT_ISAKMP));
-	}
-
+	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	cp = ip_vs_conn_out_get(&p);
 	if (!cp) {
 		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
 			      "%s%s %s->%s\n",
@@ -126,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	return 0;
 }
 
-
-static void
-ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		       int offset, const char *msg)
-{
-	char buf[256];
-	struct iphdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "TRUNCATED");
-	else
-		sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
-
-	pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		       int offset, const char *msg)
-{
-	char buf[256];
-	struct ipv6hdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "TRUNCATED");
-	else
-		sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
-
-	pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-#endif
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		    int offset, const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == htons(ETH_P_IPV6))
-		ah_esp_debug_packet_v6(pp, skb, offset, msg);
-	else
-#endif
-		ah_esp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
 static void ah_esp_init(struct ip_vs_protocol *pp)
 {
 	/* nothing to do now */
@@ -204,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
 	.register_app =		NULL,
 	.unregister_app =	NULL,
 	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	NULL,		/* ISAKMP */
 	.set_state_timeout =	NULL,
 };
@@ -228,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
 	.register_app =		NULL,
 	.unregister_app =	NULL,
 	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	NULL,		/* ISAKMP */
 };
 #endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 4c0855cb006e..1ea96bcd342b 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	if ((sch->type == SCTP_CID_INIT) &&
 	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
 				     &iph.daddr, sh->dest))) {
+		int ignored;
+
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
+		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		if (!*cpp && !ignored) {
 			*verdict = ip_vs_leave(svc, skb, pp);
 			return 0;
 		}
@@ -61,6 +63,7 @@ sctp_snat_handler(struct sk_buff *skb,
 {
 	sctp_sctphdr_t *sctph;
 	unsigned int sctphoff;
+	struct sk_buff *iter;
 	__be32 crc32;
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -89,8 +92,8 @@ sctp_snat_handler(struct sk_buff *skb,
 
 	/* Calculate the checksum */
 	crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
-	for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next)
-		crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb),
+	skb_walk_frags(skb, iter)
+		crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
 				          crc32);
 	crc32 = sctp_end_cksum(crc32);
 	sctph->checksum = crc32;
@@ -102,9 +105,9 @@ static int
 sctp_dnat_handler(struct sk_buff *skb,
 		  struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 {
-
 	sctp_sctphdr_t *sctph;
 	unsigned int sctphoff;
+	struct sk_buff *iter;
 	__be32 crc32;
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -133,8 +136,8 @@ sctp_dnat_handler(struct sk_buff *skb,
 
 	/* Calculate the checksum */
 	crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
-	for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next)
-		crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb),
+	skb_walk_frags(skb, iter)
+		crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
 					  crc32);
 	crc32 = sctp_end_cksum(crc32);
 	sctph->checksum = crc32;
@@ -145,9 +148,9 @@ sctp_dnat_handler(struct sk_buff *skb,
 static int
 sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 {
-	struct sk_buff *list = skb_shinfo(skb)->frag_list;
 	unsigned int sctphoff;
 	struct sctphdr *sh, _sctph;
+	struct sk_buff *iter;
 	__le32 cmp;
 	__le32 val;
 	__u32 tmp;
@@ -166,15 +169,15 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 	cmp = sh->checksum;
 
 	tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb));
-	for (; list; list = list->next)
-		tmp = sctp_update_cksum((__u8 *) list->data,
-					skb_headlen(list), tmp);
+	skb_walk_frags(skb, iter)
+		tmp = sctp_update_cksum((__u8 *) iter->data,
+					skb_headlen(iter), tmp);
 
 	val = sctp_end_cksum(tmp);
 
 	if (val != cmp) {
 		/* CRC failure, dump it. */
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+		IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 				"Failed checksum for");
 		return 0;
 	}
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 282d24de8592..f6c5200e2146 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return 0;
 	}
 
+	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 	if (th->syn &&
 	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
 				     th->dest))) {
+		int ignored;
+
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
+		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		if (!*cpp && !ignored) {
 			*verdict = ip_vs_leave(svc, skb, pp);
 			return 0;
 		}
@@ -101,15 +104,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
 		tcph->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 					 ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(tcph->check))));
+						csum_unfold(tcph->check))));
 	else
 #endif
 	tcph->check =
-		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 				ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(tcph->check))));
+						csum_unfold(tcph->check))));
 }
 
 
@@ -120,6 +123,7 @@ tcp_snat_handler(struct sk_buff *skb,
 	struct tcphdr *tcph;
 	unsigned int tcphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -134,13 +138,20 @@ tcp_snat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
 
 		/* Call application helper if needed */
-		if (!ip_vs_app_pkt_out(cp, skb))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - tcphoff;
+		else
+			payload_csum = 1;
 	}
 
 	tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -151,12 +162,13 @@ tcp_snat_handler(struct sk_buff *skb,
 		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 					htons(oldlen),
 					htons(skb->len - tcphoff));
-	} else if (!cp->app) {
+	} else if (!payload_csum) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		tcph->check = 0;
@@ -174,6 +186,7 @@ tcp_snat_handler(struct sk_buff *skb,
 							skb->len - tcphoff,
 							cp->protocol,
 							skb->csum);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 			  pp->name, tcph->check,
@@ -190,6 +203,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 	struct tcphdr *tcph;
 	unsigned int tcphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -204,6 +218,8 @@ tcp_dnat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
@@ -212,8 +228,13 @@ tcp_dnat_handler(struct sk_buff *skb,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn and iph ack_seq stuff
 		 */
-		if (!ip_vs_app_pkt_in(cp, skb))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - tcphoff;
+		else
+			payload_csum = 1;
 	}
 
 	tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -223,15 +244,16 @@ tcp_dnat_handler(struct sk_buff *skb,
 	 *	Adjust TCP checksums
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+		tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 					htons(oldlen),
 					htons(skb->len - tcphoff));
-	} else if (!cp->app) {
+	} else if (!payload_csum) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		tcph->check = 0;
@@ -278,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 					    skb->len - tcphoff,
 					    ipv6_hdr(skb)->nexthdr,
 					    skb->csum)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 						 "Failed checksum for");
 				return 0;
 			}
@@ -289,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 					      skb->len - tcphoff,
 					      ip_hdr(skb)->protocol,
 					      skb->csum)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 						 "Failed checksum for");
 				return 0;
 			}
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 8553231b5d41..9d106a06bb0a 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	svc = ip_vs_service_get(af, skb->mark, iph.protocol,
 				&iph.daddr, uh->dest);
 	if (svc) {
+		int ignored;
+
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
+		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		if (!*cpp && !ignored) {
 			*verdict = ip_vs_leave(svc, skb, pp);
 			return 0;
 		}
@@ -102,15 +104,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
 		uhdr->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 					 ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(uhdr->check))));
+						csum_unfold(uhdr->check))));
 	else
 #endif
 	uhdr->check =
-		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 				ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(uhdr->check))));
+						csum_unfold(uhdr->check))));
 }
 
 
@@ -121,6 +123,7 @@ udp_snat_handler(struct sk_buff *skb,
 	struct udphdr *udph;
 	unsigned int udphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -135,6 +138,8 @@ udp_snat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
@@ -142,8 +147,13 @@ udp_snat_handler(struct sk_buff *skb,
 		/*
 		 *	Call application helper if needed
 		 */
-		if (!ip_vs_app_pkt_out(cp, skb))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - udphoff;
+		else
+			payload_csum = 1;
 	}
 
 	udph = (void *)skb_network_header(skb) + udphoff;
@@ -156,12 +166,13 @@ udp_snat_handler(struct sk_buff *skb,
 		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
 					htons(oldlen),
 					htons(skb->len - udphoff));
-	} else if (!cp->app && (udph->check != 0)) {
+	} else if (!payload_csum && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		udph->check = 0;
@@ -181,6 +192,7 @@ udp_snat_handler(struct sk_buff *skb,
 							skb->csum);
 		if (udph->check == 0)
 			udph->check = CSUM_MANGLED_0;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 			  pp->name, udph->check,
 			  (char*)&(udph->check) - (char*)udph);
@@ -196,6 +208,7 @@ udp_dnat_handler(struct sk_buff *skb,
 	struct udphdr *udph;
 	unsigned int udphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -210,6 +223,8 @@ udp_dnat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
@@ -218,8 +233,13 @@ udp_dnat_handler(struct sk_buff *skb,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn
 		 */
-		if (!ip_vs_app_pkt_in(cp, skb))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - udphoff;
+		else
+			payload_csum = 1;
 	}
 
 	udph = (void *)skb_network_header(skb) + udphoff;
@@ -229,15 +249,16 @@ udp_dnat_handler(struct sk_buff *skb,
 	 *	Adjust UDP checksums
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+		udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
 					htons(oldlen),
 					htons(skb->len - udphoff));
-	} else if (!cp->app && (udph->check != 0)) {
+	} else if (!payload_csum && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		udph->check = 0;
@@ -293,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 						    skb->len - udphoff,
 						    ipv6_hdr(skb)->nexthdr,
 						    skb->csum)) {
-					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+					IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 							 "Failed checksum for");
 					return 0;
 				}
@@ -304,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 						      skb->len - udphoff,
 						      ip_hdr(skb)->protocol,
 						      skb->csum)) {
-					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+					IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 							 "Failed checksum for");
 					return 0;
 				}
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index bbc1ac795952..076ebe00435d 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -35,7 +35,7 @@
 static LIST_HEAD(ip_vs_schedulers);
 
 /* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_sched_lock);
+static DEFINE_SPINLOCK(ip_vs_sched_lock);
 
 
 /*
@@ -46,15 +46,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 {
 	int ret;
 
-	if (svc == NULL) {
-		pr_err("%s(): svc arg NULL\n", __func__);
-		return -EINVAL;
-	}
-	if (scheduler == NULL) {
-		pr_err("%s(): scheduler arg NULL\n", __func__);
-		return -EINVAL;
-	}
-
 	svc->scheduler = scheduler;
 
 	if (scheduler->init_service) {
@@ -74,18 +65,10 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
  */
 int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
 {
-	struct ip_vs_scheduler *sched;
+	struct ip_vs_scheduler *sched = svc->scheduler;
 
-	if (svc == NULL) {
-		pr_err("%s(): svc arg NULL\n", __func__);
-		return -EINVAL;
-	}
-
-	sched = svc->scheduler;
-	if (sched == NULL) {
-		pr_err("%s(): svc isn't bound\n", __func__);
-		return -EINVAL;
-	}
+	if (!sched)
+		return 0;
 
 	if (sched->done_service) {
 		if (sched->done_service(svc) != 0) {
@@ -108,7 +91,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
 
 	IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
 
-	read_lock_bh(&__ip_vs_sched_lock);
+	spin_lock_bh(&ip_vs_sched_lock);
 
 	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
 		/*
@@ -122,14 +105,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
 		}
 		if (strcmp(sched_name, sched->name)==0) {
 			/* HIT */
-			read_unlock_bh(&__ip_vs_sched_lock);
+			spin_unlock_bh(&ip_vs_sched_lock);
 			return sched;
 		}
 		if (sched->module)
 			module_put(sched->module);
 	}
 
-	read_unlock_bh(&__ip_vs_sched_lock);
+	spin_unlock_bh(&ip_vs_sched_lock);
 	return NULL;
 }
 
@@ -159,7 +142,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
 
 void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
 {
-	if (scheduler->module)
+	if (scheduler && scheduler->module)
 		module_put(scheduler->module);
 }
 
@@ -184,10 +167,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	/* increase the module use count */
 	ip_vs_use_count_inc();
 
-	write_lock_bh(&__ip_vs_sched_lock);
+	spin_lock_bh(&ip_vs_sched_lock);
 
 	if (!list_empty(&scheduler->n_list)) {
-		write_unlock_bh(&__ip_vs_sched_lock);
+		spin_unlock_bh(&ip_vs_sched_lock);
 		ip_vs_use_count_dec();
 		pr_err("%s(): [%s] scheduler already linked\n",
 		       __func__, scheduler->name);
@@ -200,7 +183,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	 */
 	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
 		if (strcmp(scheduler->name, sched->name) == 0) {
-			write_unlock_bh(&__ip_vs_sched_lock);
+			spin_unlock_bh(&ip_vs_sched_lock);
 			ip_vs_use_count_dec();
 			pr_err("%s(): [%s] scheduler already existed "
 			       "in the system\n", __func__, scheduler->name);
@@ -211,7 +194,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	 *	Add it into the d-linked scheduler list
 	 */
 	list_add(&scheduler->n_list, &ip_vs_schedulers);
-	write_unlock_bh(&__ip_vs_sched_lock);
+	spin_unlock_bh(&ip_vs_sched_lock);
 
 	pr_info("[%s] scheduler registered.\n", scheduler->name);
 
@@ -229,9 +212,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 		return -EINVAL;
 	}
 
-	write_lock_bh(&__ip_vs_sched_lock);
+	spin_lock_bh(&ip_vs_sched_lock);
 	if (list_empty(&scheduler->n_list)) {
-		write_unlock_bh(&__ip_vs_sched_lock);
+		spin_unlock_bh(&ip_vs_sched_lock);
 		pr_err("%s(): [%s] scheduler is not in the list. failed\n",
 		       __func__, scheduler->name);
 		return -EINVAL;
@@ -241,7 +224,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	 *	Remove it from the d-linked scheduler list
 	 */
 	list_del(&scheduler->n_list);
-	write_unlock_bh(&__ip_vs_sched_lock);
+	spin_unlock_bh(&ip_vs_sched_lock);
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 7ba06939829f..ab85aedea17e 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
 		ip_vs_sync_conn(cp->control);
 }
 
+static inline int
+ip_vs_conn_fill_param_sync(int af, int protocol,
+			   const union nf_inet_addr *caddr, __be16 cport,
+			   const union nf_inet_addr *vaddr, __be16 vport,
+			   struct ip_vs_conn_param *p)
+{
+	/* Plain wrapper for now. XXX: persistence engine is still ignored */
+	ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+	return 0;	/* no failure path yet; callers treat nonzero as error */
+}
 
 /*
  *      Process received multicast message and create the corresponding
@@ -301,6 +311,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_dest *dest;
+	struct ip_vs_conn_param param;
 	char *p;
 	int i;
 
@@ -370,18 +381,20 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 			}
 		}
 
-		if (!(flags & IP_VS_CONN_F_TEMPLATE))
-			cp = ip_vs_conn_in_get(AF_INET, s->protocol,
-					       (union nf_inet_addr *)&s->caddr,
-					       s->cport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport);
-		else
-			cp = ip_vs_ct_in_get(AF_INET, s->protocol,
-					     (union nf_inet_addr *)&s->caddr,
-					     s->cport,
-					     (union nf_inet_addr *)&s->vaddr,
-					     s->vport);
+		{
+			if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
+					      (union nf_inet_addr *)&s->caddr,
+					      s->cport,
+					      (union nf_inet_addr *)&s->vaddr,
+					      s->vport, &param)) {
+				pr_err("ip_vs_conn_fill_param_sync failed\n");
+				return;
+			}
+			if (!(flags & IP_VS_CONN_F_TEMPLATE))
+				cp = ip_vs_conn_in_get(&param);
+			else	/* IP_VS_CONN_F_TEMPLATE is set */
+				cp = ip_vs_ct_in_get(&param);
+		}
 		if (!cp) {
 			/*
 			 * Find the appropriate destination for the connection.
@@ -406,14 +419,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 				else
 					flags &= ~IP_VS_CONN_F_INACTIVE;
 			}
-			cp = ip_vs_conn_new(AF_INET, s->protocol,
-					    (union nf_inet_addr *)&s->caddr,
-					    s->cport,
-					    (union nf_inet_addr *)&s->vaddr,
-					    s->vport,
+			cp = ip_vs_conn_new(&param,
 					    (union nf_inet_addr *)&s->daddr,
-					    s->dport,
-					    flags, dest);
+					    s->dport, flags, dest);
 			if (dest)
 				atomic_dec(&dest->refcnt);
 			if (!cp) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 49df6bea6a2d..de04ea39cde8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -11,6 +11,16 @@
  *
  * Changes:
  *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
+ * - not all connections have destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -26,9 +36,9 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack.h>
 #include <linux/netfilter_ipv4.h>
 
 #include <net/ip_vs.h>
@@ -38,26 +48,27 @@
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+		u32 dst_cookie)
 {
 	struct dst_entry *old_dst;
 
 	old_dst = dest->dst_cache;
 	dest->dst_cache = dst;
 	dest->dst_rtos = rtos;
+	dest->dst_cookie = dst_cookie;
 	dst_release(old_dst);
 }
 
 static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
 {
 	struct dst_entry *dst = dest->dst_cache;
 
 	if (!dst)
 		return NULL;
-	if ((dst->obsolete
-	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-	    dst->ops->check(dst, cookie) == NULL) {
+	if ((dst->obsolete || rtos != dest->dst_rtos) &&
+	    dst->ops->check(dst, dest->dst_cookie) == NULL) {
 		dest->dst_cache = NULL;
 		dst_release(dst);
 		return NULL;
@@ -66,16 +77,24 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
 	return dst;
 }
 
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *	    &4=Allow redirect from remote daddr to local
+ */
 static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
+		   __be32 daddr, u32 rtos, int rt_mode)
 {
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct rtable *rt;			/* Route to the other host */
-	struct ip_vs_dest *dest = cp->dest;
+	struct rtable *ort;			/* Original route */
+	int local;
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
 		if (!(rt = (struct rtable *)
-		      __ip_vs_dst_check(dest, rtos, 0))) {
+		      __ip_vs_dst_check(dest, rtos))) {
 			struct flowi fl = {
 				.oif = 0,
 				.nl_u = {
@@ -85,13 +104,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 						.tos = rtos, } },
 			};
 
-			if (ip_route_output_key(&init_net, &rt, &fl)) {
+			if (ip_route_output_key(net, &rt, &fl)) {
 				spin_unlock(&dest->dst_lock);
 				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 					     &dest->addr.ip);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
 			IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
 				  &dest->addr.ip,
 				  atomic_read(&rt->dst.__refcnt), rtos);
@@ -102,78 +121,199 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 			.oif = 0,
 			.nl_u = {
 				.ip4_u = {
-					.daddr = cp->daddr.ip,
+					.daddr = daddr,
 					.saddr = 0,
 					.tos = rtos, } },
 		};
 
-		if (ip_route_output_key(&init_net, &rt, &fl)) {
+		if (ip_route_output_key(net, &rt, &fl)) {
 			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
-				     &cp->daddr.ip);
+				     &daddr);
 			return NULL;
 		}
 	}
 
+	local = rt->rt_flags & RTCF_LOCAL;
+	if (!((local ? 1 : 2) & rt_mode)) {
+		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
+			     (rt->rt_flags & RTCF_LOCAL) ?
+			     "local":"non-local", &rt->rt_dst);
+		ip_rt_put(rt);
+		return NULL;
+	}
+	if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
+					 ort->rt_flags & RTCF_LOCAL)) {
+		IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
+			     "requires NAT method, dest: %pI4\n",
+			     &ip_hdr(skb)->daddr, &rt->rt_dst);
+		ip_rt_put(rt);
+		return NULL;
+	}
+	if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
+		IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
+			     "to non-local address, dest: %pI4\n",
+			     &ip_hdr(skb)->saddr, &rt->rt_dst);
+		ip_rt_put(rt);
+		return NULL;
+	}
+
 	return rt;
 }
 
+/* Reroute packet to local IPv4 stack after DNAT; returns 1 if ok, 0 if not */
+static int
+__ip_vs_reroute_locally(struct sk_buff *skb)
+{
+	struct rtable *ort = skb_rtable(skb);	/* route attached on entry */
+	struct net *net = dev_net(ort->dst.dev);
+	struct iphdr *iph = ip_hdr(skb);
+
+	if (!ort->fl.iif) {
+		/* no input interface in the flow key: do an output lookup */
+		struct flowi fl = {
+			.oif = 0,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = iph->daddr,
+					.saddr = iph->saddr,
+					.tos = RT_TOS(iph->tos),
+				}
+			},
+			.mark = skb->mark,
+		};
+		struct rtable *nrt;
+
+		if (ip_route_output_key(net, &nrt, &fl))
+			return 0;
+		if (!(nrt->rt_flags & RTCF_LOCAL)) {
+			ip_rt_put(nrt);
+			return 0;
+		}
+		/* replace the old route with the new local one */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &nrt->dst);
+	} else {
+		unsigned long orefdst = skb->_skb_refdst;
+
+		if (ip_route_input(skb, iph->daddr, iph->saddr,
+				   iph->tos, skb->dev))
+			return 0;
+		refdst_drop(orefdst);
+	}
+	return 1;
+}
+
 #ifdef CONFIG_IP_VS_IPV6
+/* 1 if the route's device is set and flagged IFF_LOOPBACK, otherwise 0 */
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+	return rt->rt6i_dev ? !!(rt->rt6i_dev->flags & IFF_LOOPBACK) : 0;
+}
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+			struct in6_addr *ret_saddr, int do_xfrm)
+{
+	struct dst_entry *dst;
+	struct flowi fl = {	/* lookup key: only the destination is given */
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *daddr,
+			},
+		},
+	};
+
+	dst = ip6_route_output(net, NULL, &fl);
+	if (dst->error)
+		goto out_err;
+	if (!ret_saddr)		/* caller wants the route only */
+		return dst;
+	if (ipv6_addr_any(&fl.fl6_src) &&	/* no source address chosen yet */
+	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+			       &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+		goto out_err;
+	if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+		goto out_err;	/* NOTE(review): verify dst state here */
+	ipv6_addr_copy(ret_saddr, &fl.fl6_src);	/* hand source back to caller */
+	return dst;
+
+out_err:	/* shared failure exit: drop the route, log, return NULL */
+	dst_release(dst);
+	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+	return NULL;
+}
+
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *	    &4=Allow redirect from remote daddr to local
+ */
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
+		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
+		      int do_xfrm, int rt_mode)
 {
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct rt6_info *rt;			/* Route to the other host */
-	struct ip_vs_dest *dest = cp->dest;
+	struct rt6_info *ort;			/* Original route */
+	struct dst_entry *dst;
+	int local;
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
 		if (!rt) {
-			struct flowi fl = {
-				.oif = 0,
-				.nl_u = {
-					.ip6_u = {
-						.daddr = dest->addr.in6,
-						.saddr = {
-							.s6_addr32 =
-								{ 0, 0, 0, 0 },
-						},
-					},
-				},
-			};
+			u32 cookie;
 
-			rt = (struct rt6_info *)ip6_route_output(&init_net,
-								 NULL, &fl);
-			if (!rt) {
+			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+						      &dest->dst_saddr,
+						      do_xfrm);
+			if (!dst) {
 				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-					     &dest->addr.in6);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
-			IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
-				  &dest->addr.in6,
+			rt = (struct rt6_info *) dst;
+			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+				  &dest->addr.in6, &dest->dst_saddr,
 				  atomic_read(&rt->dst.__refcnt));
 		}
+		if (ret_saddr)
+			ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
 		spin_unlock(&dest->dst_lock);
 	} else {
-		struct flowi fl = {
-			.oif = 0,
-			.nl_u = {
-				.ip6_u = {
-					.daddr = cp->daddr.in6,
-					.saddr = {
-						.s6_addr32 = { 0, 0, 0, 0 },
-					},
-				},
-			},
-		};
-
-		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-		if (!rt) {
-			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-				     &cp->daddr.in6);
+		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+		if (!dst)
 			return NULL;
-		}
+		rt = (struct rt6_info *) dst;
+	}
+
+	local = __ip_vs_is_local_route6(rt);
+	if (!((local ? 1 : 2) & rt_mode)) {
+		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+			     local ? "local":"non-local", daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	if (local && !(rt_mode & 4) &&
+	    !((ort = (struct rt6_info *) skb_dst(skb)) &&
+	      __ip_vs_is_local_route6(ort))) {
+		IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
+			     "requires NAT method, dest: %pI6\n",
+			     &ipv6_hdr(skb)->daddr, daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+		     ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+				    IPV6_ADDR_LOOPBACK)) {
+		IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
+			     "to non-local address, dest: %pI6\n",
+			     &ipv6_hdr(skb)->saddr, daddr);
+		dst_release(&rt->dst);
+		return NULL;
 	}
 
 	return rt;
@@ -194,12 +334,44 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 	dst_release(old_dst);
 }
 
-#define IP_VS_XMIT(pf, skb, rt)				\
+#define IP_VS_XMIT_TUNNEL(skb, cp)				\
+({								\
+	int __ret = NF_ACCEPT;					\
+	/* value: NF_ACCEPT, or the conntrack confirm result */	\
+	(skb)->ipvs_property = 1;				\
+	if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))		\
+		__ret = ip_vs_confirm_conntrack(skb, cp);	\
+	if (__ret == NF_ACCEPT) {				\
+		nf_reset(skb);					\
+		skb_forward_csum(skb);				\
+	}							\
+	__ret;	/* result of the statement expression */	\
+})
+
+#define IP_VS_XMIT_NAT(pf, skb, cp, local)		\
+do {	/* NB: contains a return out of the calling function */	\
+	(skb)->ipvs_property = 1;			\
+	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
+		ip_vs_notrack(skb);			\
+	else	/* IP_VS_CONN_F_NFCT is set */		\
+		ip_vs_update_conntrack(skb, cp, 1);	\
+	if (local)					\
+		return NF_ACCEPT; /* exits the caller */	\
+	skb_forward_csum(skb);				\
+	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
+		skb_dst(skb)->dev, dst_output);		\
+} while (0)
+
+#define IP_VS_XMIT(pf, skb, cp, local)			\
 do {							\
 	(skb)->ipvs_property = 1;			\
+	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
+		ip_vs_notrack(skb);			\
+	if (local)					\
+		return NF_ACCEPT;			\
 	skb_forward_csum(skb);				\
 	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
-		(rt)->dst.dev, dst_output);		\
+		skb_dst(skb)->dev, dst_output);		\
 } while (0)
 
 
@@ -211,7 +383,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		struct ip_vs_protocol *pp)
 {
 	/* we do not touch skb and do not need pskb ptr */
-	return NF_ACCEPT;
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
 }
 
 
@@ -226,24 +398,13 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct rtable *rt;			/* Route to the other host */
 	struct iphdr  *iph = ip_hdr(skb);
-	u8     tos = iph->tos;
 	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip4_u = {
-				.daddr = iph->daddr,
-				.saddr = 0,
-				.tos = RT_TOS(tos), } },
-	};
 
 	EnterFunction(10);
 
-	if (ip_route_output_key(&init_net, &rt, &fl)) {
-		IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
-			     __func__, &iph->daddr);
+	if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
+				      RT_TOS(iph->tos), 2)))
 		goto tx_error_icmp;
-	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -271,7 +432,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -292,28 +453,22 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ipv6hdr  *iph = ipv6_hdr(skb);
 	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip6_u = {
-				.daddr = iph->daddr,
-				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-	};
 
 	EnterFunction(10);
 
-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-	if (!rt) {
-		IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
-			     __func__, &iph->daddr);
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
 		goto tx_error_icmp;
-	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
-		dst_release(&rt->dst);
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error;
 	}
@@ -335,7 +490,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -349,36 +504,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 }
 #endif
 
-void
-ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
-{
-	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
-	struct nf_conntrack_tuple new_tuple;
-
-	if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
-		return;
-
-	/*
-	 * The connection is not yet in the hashtable, so we update it.
-	 * CIP->VIP will remain the same, so leave the tuple in
-	 * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
-	 * real-server we will see RIP->DIP.
-	 */
-	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-	if (outin)
-		new_tuple.src.u3 = cp->daddr;
-	else
-		new_tuple.dst.u3 = cp->vaddr;
-	/*
-	 * This will also take care of UDP and other protocols.
-	 */
-	if (outin)
-		new_tuple.src.u.tcp.port = cp->dport;
-	else
-		new_tuple.dst.u.tcp.port = cp->vport;
-	nf_conntrack_alter_reply(ct, &new_tuple);
-}
-
 /*
  *      NAT transmitter (only for outside-to-inside nat forwarding)
  *      Not used for related ICMP
@@ -390,6 +515,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rtable *rt;		/* Route to the other host */
 	int mtu;
 	struct iphdr *iph = ip_hdr(skb);
+	int local;
 
 	EnterFunction(10);
 
@@ -403,16 +529,42 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(iph->tos), 1|2|4)))
 		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if ((cp->flags & IP_VS_CONN_F_SYNC) && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+					 "ip_vs_nat_xmit(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;	/* rt is still ours to release */
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
-		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
-		goto tx_error;
+		IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
+				 "ip_vs_nat_xmit(): frag needed for");
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
@@ -422,19 +574,28 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	/* mangle the packet */
 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-		goto tx_error;
+		goto tx_error_put;
 	ip_hdr(skb)->daddr = cp->daddr.ip;
 	ip_send_check(ip_hdr(skb));
 
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+	if (!local) {
+		/* drop old route */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
 
-	ip_vs_update_conntrack(skb, cp, 1);
+	IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
 
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
@@ -443,7 +604,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -451,8 +612,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
   tx_error_icmp:
 	dst_link_failure(skb);
   tx_error:
-	LeaveFunction(10);
 	kfree_skb(skb);
+	LeaveFunction(10);
 	return NF_STOLEN;
   tx_error_put:
 	ip_rt_put(rt);
@@ -466,6 +627,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct rt6_info *rt;		/* Route to the other host */
 	int mtu;
+	int local;
 
 	EnterFunction(10);
 
@@ -480,18 +642,49 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, 1|2|4)))
 		goto tx_error_icmp;
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if ((cp->flags & IP_VS_CONN_F_SYNC) && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+					 "ip_vs_nat_xmit_v6(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+				 "ip_vs_nat_xmit_v6(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
-		dst_release(&rt->dst);
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+		IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
 				 "ip_vs_nat_xmit_v6(): frag needed for");
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
@@ -501,18 +694,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	/* mangle the packet */
 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
 		goto tx_error;
-	ipv6_hdr(skb)->daddr = cp->daddr.in6;
+	ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
 
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
 
-	ip_vs_update_conntrack(skb, cp, 1);
+	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
 
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
@@ -521,7 +717,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -567,30 +763,27 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct iphdr  *old_iph = ip_hdr(skb);
 	u8     tos = old_iph->tos;
 	__be16 df = old_iph->frag_off;
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    mtu;
+	int ret;
 
 	EnterFunction(10);
 
-	if (skb->protocol != htons(ETH_P_IP)) {
-		IP_VS_DBG_RL("%s(): protocol error, "
-			     "ETH_P_IP: %d, skb protocol: %d\n",
-			     __func__, htons(ETH_P_IP), skb->protocol);
-		goto tx_error;
-	}
-
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(tos), 1|2)))
 		goto tx_error_icmp;
+	if (rt->rt_flags & RTCF_LOCAL) {
+		ip_rt_put(rt);
+		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+	}
 
 	tdev = rt->dst.dev;
 
 	mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
 	if (mtu < 68) {
-		ip_rt_put(rt);
 		IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 	if (skb_dst(skb))
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
@@ -600,9 +793,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if ((old_iph->frag_off & htons(IP_DF))
 	    && mtu < ntohs(old_iph->tot_len)) {
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		ip_rt_put(rt);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/*
@@ -625,7 +817,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ip_hdr(skb);
 	}
 
-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;
 
 	/* fix old IP header checksum */
 	ip_send_check(old_iph);
@@ -655,7 +847,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	ip_local_out(skb);
+	ret = IP_VS_XMIT_TUNNEL(skb, cp);
+	if (ret == NF_ACCEPT)
+		ip_local_out(skb);
+	else if (ret == NF_DROP)
+		kfree_skb(skb);
 
 	LeaveFunction(10);
 
@@ -667,6 +863,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	kfree_skb(skb);
 	LeaveFunction(10);
 	return NF_STOLEN;
+tx_error_put:
+	ip_rt_put(rt);
+	goto tx_error;
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -675,43 +874,44 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		     struct ip_vs_protocol *pp)
 {
 	struct rt6_info *rt;		/* Route to the other host */
+	struct in6_addr saddr;		/* Source for tunnel */
 	struct net_device *tdev;	/* Device to other host */
 	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct ipv6hdr  *iph;		/* Our new IP header */
 	unsigned int max_headroom;	/* The extra header space needed */
 	int    mtu;
+	int ret;
 
 	EnterFunction(10);
 
-	if (skb->protocol != htons(ETH_P_IPV6)) {
-		IP_VS_DBG_RL("%s(): protocol error, "
-			     "ETH_P_IPV6: %d, skb protocol: %d\n",
-			     __func__, htons(ETH_P_IPV6), skb->protocol);
-		goto tx_error;
-	}
-
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+					 &saddr, 1, 1|2)))
 		goto tx_error_icmp;
+	if (__ip_vs_is_local_route6(rt)) {
+		dst_release(&rt->dst);
+		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+	}
 
 	tdev = rt->dst.dev;
 
 	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
-	/* TODO IPv6: do we need this check in IPv6? */
-	if (mtu < 1280) {
-		dst_release(&rt->dst);
-		IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
-		goto tx_error;
+	if (mtu < IPV6_MIN_MTU) {
+		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+			     IPV6_MIN_MTU);
+		goto tx_error_put;
 	}
 	if (skb_dst(skb))
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
 	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/*
@@ -734,7 +934,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ipv6_hdr(skb);
 	}
 
-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;
 
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
@@ -754,14 +954,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
 	iph->priority		=	old_iph->priority;
 	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-	iph->daddr		=	rt->rt6i_dst.addr;
-	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
+	ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
+	ipv6_addr_copy(&iph->saddr, &saddr);
 	iph->hop_limit		=	old_iph->hop_limit;
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	ip6_local_out(skb);
+	ret = IP_VS_XMIT_TUNNEL(skb, cp);
+	if (ret == NF_ACCEPT)
+		ip6_local_out(skb);
+	else if (ret == NF_DROP)
+		kfree_skb(skb);
 
 	LeaveFunction(10);
 
@@ -773,6 +977,9 @@ tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
 	return NF_STOLEN;
+tx_error_put:
+	dst_release(&rt->dst);
+	goto tx_error;
 }
 #endif
 
@@ -791,8 +998,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(iph->tos), 1|2)))
 		goto tx_error_icmp;
+	if (rt->rt_flags & RTCF_LOCAL) {
+		ip_rt_put(rt);
+		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -820,7 +1032,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -843,13 +1055,22 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, 1|2)))
 		goto tx_error_icmp;
+	if (__ip_vs_is_local_route6(rt)) {
+		dst_release(&rt->dst);
+		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -873,7 +1094,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -899,6 +1120,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rtable	*rt;	/* Route to the other host */
 	int mtu;
 	int rc;
+	int local;
 
 	EnterFunction(10);
 
@@ -919,16 +1141,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
 		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI4\n",
+				  __func__, &cp->daddr.ip);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI4\n",
+			  __func__, &cp->daddr.ip);
+		goto tx_error_put;
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
-		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
@@ -938,16 +1187,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop the old route when skb is not shared */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	ip_vs_nat_icmp(skb, pp, cp, 0);
 
+	if (!local) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
+
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
 	rc = NF_STOLEN;
 	goto out;
@@ -973,6 +1233,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rt6_info	*rt;	/* Route to the other host */
 	int mtu;
 	int rc;
+	int local;
 
 	EnterFunction(10);
 
@@ -993,17 +1254,49 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */
 
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, 1|2|4)))
 		goto tx_error_icmp;
 
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI6\n",
+				  __func__, &cp->daddr.in6);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI6\n",
+			  __func__, &cp->daddr.in6);
+		goto tx_error_put;
+	}
+
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
-		dst_release(&rt->dst);
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
@@ -1013,16 +1306,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop the old route when skb is not shared */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
 
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
+
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
 	rc = NF_STOLEN;
 	goto out;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index df3eedb142ff..1eacf8d9966a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -65,32 +65,42 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 
-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
-				  u16 zone, unsigned int size, unsigned int rnd)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
 {
 	unsigned int n;
-	u_int32_t h;
 
 	/* The direction must be ignored, so we hash everything up to the
 	 * destination ports (which is a multiple of 4) and treat the last
 	 * three bytes manually.
 	 */
 	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
-	h = jhash2((u32 *)tuple, n,
-		   zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
-				 tuple->dst.protonum));
+	return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+		      (((__force __u16)tuple->dst.u.all << 16) |
+		      tuple->dst.protonum));
+}
+
+/*
+ * Map a 32-bit hash onto a bucket index in [0, size) by taking the high
+ * 32 bits of the 64-bit product hash * size.
+ */
+static u32 __hash_bucket(u32 hash, unsigned int size)
+{
+	u64 scaled = (u64)hash * size;
+
+	return scaled >> 32;
+}
+
+/* Bucket index of @hash, scaled to net->ct.htable_size. */
+static u32 hash_bucket(u32 hash, const struct net *net)
+{
+	unsigned int nbuckets = net->ct.htable_size;
+
+	return __hash_bucket(hash, nbuckets);
+}
 
-	return ((u64)h * size) >> 32;
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+				  u16 zone, unsigned int size)
+{
+	return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
 }
 
 static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
 				       const struct nf_conntrack_tuple *tuple)
 {
-	return __hash_conntrack(tuple, zone, net->ct.htable_size,
-				nf_conntrack_hash_rnd);
+	return __hash_conntrack(tuple, zone, net->ct.htable_size);
 }
 
 bool
@@ -292,20 +302,20 @@ static void death_by_timeout(unsigned long ul_conntrack)
  * OR
  * - Caller must lock nf_conntrack_lock before calling this function
  */
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(struct net *net, u16 zone,
-		    const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple, u32 hash)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
-	unsigned int hash = hash_conntrack(net, zone, tuple);
+	unsigned int bucket = hash_bucket(hash, net);
 
 	/* Disable BHs the entire time since we normally need to disable them
 	 * at least once for the stats anyway.
 	 */
 	local_bh_disable();
 begin:
-	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
 		if (nf_ct_tuple_equal(tuple, &h->tuple) &&
 		    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
 			NF_CT_STAT_INC(net, found);
@@ -319,7 +329,7 @@ begin:
 	 * not the expected one, we must restart lookup.
 	 * We probably met an item that was moved to another chain.
 	 */
-	if (get_nulls_value(n) != hash) {
+	if (get_nulls_value(n) != bucket) {
 		NF_CT_STAT_INC(net, search_restart);
 		goto begin;
 	}
@@ -327,19 +337,27 @@ begin:
 
 	return NULL;
 }
+
+/*
+ * Look up @tuple in zone @zone of @net: compute the raw tuple hash here
+ * and hand the actual search over to ____nf_conntrack_find().
+ */
+struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(struct net *net, u16 zone,
+		    const struct nf_conntrack_tuple *tuple)
+{
+	u32 hash = hash_conntrack_raw(tuple, zone);
+
+	return ____nf_conntrack_find(net, zone, tuple, hash);
+}
 EXPORT_SYMBOL_GPL(__nf_conntrack_find);
 
 /* Find a connection corresponding to a tuple. */
-struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
-		      const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+			const struct nf_conntrack_tuple *tuple, u32 hash)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 
 	rcu_read_lock();
 begin:
-	h = __nf_conntrack_find(net, zone, tuple);
+	h = ____nf_conntrack_find(net, zone, tuple, hash);
 	if (h) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		if (unlikely(nf_ct_is_dying(ct) ||
@@ -357,6 +375,14 @@ begin:
 
 	return h;
 }
+
+/*
+ * Exported entry point: compute the raw hash of @tuple and delegate to
+ * __nf_conntrack_find_get(), which performs the lookup with that hash.
+ */
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple)
+{
+	u32 hash = hash_conntrack_raw(tuple, zone);
+
+	return __nf_conntrack_find_get(net, zone, tuple, hash);
+}
 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 
 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
@@ -409,8 +435,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		return NF_ACCEPT;
 
 	zone = nf_ct_zone(ct);
-	hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	/* reuse the hash saved before */
+	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+	hash = hash_bucket(hash, net);
+	repl_hash = hash_conntrack(net, zone,
+				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
 	/* We're not in hash table, and we refuse to set up related
 	   connections for unconfirmed conns.  But packet copies and
@@ -567,17 +596,29 @@ static noinline int early_drop(struct net *net, unsigned int hash)
 	return dropped;
 }
 
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
-				   const struct nf_conntrack_tuple *orig,
-				   const struct nf_conntrack_tuple *repl,
-				   gfp_t gfp)
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+		     const struct nf_conntrack_tuple *orig,
+		     const struct nf_conntrack_tuple *repl,
+		     gfp_t gfp, u32 hash)
 {
 	struct nf_conn *ct;
 
-	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
-		get_random_bytes(&nf_conntrack_hash_rnd,
-				sizeof(nf_conntrack_hash_rnd));
-		nf_conntrack_hash_rnd_initted = 1;
+	if (unlikely(!nf_conntrack_hash_rnd)) {
+		unsigned int rand;
+
+		/*
+		 * Why not initialize nf_conntrack_hash_rnd in an init function?
+		 * Because there is too little entropy early during boot, so
+		 * we initialize it as late as possible.
+		 */
+		do {
+			get_random_bytes(&rand, sizeof(rand));
+		} while (!rand);
+		cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+
+		/* recompute the hash as nf_conntrack_hash_rnd is initialized */
+		hash = hash_conntrack_raw(orig, zone);
 	}
 
 	/* We don't want any race condition at early drop stage */
@@ -585,8 +626,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-		unsigned int hash = hash_conntrack(net, zone, orig);
-		if (!early_drop(net, hash)) {
+		if (!early_drop(net, hash_bucket(hash, net))) {
 			atomic_dec(&net->ct.count);
 			if (net_ratelimit())
 				printk(KERN_WARNING
@@ -616,7 +656,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
-	ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
+	/* save hash for reusing when confirming */
+	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
 	/* Don't set timer yet: wait for confirmation */
 	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
 	write_pnet(&ct->ct_net, net);
@@ -643,6 +684,14 @@ out_free:
 	return ERR_PTR(-ENOMEM);
 #endif
 }
+
+/*
+ * Exported allocator for callers that have no precomputed tuple hash.
+ * A hash of 0 is handed to __nf_conntrack_alloc(), which only recomputes
+ * it when nf_conntrack_hash_rnd has not been initialized yet.
+ */
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+				   const struct nf_conntrack_tuple *orig,
+				   const struct nf_conntrack_tuple *repl,
+				   gfp_t gfp)
+{
+	const u32 no_hash = 0;
+
+	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, no_hash);
+}
 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 
 void nf_conntrack_free(struct nf_conn *ct)
@@ -664,7 +713,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	       struct nf_conntrack_l3proto *l3proto,
 	       struct nf_conntrack_l4proto *l4proto,
 	       struct sk_buff *skb,
-	       unsigned int dataoff)
+	       unsigned int dataoff, u32 hash)
 {
 	struct nf_conn *ct;
 	struct nf_conn_help *help;
@@ -678,7 +727,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 		return NULL;
 	}
 
-	ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
+	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+				  hash);
 	if (IS_ERR(ct)) {
 		pr_debug("Can't allocate conntrack.\n");
 		return (struct nf_conntrack_tuple_hash *)ct;
@@ -755,6 +805,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+	u32 hash;
 
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
 			     dataoff, l3num, protonum, &tuple, l3proto,
@@ -764,10 +815,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	}
 
 	/* look for tuple match */
-	h = nf_conntrack_find_get(net, zone, &tuple);
+	hash = hash_conntrack_raw(&tuple, zone);
+	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
 	if (!h) {
 		h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
-				   skb, dataoff);
+				   skb, dataoff, hash);
 		if (!h)
 			return NULL;
 		if (IS_ERR(h))
@@ -1307,8 +1359,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 			ct = nf_ct_tuplehash_to_ctrack(h);
 			hlist_nulls_del_rcu(&h->hnnode);
 			bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
-						  hashsize,
-						  nf_conntrack_hash_rnd);
+						  hashsize);
 			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
 		}
 	}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index acb29ccaa41f..46e8966912b1 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -38,25 +38,30 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly;
 
 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 
+static HLIST_HEAD(nf_ct_userspace_expect_list);
+
 /* nf_conntrack_expect helper functions */
-void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+				u32 pid, int report)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
 	struct net *net = nf_ct_exp_net(exp);
 
-	NF_CT_ASSERT(master_help);
 	NF_CT_ASSERT(!timer_pending(&exp->timeout));
 
 	hlist_del_rcu(&exp->hnode);
 	net->ct.expect_count--;
 
 	hlist_del(&exp->lnode);
-	master_help->expecting[exp->class]--;
+	if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
+		master_help->expecting[exp->class]--;
+
+	nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
 	nf_ct_expect_put(exp);
 
 	NF_CT_STAT_INC(net, expect_delete);
 }
-EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
 
 static void nf_ct_expectation_timed_out(unsigned long ul_expect)
 {
@@ -320,16 +325,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 
 	atomic_inc(&exp->use);
 
-	hlist_add_head(&exp->lnode, &master_help->expectations);
-	master_help->expecting[exp->class]++;
+	if (master_help) {
+		hlist_add_head(&exp->lnode, &master_help->expectations);
+		master_help->expecting[exp->class]++;
+	} else if (exp->flags & NF_CT_EXPECT_USERSPACE)
+		hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
 
 	hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
 	net->ct.expect_count++;
 
 	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
 		    (unsigned long)exp);
-	p = &master_help->helper->expect_policy[exp->class];
-	exp->timeout.expires = jiffies + p->timeout * HZ;
+	if (master_help) {
+		p = &master_help->helper->expect_policy[exp->class];
+		exp->timeout.expires = jiffies + p->timeout * HZ;
+	}
 	add_timer(&exp->timeout);
 
 	atomic_inc(&exp->use);
@@ -380,7 +390,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 	unsigned int h;
 	int ret = 1;
 
-	if (!master_help->helper) {
+	/* Don't allow expectations created from kernel-space with no helper */
+	if (!(expect->flags & NF_CT_EXPECT_USERSPACE) &&
+	    (!master_help || (master_help && !master_help->helper))) {
 		ret = -ESHUTDOWN;
 		goto out;
 	}
@@ -398,13 +410,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 		}
 	}
 	/* Will be over limit? */
-	p = &master_help->helper->expect_policy[expect->class];
-	if (p->max_expected &&
-	    master_help->expecting[expect->class] >= p->max_expected) {
-		evict_oldest_expect(master, expect);
-		if (master_help->expecting[expect->class] >= p->max_expected) {
-			ret = -EMFILE;
-			goto out;
+	if (master_help) {
+		p = &master_help->helper->expect_policy[expect->class];
+		if (p->max_expected &&
+		    master_help->expecting[expect->class] >= p->max_expected) {
+			evict_oldest_expect(master, expect);
+			if (master_help->expecting[expect->class]
+						>= p->max_expected) {
+				ret = -EMFILE;
+				goto out;
+			}
 		}
 	}
 
@@ -439,6 +454,21 @@ out:
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
 
+/*
+ * Walk nf_ct_userspace_expect_list and tear down every expectation whose
+ * timer could still be cancelled: unlink it and drop one reference.
+ * Entries whose timer was no longer pending are left untouched here.
+ */
+void nf_ct_remove_userspace_expectations(void)
+{
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *pos, *tmp;
+
+	hlist_for_each_entry_safe(exp, pos, tmp,
+				  &nf_ct_userspace_expect_list, lnode) {
+		if (!del_timer(&exp->timeout))
+			continue;
+
+		nf_ct_unlink_expect(exp);
+		nf_ct_expect_put(exp);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
+
 #ifdef CONFIG_PROC_FS
 struct ct_expect_iter_state {
 	struct seq_net_private p;
@@ -529,8 +559,12 @@ static int exp_seq_show(struct seq_file *s, void *v)
 		seq_printf(s, "PERMANENT");
 		delim = ",";
 	}
-	if (expect->flags & NF_CT_EXPECT_INACTIVE)
+	if (expect->flags & NF_CT_EXPECT_INACTIVE) {
 		seq_printf(s, "%sINACTIVE", delim);
+		delim = ",";
+	}
+	if (expect->flags & NF_CT_EXPECT_USERSPACE)
+		seq_printf(s, "%sUSERSPACE", delim);
 
 	helper = rcu_dereference(nfct_help(expect->master)->helper);
 	if (helper) {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 146476c6441a..b729ace1dcc1 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1588,8 +1588,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
 			  const struct nf_conntrack_expect *exp)
 {
 	struct nf_conn *master = exp->master;
-	struct nf_conntrack_helper *helper;
 	long timeout = (exp->timeout.expires - jiffies) / HZ;
+	struct nf_conn_help *help;
 
 	if (timeout < 0)
 		timeout = 0;
@@ -1605,9 +1605,15 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
 
 	NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
 	NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
-	helper = rcu_dereference(nfct_help(master)->helper);
-	if (helper)
-		NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+	NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
+	help = nfct_help(master);
+	if (help) {
+		struct nf_conntrack_helper *helper;
+
+		helper = rcu_dereference(help->helper);
+		if (helper)
+			NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+	}
 
 	return 0;
 
@@ -1654,17 +1660,20 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
 	struct sk_buff *skb;
-	unsigned int type;
+	unsigned int type, group;
 	int flags = 0;
 
-	if (events & (1 << IPEXP_NEW)) {
+	if (events & (1 << IPEXP_DESTROY)) {
+		type = IPCTNL_MSG_EXP_DELETE;
+		group = NFNLGRP_CONNTRACK_EXP_DESTROY;
+	} else if (events & (1 << IPEXP_NEW)) {
 		type = IPCTNL_MSG_EXP_NEW;
 		flags = NLM_F_CREATE|NLM_F_EXCL;
+		group = NFNLGRP_CONNTRACK_EXP_NEW;
 	} else
 		return 0;
 
-	if (!item->report &&
-	    !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW))
+	if (!item->report && !nfnetlink_has_listeners(net, group))
 		return 0;
 
 	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
@@ -1687,8 +1696,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 	rcu_read_unlock();
 
 	nlmsg_end(skb, nlh);
-	nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW,
-		       item->report, GFP_ATOMIC);
+	nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC);
 	return 0;
 
 nla_put_failure:
@@ -1761,6 +1769,8 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
 	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
 	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
 	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING },
+	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
+	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 },
 };
 
 static int
@@ -1869,7 +1879,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 		}
 
 		/* after list removal, usage count == 1 */
-		nf_ct_unexpect_related(exp);
+		spin_lock_bh(&nf_conntrack_lock);
+		if (del_timer(&exp->timeout)) {
+			nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid,
+						   nlmsg_report(nlh));
+			nf_ct_expect_put(exp);
+		}
+		spin_unlock_bh(&nf_conntrack_lock);
 		/* have to put what we 'get' above.
 		 * after this line usage count == 0 */
 		nf_ct_expect_put(exp);
@@ -1886,7 +1902,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 				m_help = nfct_help(exp->master);
 				if (!strcmp(m_help->helper->name, name) &&
 				    del_timer(&exp->timeout)) {
-					nf_ct_unlink_expect(exp);
+					nf_ct_unlink_expect_report(exp,
+							NETLINK_CB(skb).pid,
+							nlmsg_report(nlh));
 					nf_ct_expect_put(exp);
 				}
 			}
@@ -1900,7 +1918,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 						  &net->ct.expect_hash[i],
 						  hnode) {
 				if (del_timer(&exp->timeout)) {
-					nf_ct_unlink_expect(exp);
+					nf_ct_unlink_expect_report(exp,
+							NETLINK_CB(skb).pid,
+							nlmsg_report(nlh));
 					nf_ct_expect_put(exp);
 				}
 			}
@@ -1946,23 +1966,35 @@ ctnetlink_create_expect(struct net *net, u16 zone,
 	if (!h)
 		return -ENOENT;
 	ct = nf_ct_tuplehash_to_ctrack(h);
-	help = nfct_help(ct);
-
-	if (!help || !help->helper) {
-		/* such conntrack hasn't got any helper, abort */
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
 	exp = nf_ct_expect_alloc(ct);
 	if (!exp) {
 		err = -ENOMEM;
 		goto out;
 	}
+	help = nfct_help(ct);
+	if (!help) {
+		if (!cda[CTA_EXPECT_TIMEOUT]) {
+			err = -EINVAL;
+			goto out;
+		}
+		exp->timeout.expires =
+		  jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+
+		exp->flags = NF_CT_EXPECT_USERSPACE;
+		if (cda[CTA_EXPECT_FLAGS]) {
+			exp->flags |=
+				ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+		}
+	} else {
+		if (cda[CTA_EXPECT_FLAGS]) {
+			exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+			exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+		} else
+			exp->flags = 0;
+	}
 
 	exp->class = 0;
 	exp->expectfn = NULL;
-	exp->flags = 0;
 	exp->master = ct;
 	exp->helper = NULL;
 	memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
@@ -2130,6 +2162,7 @@ static void __exit ctnetlink_exit(void)
 {
 	pr_info("ctnetlink: unregistering from nfnetlink.\n");
 
+	nf_ct_remove_userspace_expectations();
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
 	nf_conntrack_unregister_notifier(&ctnl_notifier);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index f64de9544866..bcf47eb518ef 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
 	return len;
 }
 
+static int iswordc(const char c)
+{
+	if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
+	    (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+	    c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+	    c == '{' || c == '}' || c == '~')
+		return 1;
+	return 0;
+}
+
+static int word_len(const char *dptr, const char *limit)
+{
+	int len = 0;
+	while (dptr < limit && iswordc(*dptr)) {
+		dptr++;
+		len++;
+	}
+	return len;
+}
+
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	int len, domain_len;
+
+	len = word_len(dptr, limit);
+	dptr += len;
+	if (!len || dptr == limit || *dptr != '@')
+		return len;
+	dptr++;
+	len++;
+
+	domain_len = word_len(dptr, limit);
+	if (!domain_len)
+		return 0;
+	return len + domain_len;
+}
+
 /* get media type + port length */
 static int media_len(const struct nf_conn *ct, const char *dptr,
 		     const char *limit, int *shift)
@@ -152,6 +190,9 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
 	const char *end;
 	int ret = 0;
 
+	if (!ct)
+		return 0;
+
 	memset(addr, 0, sizeof(*addr));
 	switch (nf_ct_l3num(ct)) {
 	case AF_INET:
@@ -296,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = {
 	[SIP_HDR_VIA_TCP]		= SIP_HDR("Via", "v", "TCP ", epaddr_len),
 	[SIP_HDR_EXPIRES]		= SIP_HDR("Expires", NULL, NULL, digits_len),
 	[SIP_HDR_CONTENT_LENGTH]	= SIP_HDR("Content-Length", "l", NULL, digits_len),
+	[SIP_HDR_CALL_ID]		= SIP_HDR("Call-Id", "i", NULL, callid_len),
 };
 
 static const char *sip_follow_continuation(const char *dptr, const char *limit)
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index daab8c4a903c..4d87befb04c0 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -18,41 +18,6 @@
 #include <net/udp.h>
 #include <net/netfilter/nf_tproxy_core.h>
 
-struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
-		      const __be32 saddr, const __be32 daddr,
-		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in, bool listening_only)
-{
-	struct sock *sk;
-
-	/* look up socket */
-	switch (protocol) {
-	case IPPROTO_TCP:
-		if (listening_only)
-			sk = __inet_lookup_listener(net, &tcp_hashinfo,
-						    daddr, ntohs(dport),
-						    in->ifindex);
-		else
-			sk = __inet_lookup(net, &tcp_hashinfo,
-					   saddr, sport, daddr, dport,
-					   in->ifindex);
-		break;
-	case IPPROTO_UDP:
-		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
-				     in->ifindex);
-		break;
-	default:
-		WARN_ON(1);
-		sk = NULL;
-	}
-
-	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk);
-
-	return sk;
-}
-EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
 
 static void
 nf_tproxy_destructor(struct sk_buff *skb)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e34622fa0003..80463507420e 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -116,10 +116,8 @@ EXPORT_SYMBOL(xt_register_targets);
 void
 xt_unregister_targets(struct xt_target *target, unsigned int n)
 {
-	unsigned int i;
-
-	for (i = 0; i < n; i++)
-		xt_unregister_target(&target[i]);
+	while (n-- > 0)
+		xt_unregister_target(&target[n]);
 }
 EXPORT_SYMBOL(xt_unregister_targets);
 
@@ -174,10 +172,8 @@ EXPORT_SYMBOL(xt_register_matches);
 void
 xt_unregister_matches(struct xt_match *match, unsigned int n)
 {
-	unsigned int i;
-
-	for (i = 0; i < n; i++)
-		xt_unregister_match(&match[i]);
+	while (n-- > 0)
+		xt_unregister_match(&match[n]);
 }
 EXPORT_SYMBOL(xt_unregister_matches);
 
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index c61294d85fda..19c482caf30b 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -1,7 +1,7 @@
 /*
  * Transparent proxy support for Linux/iptables
  *
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
  * Author: Balazs Scheidler, Krisztian Kovacs
  *
  * This program is free software; you can redistribute it and/or modify
@@ -16,19 +16,96 @@
 #include <net/checksum.h>
 #include <net/udp.h>
 #include <net/inet_sock.h>
-
+#include <linux/inetdevice.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter/xt_TPROXY.h>
 
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
 #include <net/netfilter/nf_tproxy_core.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+	struct in_device *indev;
+	__be32 laddr;
+
+	if (user_laddr)
+		return user_laddr;
+
+	laddr = 0;
+	rcu_read_lock();
+	indev = __in_dev_get_rcu(skb->dev);
+	for_primary_ifa(indev) {
+		laddr = ifa->ifa_local;
+		break;
+	} endfor_ifa(indev);
+	rcu_read_unlock();
+
+	return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @laddr:	IPv4 address to redirect to or zero.
+ * @lport:	TCP port to redirect to or zero.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait4() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+			struct sock *sk)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr _hdr, *hp;
+
+	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					    iph->saddr, laddr ? laddr : iph->daddr,
+					    hp->source, lport ? lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
 
 static unsigned int
-tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+	   u_int32_t mark_mask, u_int32_t mark_value)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	const struct xt_tproxy_target_info *tgi = par->targinfo;
 	struct udphdr _hdr, *hp;
 	struct sock *sk;
 
@@ -36,12 +113,195 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (hp == NULL)
 		return NF_DROP;
 
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-				   iph->saddr,
-				   tgi->laddr ? tgi->laddr : iph->daddr,
-				   hp->source,
-				   tgi->lport ? tgi->lport : hp->dest,
-				   par->in, true);
+				   iph->saddr, iph->daddr,
+				   hp->source, hp->dest,
+				   skb->dev, NFT_LOOKUP_ESTABLISHED);
+
+	laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+	if (!lport)
+		lport = hp->dest;
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
+	else if (!sk)
+		/* no, there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
+		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					   iph->saddr, laddr,
+					   hp->source, lport,
+					   skb->dev, NFT_LOOKUP_LISTENER);
+
+	/* NOTE: assign_sock consumes our sk reference */
+	if (sk && nf_tproxy_assign_sock(skb, sk)) {
+		/* This should be in a separate target, but we don't do multiple
+		   targets on the same rule yet */
+		skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
+
+		pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+			 iph->protocol, &iph->daddr, ntohs(hp->dest),
+			 &laddr, ntohs(lport), skb->mark);
+		return NF_ACCEPT;
+	}
+
+	pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+		 iph->protocol, &iph->saddr, ntohs(hp->source),
+		 &iph->daddr, ntohs(hp->dest), skb->mark);
+	return NF_DROP;
+}
+
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info *tgi = par->targinfo;
+
+	return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+	      const struct in6_addr *daddr)
+{
+	struct inet6_dev *indev;
+	struct inet6_ifaddr *ifa;
+	struct in6_addr *laddr;
+
+	if (!ipv6_addr_any(user_laddr))
+		return user_laddr;
+	laddr = NULL;
+
+	rcu_read_lock();
+	indev = __in6_dev_get(skb->dev);
+	if (indev)
+		list_for_each_entry(ifa, &indev->addr_list, if_list) {
+			if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+				continue;
+
+			laddr = &ifa->addr;
+			break;
+		}
+	rcu_read_unlock();
+
+	return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @tproto:	Transport protocol.
+ * @thoff:	Transport protocol header offset.
+ * @par:	Iptables target parameters.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+			 const struct xt_action_param *par,
+			 struct sock *sk)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr _hdr, *hp;
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					    &iph->saddr,
+					    tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
+					    hp->source,
+					    tgi->lport ? tgi->lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
+
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+	struct udphdr _hdr, *hp;
+	struct sock *sk;
+	const struct in6_addr *laddr;
+	__be16 lport;
+	int thoff;
+	int tproto;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   &iph->saddr, &iph->daddr,
+				   hp->source, hp->dest,
+				   par->in, NFT_LOOKUP_ESTABLISHED);
+
+	laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+	lport = tgi->lport ? tgi->lport : hp->dest;
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+	else if (!sk)
+		/* no there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
+		sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					   &iph->saddr, laddr,
+					   hp->source, lport,
+					   par->in, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
 	if (sk && nf_tproxy_assign_sock(skb, sk)) {
@@ -49,19 +309,34 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		   targets on the same rule yet */
 		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
 
-		pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-			 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-			 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+		pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+			 tproto, &iph->saddr, ntohs(hp->source),
+			 laddr, ntohs(lport), skb->mark);
 		return NF_ACCEPT;
 	}
 
-	pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-		 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-		 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+	pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+		 tproto, &iph->saddr, ntohs(hp->source),
+		 &iph->daddr, ntohs(hp->dest), skb->mark);
+
 	return NF_DROP;
 }
 
-static int tproxy_tg_check(const struct xt_tgchk_param *par)
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+	const struct ip6t_ip6 *i = par->entryinfo;
+
+	if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+	    && !(i->flags & IP6T_INV_PROTO))
+		return 0;
+
+	pr_info("Can be used only in combination with "
+		"either -p tcp or -p udp\n");
+	return -EINVAL;
+}
+#endif
+
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
 {
 	const struct ipt_ip *i = par->entryinfo;
 
@@ -74,31 +349,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
 	return -EINVAL;
 }
 
-static struct xt_target tproxy_tg_reg __read_mostly = {
-	.name		= "TPROXY",
-	.family		= AF_INET,
-	.table		= "mangle",
-	.target		= tproxy_tg,
-	.targetsize	= sizeof(struct xt_tproxy_target_info),
-	.checkentry	= tproxy_tg_check,
-	.hooks		= 1 << NF_INET_PRE_ROUTING,
-	.me		= THIS_MODULE,
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tproxy_tg4_v0,
+		.revision	= 0,
+		.targetsize	= sizeof(struct xt_tproxy_target_info),
+		.checkentry	= tproxy_tg4_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tproxy_tg4_v1,
+		.revision	= 1,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg4_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV6,
+		.table		= "mangle",
+		.target		= tproxy_tg6_v1,
+		.revision	= 1,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg6_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+#endif
+
 };
 
 static int __init tproxy_tg_init(void)
 {
 	nf_defrag_ipv4_enable();
-	return xt_register_target(&tproxy_tg_reg);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	nf_defrag_ipv6_enable();
+#endif
+
+	return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 static void __exit tproxy_tg_exit(void)
 {
-	xt_unregister_target(&tproxy_tg_reg);
+	xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 module_init(tproxy_tg_init);
 module_exit(tproxy_tg_exit);
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
 MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
 MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index b46a8390896d..9228ee0dc11a 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -448,6 +448,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 {
 	__be16 _ports[2], *ports;
 	u8 nexthdr;
+	int poff;
 
 	memset(dst, 0, sizeof(*dst));
 
@@ -492,19 +493,13 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 		return 0;
 	}
 
-	switch (nexthdr) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_UDPLITE:
-	case IPPROTO_SCTP:
-	case IPPROTO_DCCP:
-		ports = skb_header_pointer(skb, protoff, sizeof(_ports),
+	poff = proto_ports_offset(nexthdr);
+	if (poff >= 0) {
+		ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports),
 					   &_ports);
-		break;
-	default:
+	} else {
 		_ports[0] = _ports[1] = 0;
 		ports = _ports;
-		break;
 	}
 	if (!ports)
 		return -1;
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 7a4d66db95ae..9127a3d8aa35 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -16,7 +16,6 @@
 #include <linux/ip_vs.h>
 #include <linux/types.h>
 #include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_ipvs.h>
 #include <net/netfilter/nf_conntrack.h>
 
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 1ca89908cbad..2dbd4c857735 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -14,6 +14,7 @@
 #include <linux/skbuff.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/icmp.h>
@@ -21,6 +22,7 @@
 #include <net/inet_sock.h>
 #include <net/netfilter/nf_tproxy_core.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
 #include <linux/netfilter/xt_socket.h>
 
@@ -30,7 +32,7 @@
 #endif
 
 static int
-extract_icmp_fields(const struct sk_buff *skb,
+extract_icmp4_fields(const struct sk_buff *skb,
 		    u8 *protocol,
 		    __be32 *raddr,
 		    __be32 *laddr,
@@ -86,7 +88,6 @@ extract_icmp_fields(const struct sk_buff *skb,
 	return 0;
 }
 
-
 static bool
 socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 	     const struct xt_socket_mtinfo1 *info)
@@ -115,7 +116,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 		dport = hp->dest;
 
 	} else if (iph->protocol == IPPROTO_ICMP) {
-		if (extract_icmp_fields(skb, &protocol, &saddr, &daddr,
+		if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
 					&sport, &dport))
 			return false;
 	} else {
@@ -142,7 +143,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 #endif
 
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
-				   saddr, daddr, sport, dport, par->in, false);
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
 	if (sk != NULL) {
 		bool wildcard;
 		bool transparent = true;
@@ -165,32 +166,157 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 			sk = NULL;
 	}
 
-	pr_debug("proto %u %08x:%u -> %08x:%u (orig %08x:%u) sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport),
-		 ntohl(daddr), ntohs(dport),
-		 ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk);
+	pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
+		 protocol, &saddr, ntohs(sport),
+		 &daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
 
 	return (sk != NULL);
 }
 
 static bool
-socket_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	return socket_match(skb, par, NULL);
 }
 
 static bool
-socket_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	return socket_match(skb, par, par->matchinfo);
 }
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+		     unsigned int outside_hdrlen,
+		     u8 *protocol,
+		     struct in6_addr **raddr,
+		     struct in6_addr **laddr,
+		     __be16 *rport,
+		     __be16 *lport)
+{
+	struct ipv6hdr *inside_iph, _inside_iph;
+	struct icmp6hdr *icmph, _icmph;
+	__be16 *ports, _ports[2];
+	u8 inside_nexthdr;
+	int inside_hdrlen;
+
+	icmph = skb_header_pointer(skb, outside_hdrlen,
+				   sizeof(_icmph), &_icmph);
+	if (icmph == NULL)
+		return 1;
+
+	if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+		return 1;
+
+	inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph);
+	if (inside_iph == NULL)
+		return 1;
+	inside_nexthdr = inside_iph->nexthdr;
+
+	inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr);
+	if (inside_hdrlen < 0)
+		return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+	if (inside_nexthdr != IPPROTO_TCP &&
+	    inside_nexthdr != IPPROTO_UDP)
+		return 1;
+
+	ports = skb_header_pointer(skb, inside_hdrlen,
+				   sizeof(_ports), &_ports);
+	if (ports == NULL)
+		return 1;
+
+	/* the inside IP packet is the one quoted from our side, thus
+	 * its saddr is the local address */
+	*protocol = inside_nexthdr;
+	*laddr = &inside_iph->saddr;
+	*lport = ports[0];
+	*raddr = &inside_iph->daddr;
+	*rport = ports[1];
+
+	return 0;
+}
+
+static bool
+socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct udphdr _hdr, *hp = NULL;
+	struct sock *sk;
+	struct in6_addr *daddr, *saddr;
+	__be16 dport, sport;
+	int thoff;
+	u8 tproto;
+	const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+		hp = skb_header_pointer(skb, thoff,
+					sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return false;
+
+		saddr = &iph->saddr;
+		sport = hp->source;
+		daddr = &iph->daddr;
+		dport = hp->dest;
+
+	} else if (tproto == IPPROTO_ICMPV6) {
+		if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+					 &sport, &dport))
+			return false;
+	} else {
+		return false;
+	}
+
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
+	if (sk != NULL) {
+		bool wildcard;
+		bool transparent = true;
+
+		/* Ignore sockets listening on INADDR_ANY */
+		wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+			    ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
+
+		/* Ignore non-transparent sockets,
+		   if XT_SOCKET_TRANSPARENT is used */
+		if (info && info->flags & XT_SOCKET_TRANSPARENT)
+			transparent = ((sk->sk_state != TCP_TIME_WAIT &&
+					inet_sk(sk)->transparent) ||
+				       (sk->sk_state == TCP_TIME_WAIT &&
+					inet_twsk(sk)->tw_transparent));
+
+		nf_tproxy_put_sock(sk);
+
+		if (wildcard || !transparent)
+			sk = NULL;
+	}
+
+	pr_debug("proto %hhu %pI6:%hu -> %pI6:%hu "
+		 "(orig %pI6:%hu) sock %p\n",
+		 tproto, saddr, ntohs(sport),
+		 daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+
+	return (sk != NULL);
+}
+#endif
+
 static struct xt_match socket_mt_reg[] __read_mostly = {
 	{
 		.name		= "socket",
 		.revision	= 0,
 		.family		= NFPROTO_IPV4,
-		.match		= socket_mt_v0,
+		.match		= socket_mt4_v0,
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
@@ -199,17 +325,33 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.name		= "socket",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
-		.match		= socket_mt_v1,
+		.match		= socket_mt4_v1,
 		.matchsize	= sizeof(struct xt_socket_mtinfo1),
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
 	},
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	{
+		.name		= "socket",
+		.revision	= 1,
+		.family		= NFPROTO_IPV6,
+		.match		= socket_mt6_v1,
+		.matchsize	= sizeof(struct xt_socket_mtinfo1),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+#endif
 };
 
 static int __init socket_mt_init(void)
 {
 	nf_defrag_ipv4_enable();
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	nf_defrag_ipv6_enable();
+#endif
+
 	return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
 }
 
@@ -225,3 +367,4 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
 MODULE_DESCRIPTION("x_tables socket match module");
 MODULE_ALIAS("ipt_socket");
+MODULE_ALIAS("ip6t_socket");
author	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-23 11:47:02 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-23 11:47:02 -0700
commit	5f05647dd81c11a6a165ccc8f0c1370b16f3bcb0 (patch)
tree	7851ef1c93aa1aba7ef327ca4b75fd35e6d10f29 /net/netfilter
parent	02f36038c568111ad4fc433f6fa760ff5e38fab4 (diff)
parent	ec37a48d1d16c30b655ac5280209edf52a6775d4 (diff)
download	talos-op-linux-5f05647dd81c11a6a165ccc8f0c1370b16f3bcb0.tar.gz talos-op-linux-5f05647dd81c11a6a165ccc8f0c1370b16f3bcb0.zip