56 files changed, 3393 insertions, 1349 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e40f75322377..da33393be45f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -414,6 +414,24 @@ config INET_TUNNEL
 	tristate
 	default n
 
+config INET_XFRM_MODE_TRANSPORT
+	tristate "IP: IPsec transport mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec transport mode.
+
+	  If unsure, say Y.
+
+config INET_XFRM_MODE_TUNNEL
+	tristate "IP: IPsec tunnel mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec tunnel mode.
+
+	  If unsure, say Y.
+
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
@@ -532,6 +550,38 @@ config TCP_CONG_SCALABLE
 	properties, though is known to have fairness issues.
 	See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
 
+config TCP_CONG_LP
+	tristate "TCP Low Priority"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+	to utiliza only the excess network bandwidth as compared to the
+	``fair share`` of bandwidth as targeted by TCP.
+	See http://www-ece.rice.edu/networks/TCP-LP/
+
+config TCP_CONG_VENO
+	tristate "TCP Veno"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP Veno is a sender-side only enhancement of TCP to obtain better
+	throughput over wireless networks. TCP Veno makes use of state
+	distinguishing to circumvent the difficult judgment of the packet loss
+	type. TCP Veno cuts down less congestion window in response to random
+	loss packets.
+	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+
+config TCP_CONG_COMPOUND
+	tristate "TCP Compound"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP Compound is a sender-side only change to TCP that uses
+	a mixed Reno/Vegas approach to calculate the cwnd.
+	For further details look here:
+	  ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf
+
 endmenu
 
 config TCP_CONG_BIC
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9ef50a0b9d2c..38b8039bdd55 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
+obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
@@ -34,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o 
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
@@ -41,7 +44,10 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
+obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e2e4771fa4c6..c7782230080d 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -119,6 +119,7 @@ error:
 static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int ah_hlen;
+	int ihl;
 	struct iphdr *iph;
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
@@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	ah = (struct ip_auth_hdr*)skb->data;
 	iph = skb->nh.iph;
 
-	memcpy(work_buf, iph, iph->ihl*4);
+	ihl = skb->data - skb->nh.raw;
+	memcpy(work_buf, iph, ihl);
 
 	iph->ttl = 0;
 	iph->tos = 0;
 	iph->frag_off = 0;
 	iph->check = 0;
-	if (iph->ihl != 5) {
+	if (ihl > sizeof(*iph)) {
 		u32 dummy;
 		if (ip_clear_mutable_options(iph, &dummy))
 			goto out;
@@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 		u8 auth_data[MAX_AH_AUTH_LEN];
 		
 		memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
-		skb_push(skb, skb->data - skb->nh.raw);
+		skb_push(skb, ihl);
 		ahp->icv(ahp, skb, ah->auth_data);
 		if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
 			x->stats.integrity_failed++;
@@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 		}
 	}
 	((struct iphdr*)work_buf)->protocol = ah->nexthdr;
-	skb->nh.raw = skb_pull(skb, ah_hlen);
-	memcpy(skb->nh.raw, work_buf, iph->ihl*4);
-	skb->nh.iph->tot_len = htons(skb->len);
-	skb_pull(skb, skb->nh.iph->ihl*4);
-	skb->h.raw = skb->data;
+	skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl);
+	__skb_pull(skb, ah_hlen + ihl);
 
 	return 0;
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 9d1881c07a32..9bbdd4494551 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	int alen = esp->auth.icv_trunc_len;
 	int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
 	int nfrags;
-	int encap_len = 0;
+	int ihl;
 	u8 nexthdr[2];
 	struct scatterlist *sg;
-	u8 workbuf[60];
 	int padlen;
 
 	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
@@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	skb->ip_summed = CHECKSUM_NONE;
 
 	esph = (struct ip_esp_hdr*)skb->data;
-	iph = skb->nh.iph;
 
 	/* Get ivec. This can be wrong, check against another impls. */
 	if (esp->conf.ivlen)
@@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* ... check padding bits here. Silly. :-) */ 
 
+	iph = skb->nh.iph;
+	ihl = iph->ihl * 4;
+
 	if (x->encap) {
 		struct xfrm_encap_tmpl *encap = x->encap;
-		struct udphdr *uh;
-
-		uh = (struct udphdr *)(iph + 1);
-		encap_len = (void*)esph - (void*)uh;
+		struct udphdr *uh = (void *)(skb->nh.raw + ihl);
 
 		/*
 		 * 1) if the NAT-T peer's IP or port changed then
@@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	iph->protocol = nexthdr[1];
 	pskb_trim(skb, skb->len - alen - padlen - 2);
-	memcpy(workbuf, skb->nh.raw, iph->ihl*4);
-	skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
-	skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
-	memcpy(skb->nh.raw, workbuf, iph->ihl*4);
-	skb->nh.iph->tot_len = htons(skb->len);
+	skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl;
 
 	return 0;
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cdde96390960..31387abf53a2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -666,3 +666,4 @@ void __init ip_fib_init(void)
 }
 
 EXPORT_SYMBOL(inet_addr_type);
+EXPORT_SYMBOL(ip_dev_find);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2a0455911ee0..017900172f7d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -730,7 +730,6 @@ out_err:
 static void icmp_redirect(struct sk_buff *skb)
 {
 	struct iphdr *iph;
-	unsigned long ip;
 
 	if (skb->len < sizeof(struct iphdr))
 		goto out_err;
@@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb)
 		goto out;
 
 	iph = (struct iphdr *)skb->data;
-	ip = iph->daddr;
 
 	switch (skb->h.icmph->code & 7) {
 	case ICMP_REDIR_NET:
@@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb)
 		 */
 	case ICMP_REDIR_HOST:
 	case ICMP_REDIR_HOSTTOS:
-		ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway,
+		ip_rt_redirect(skb->nh.iph->saddr, iph->daddr,
+			       skb->h.icmph->un.gateway,
 			       iph->saddr, skb->dev);
 		break;
   	}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d512239a1473..ab680c851aa2 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 		}
 
 		seq_printf(seq,
-			   "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
+			   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
 			   im->multiaddr, im->users,
 			   im->tm_running, im->tm_running ?
 			   jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 0923add122b4..9f0bb529ab70 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -116,6 +116,7 @@ sr_failed:
 
 too_many_hops:
         /* Tell the sender its packet died... */
+        IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
         icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
 drop:
 	kfree_skb(skb);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cff9c3a72daf..8538aac3d148 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	nf_bridge_get(to->nf_bridge);
 #endif
 #endif
+	skb_copy_secmark(to, from);
 }
 
 /*
@@ -839,7 +840,7 @@ int ip_append_data(struct sock *sk,
 	 */
 	if (transhdrlen &&
 	    length + fragheaderlen <= mtu &&
-	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
+	    rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
 	    !exthdrlen)
 		csummode = CHECKSUM_HW;
 
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 95278b22b669..3ed8b57a1002 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list);
 static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int err, plen, dlen;
-	struct iphdr *iph;
 	struct ipcomp_data *ipcd = x->data;
 	u8 *start, *scratch;
 	struct crypto_tfm *tfm;
@@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
 		
 	skb_put(skb, dlen - plen);
 	memcpy(skb->data, scratch, dlen);
-	iph = skb->nh.iph;
-	iph->tot_len = htons(dlen + iph->ihl * 4);
 out:	
 	put_cpu();
 	return err;
@@ -83,34 +80,21 @@ out:
 
 static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-	u8 nexthdr;
-	int err = 0;
+	int err = -ENOMEM;
 	struct iphdr *iph;
-	union {
-		struct iphdr	iph;
-		char 		buf[60];
-	} tmp_iph;
-
+	struct ip_comp_hdr *ipch;
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-	    skb_linearize(skb, GFP_ATOMIC) != 0) {
-	    	err = -ENOMEM;
+	if (skb_linearize_cow(skb))
 	    	goto out;
-	}
 
 	skb->ip_summed = CHECKSUM_NONE;
 
 	/* Remove ipcomp header and decompress original payload */	
 	iph = skb->nh.iph;
-	memcpy(&tmp_iph, iph, iph->ihl * 4);
-	nexthdr = *(u8 *)skb->data;
-	skb_pull(skb, sizeof(struct ip_comp_hdr));
-	skb->nh.raw += sizeof(struct ip_comp_hdr);
-	memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
-	iph = skb->nh.iph;
-	iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
-	iph->protocol = nexthdr;
-	skb->h.raw = skb->data;
+	ipch = (void *)skb->data;
+	iph->protocol = ipch->nexthdr;
+	skb->h.raw = skb->nh.raw + sizeof(*ipch);
+	__skb_pull(skb, sizeof(*ipch));
 	err = ipcomp_decompress(x, skb);
 
 out:	
@@ -171,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto out_ok;
 	}
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-	    skb_linearize(skb, GFP_ATOMIC) != 0) {
+	if (skb_linearize_cow(skb))
 		goto out_ok;
-	}
 	
 	err = ipcomp_compress(x, skb);
 	iph = skb->nh.iph;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 3d560dec63ab..e1d7f5fbc526 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK
 	  of packets, but this mark value is kept in the conntrack session
 	  instead of the individual packets.
 	
+config IP_NF_CONNTRACK_SECMARK
+	bool  'Connection tracking security mark support'
+	depends on IP_NF_CONNTRACK && NETWORK_SECMARK
+	help
+	  This option enables security markings to be applied to
+	  connections.  Typically they are copied to connections from
+	  packets using the CONNSECMARK target and copied back from
+	  connections to packets with the same target, with the packets
+	  being originally labeled via SECMARK.
+
+	  If unsure, say 'N'.
+
 config IP_NF_CONNTRACK_EVENTS
 	bool "Connection tracking events (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && IP_NF_CONNTRACK
@@ -142,6 +154,8 @@ config IP_NF_TFTP
 config IP_NF_AMANDA
 	tristate "Amanda backup protocol support"
 	depends on IP_NF_CONNTRACK
+	select TEXTSEARCH
+	select TEXTSEARCH_KMP
 	help
 	  If you are running the Amanda backup package <http://www.amanda.org/>
 	  on this machine or machines that will be MASQUERADED through this
@@ -170,8 +184,8 @@ config IP_NF_PPTP
 	  Documentation/modules.txt.  If unsure, say `N'.
 
 config IP_NF_H323
-	tristate  'H.323 protocol support'
-	depends on IP_NF_CONNTRACK
+	tristate  'H.323 protocol support (EXPERIMENTAL)'
+	depends on IP_NF_CONNTRACK && EXPERIMENTAL
 	help
 	  H.323 is a VoIP signalling protocol from ITU-T. As one of the most
 	  important VoIP protocols, it is widely used by voice hardware and
@@ -181,14 +195,26 @@ config IP_NF_H323
 	  With this module you can support H.323 on a connection tracking/NAT
 	  firewall.
 
-	  This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP
-	  and T.120 based data and applications including audio, video, FAX,
-	  chat, whiteboard, file transfer, etc. For more information, please
-	  see http://nath323.sourceforge.net/.
+	  This module supports RAS, Fast Start, H.245 Tunnelling, Call
+	  Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
+	  whiteboard, file transfer, etc. For more information, please
+	  visit http://nath323.sourceforge.net/.
 
 	  If you want to compile it as a module, say 'M' here and read
 	  Documentation/modules.txt.  If unsure, say 'N'.
 
+config IP_NF_SIP
+	tristate "SIP protocol support (EXPERIMENTAL)"
+	depends on IP_NF_CONNTRACK && EXPERIMENTAL
+	help
+	  SIP is an application-layer control protocol that can establish,
+	  modify, and terminate multimedia sessions (conferences) such as
+	  Internet telephony calls. With the ip_conntrack_sip and
+	  the ip_nat_sip modules you can support the protocol on a connection
+	  tracking/NATing firewall.
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
 config IP_NF_QUEUE
 	tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
 	help
@@ -501,6 +527,12 @@ config IP_NF_NAT_H323
 	default IP_NF_NAT if IP_NF_H323=y
 	default m if IP_NF_H323=m
 
+config IP_NF_NAT_SIP
+	tristate
+	depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+	default IP_NF_NAT if IP_NF_SIP=y
+	default m if IP_NF_SIP=m
+
 # mangle + specific targets
 config IP_NF_MANGLE
 	tristate "Packet mangling"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 461cb1eb5de7..3ded4a3af59c 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
 obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
 obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
 obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o
 obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
 
 # NAT helpers 
@@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
 obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
 obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
 obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index a604b1ccfdaa..0a7bd7f04061 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -17,33 +17,29 @@
  *	this value.
  *
  */
-
-#include <linux/in.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
 #include <linux/moduleparam.h>
+#include <linux/textsearch.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/udp.h>
-#include <net/checksum.h>
-#include <net/udp.h>
 
+#include <linux/netfilter.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
 
 static unsigned int master_timeout = 300;
+static char *ts_algo = "kmp";
 
 MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
 MODULE_DESCRIPTION("Amanda connection tracking module");
 MODULE_LICENSE("GPL");
 module_param(master_timeout, uint, 0600);
 MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
-
-static const char *conns[] = { "DATA ", "MESG ", "INDEX " };
-
-/* This is slow, but it's simple. --RR */
-static char *amanda_buffer;
-static DEFINE_SPINLOCK(amanda_buffer_lock);
+module_param(ts_algo, charp, 0400);
+MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
 
 unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
 				   enum ip_conntrack_info ctinfo,
@@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
 				   struct ip_conntrack_expect *exp);
 EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
 
+enum amanda_strings {
+	SEARCH_CONNECT,
+	SEARCH_NEWLINE,
+	SEARCH_DATA,
+	SEARCH_MESG,
+	SEARCH_INDEX,
+};
+
+static struct {
+	char			*string;
+	size_t			len;
+	struct ts_config	*ts;
+} search[] = {
+	[SEARCH_CONNECT] = {
+		.string	= "CONNECT ",
+		.len	= 8,
+	},
+	[SEARCH_NEWLINE] = {
+		.string	= "\n",
+		.len	= 1,
+	},
+	[SEARCH_DATA] = {
+		.string	= "DATA ",
+		.len	= 5,
+	},
+	[SEARCH_MESG] = {
+		.string	= "MESG ",
+		.len	= 5,
+	},
+	[SEARCH_INDEX] = {
+		.string = "INDEX ",
+		.len	= 6,
+	},
+};
+
 static int help(struct sk_buff **pskb,
                 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
 {
+	struct ts_state ts;
 	struct ip_conntrack_expect *exp;
-	char *data, *data_limit, *tmp;
-	unsigned int dataoff, i;
+	unsigned int dataoff, start, stop, off, i;
+	char pbuf[sizeof("65535")], *tmp;
 	u_int16_t port, len;
 	int ret = NF_ACCEPT;
 
@@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb,
 		return NF_ACCEPT;
 	}
 
-	spin_lock_bh(&amanda_buffer_lock);
-	skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
-	data = amanda_buffer;
-	data_limit = amanda_buffer + (*pskb)->len - dataoff;
-	*data_limit = '\0';
-
-	/* Search for the CONNECT string */
-	data = strstr(data, "CONNECT ");
-	if (!data)
+	memset(&ts, 0, sizeof(ts));
+	start = skb_find_text(*pskb, dataoff, (*pskb)->len,
+			      search[SEARCH_CONNECT].ts, &ts);
+	if (start == UINT_MAX)
 		goto out;
-	data += strlen("CONNECT ");
+	start += dataoff + search[SEARCH_CONNECT].len;
 
-	/* Only search first line. */	
-	if ((tmp = strchr(data, '\n')))
-		*tmp = '\0';
+	memset(&ts, 0, sizeof(ts));
+	stop = skb_find_text(*pskb, start, (*pskb)->len,
+			     search[SEARCH_NEWLINE].ts, &ts);
+	if (stop == UINT_MAX)
+		goto out;
+	stop += start;
 
-	for (i = 0; i < ARRAY_SIZE(conns); i++) {
-		char *match = strstr(data, conns[i]);
-		if (!match)
+	for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
+		memset(&ts, 0, sizeof(ts));
+		off = skb_find_text(*pskb, start, stop, search[i].ts, &ts);
+		if (off == UINT_MAX)
 			continue;
-		tmp = data = match + strlen(conns[i]);
-		port = simple_strtoul(data, &data, 10);
-		len = data - tmp;
+		off += start + search[i].len;
+
+		len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off);
+		if (skb_copy_bits(*pskb, off, pbuf, len))
+			break;
+		pbuf[len] = '\0';
+
+		port = simple_strtoul(pbuf, &tmp, 10);
+		len = tmp - pbuf;
 		if (port == 0 || len > 5)
 			break;
 
@@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb,
 		exp->mask.dst.u.tcp.port = 0xFFFF;
 
 		if (ip_nat_amanda_hook)
-			ret = ip_nat_amanda_hook(pskb, ctinfo,
-						 tmp - amanda_buffer,
+			ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff,
 						 len, exp);
 		else if (ip_conntrack_expect_related(exp) != 0)
 			ret = NF_DROP;
@@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb,
 	}
 
 out:
-	spin_unlock_bh(&amanda_buffer_lock);
 	return ret;
 }
 
 static struct ip_conntrack_helper amanda_helper = {
-	.max_expected = ARRAY_SIZE(conns),
+	.max_expected = 3,
 	.timeout = 180,
 	.me = THIS_MODULE,
 	.help = help,
@@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = {
 
 static void __exit ip_conntrack_amanda_fini(void)
 {
+	int i;
+
 	ip_conntrack_helper_unregister(&amanda_helper);
-	kfree(amanda_buffer);
+	for (i = 0; i < ARRAY_SIZE(search); i++)
+		textsearch_destroy(search[i].ts);
 }
 
 static int __init ip_conntrack_amanda_init(void)
 {
-	int ret;
-
-	amanda_buffer = kmalloc(65536, GFP_KERNEL);
-	if (!amanda_buffer)
-		return -ENOMEM;
-
-	ret = ip_conntrack_helper_register(&amanda_helper);
-	if (ret < 0) {
-		kfree(amanda_buffer);
-		return ret;
+	int ret, i;
+
+	ret = -ENOMEM;
+	for (i = 0; i < ARRAY_SIZE(search); i++) {
+		search[i].ts = textsearch_prepare(ts_algo, search[i].string,
+						  search[i].len,
+						  GFP_KERNEL, TS_AUTOLOAD);
+		if (search[i].ts == NULL)
+			goto err;
 	}
+	ret = ip_conntrack_helper_register(&amanda_helper);
+	if (ret < 0)
+		goto err;
 	return 0;
 
-
+err:
+	for (; i >= 0; i--) {
+		if (search[i].ts)
+			textsearch_destroy(search[i].ts);
+	}
+	return ret;
 }
 
 module_init(ip_conntrack_amanda_init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 979a2eac6f00..7e4cf9a4d15f 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -724,6 +724,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple,
 		/* this is ugly, but there is no other place where to put it */
 		conntrack->nat.masq_index = exp->master->nat.masq_index;
 #endif
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+		conntrack->secmark = exp->master->secmark;
+#endif
 		nf_conntrack_get(&conntrack->master->ct_general);
 		CONNTRACK_STAT_INC(expect_new);
 	} else {
@@ -1130,6 +1133,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
 
 	write_lock_bh(&ip_conntrack_lock);
 
+	/* Only update if this is not a fixed timeout */
+	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+		write_unlock_bh(&ip_conntrack_lock);
+		return;
+	}
+
 	/* If not in hash table, timer will not be active yet */
 	if (!is_confirmed(ct)) {
 		ct->timeout.expires = extra_jiffies;
@@ -1318,6 +1327,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 			.tuple.dst.u.tcp.port;
 		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
 			.tuple.dst.ip;
+		memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
 
 		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
 		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 3e542bf28a9d..4dcf526c3944 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char);
 static int try_epsv_response(const char *, size_t, u_int32_t [], char);
 
 static const struct ftp_search {
-	enum ip_conntrack_dir dir;
 	const char *pattern;
 	size_t plen;
 	char skip;
 	char term;
 	enum ip_ct_ftp_type ftptype;
 	int (*getnum)(const char *, size_t, u_int32_t[], char);
-} search[] = {
-	{
-		IP_CT_DIR_ORIGINAL,
-		"PORT",	sizeof("PORT") - 1, ' ', '\r',
-		IP_CT_FTP_PORT,
-		try_rfc959,
+} search[IP_CT_DIR_MAX][2] = {
+	[IP_CT_DIR_ORIGINAL] = {
+		{
+			.pattern	=  "PORT",
+			.plen		= sizeof("PORT") - 1,
+			.skip		= ' ',
+			.term		= '\r',
+			.ftptype	= IP_CT_FTP_PORT,
+			.getnum		= try_rfc959,
+		},
+		{
+			.pattern	= "EPRT",
+			.plen		= sizeof("EPRT") - 1,
+			.skip		= ' ',
+			.term		= '\r',
+			.ftptype	= IP_CT_FTP_EPRT,
+			.getnum		= try_eprt,
+		},
 	},
-	{
-		IP_CT_DIR_REPLY,
-		"227 ",	sizeof("227 ") - 1, '(', ')',
-		IP_CT_FTP_PASV,
-		try_rfc959,
-	},
-	{
-		IP_CT_DIR_ORIGINAL,
-		"EPRT", sizeof("EPRT") - 1, ' ', '\r',
-		IP_CT_FTP_EPRT,
-		try_eprt,
-	},
-	{
-		IP_CT_DIR_REPLY,
-		"229 ", sizeof("229 ") - 1, '(', ')',
-		IP_CT_FTP_EPSV,
-		try_epsv_response,
+	[IP_CT_DIR_REPLY] = {
+		{
+			.pattern	= "227 ",
+			.plen		= sizeof("227 ") - 1,
+			.skip		= '(',
+			.term		= ')',
+			.ftptype	= IP_CT_FTP_PASV,
+			.getnum		= try_rfc959,
+		},
+		{
+			.pattern	= "229 ",
+			.plen		= sizeof("229 ") - 1,
+			.skip		= '(',
+			.term		= ')',
+			.ftptype	= IP_CT_FTP_EPSV,
+			.getnum		= try_epsv_response,
+		},
 	},
 };
 
@@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb,
 	array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
 	array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
 
-	for (i = 0; i < ARRAY_SIZE(search); i++) {
-		if (search[i].dir != dir) continue;
-
+	for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
 		found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
-				     search[i].pattern,
-				     search[i].plen,
-				     search[i].skip,
-				     search[i].term,
+				     search[dir][i].pattern,
+				     search[dir][i].plen,
+				     search[dir][i].skip,
+				     search[dir][i].term,
 				     &matchoff, &matchlen,
 				     array,
-				     search[i].getnum);
+				     search[dir][i].getnum);
 		if (found) break;
 	}
 	if (found == -1) {
@@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb,
 		   this case. */
 		if (net_ratelimit())
 			printk("conntrack_ftp: partial %s %u+%u\n",
-			       search[i].pattern,
+			       search[dir][i].pattern,
 			       ntohl(th->seq), datalen);
 		ret = NF_DROP;
 		goto out;
@@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb,
 	/* Now, NAT might want to mangle the packet, and register the
 	 * (possibly changed) expectation itself. */
 	if (ip_nat_ftp_hook)
-		ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
+		ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype,
 				      matchoff, matchlen, exp, &seq);
 	else {
 		/* Can't expect this?  Best to drop packet now. */
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
index 518f581d39ec..0665674218c6 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
@@ -22,6 +22,8 @@
 #include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
 #include <linux/netfilter_ipv4/ip_conntrack_h323.h>
 #include <linux/moduleparam.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
 
 #if 0
 #define DEBUGP printk
@@ -38,6 +40,12 @@ static int gkrouted_only = 1;
 module_param(gkrouted_only, int, 0600);
 MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
 
+static int callforward_filter = 1;
+module_param(callforward_filter, bool, 0600);
+MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
+		                     "if both endpoints are on different sides "
+				     "(determined by routing information)");
+
 /* Hooks for NAT */
 int (*set_h245_addr_hook) (struct sk_buff ** pskb,
 			   unsigned char **data, int dataoff,
@@ -77,6 +85,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb,
 		      unsigned char **data, int dataoff,
 		      TransportAddress * addr, u_int16_t port,
 		      struct ip_conntrack_expect * exp);
+int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
+				struct ip_conntrack * ct,
+				enum ip_conntrack_info ctinfo,
+				unsigned char **data, int dataoff,
+				TransportAddress * addr, u_int16_t port,
+				struct ip_conntrack_expect * exp);
 int (*nat_q931_hook) (struct sk_buff ** pskb,
 		      struct ip_conntrack * ct,
 		      enum ip_conntrack_info ctinfo,
@@ -683,6 +697,92 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
 	return ret;
 }
 
+/* Forwarding declaration */
+void ip_conntrack_q931_expect(struct ip_conntrack *new,
+			      struct ip_conntrack_expect *this);
+
+/****************************************************************************/
+static int expect_callforwarding(struct sk_buff **pskb,
+				 struct ip_conntrack *ct,
+				 enum ip_conntrack_info ctinfo,
+				 unsigned char **data, int dataoff,
+				 TransportAddress * addr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	u_int32_t ip;
+	u_int16_t port;
+	struct ip_conntrack_expect *exp = NULL;
+
+	/* Read alternativeAddress */
+	if (!get_h225_addr(*data, addr, &ip, &port) || port == 0)
+		return 0;
+
+	/* If the calling party is on the same side of the forward-to party,
+	 * we don't need to track the second call */
+	if (callforward_filter) {
+		struct rtable *rt1, *rt2;
+		struct flowi fl1 = {
+			.fl4_dst = ip,
+		};
+		struct flowi fl2 = {
+			.fl4_dst = ct->tuplehash[!dir].tuple.src.ip,
+		};
+
+		if (ip_route_output_key(&rt1, &fl1) == 0) {
+			if (ip_route_output_key(&rt2, &fl2) == 0) {
+				if (rt1->rt_gateway == rt2->rt_gateway &&
+				    rt1->u.dst.dev  == rt2->u.dst.dev)
+					ret = 1;
+				dst_release(&rt2->u.dst);
+			}
+			dst_release(&rt1->u.dst);
+		}
+		if (ret) {
+			DEBUGP("ip_ct_q931: Call Forwarding not tracked\n");
+			return 0;
+		}
+	}
+
+	/* Create expect for the second call leg */
+	if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.tcp.port = 0;
+	exp->tuple.dst.ip = ip;
+	exp->tuple.dst.u.tcp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_TCP;
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.tcp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.tcp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+	exp->flags = 0;
+
+	if (ct->tuplehash[dir].tuple.src.ip !=
+	    ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) {
+		/* Need NAT */
+		ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff,
+					      addr, port, exp);
+	} else {		/* Conntrack only */
+		exp->expectfn = ip_conntrack_q931_expect;
+
+		if (ip_conntrack_expect_related(exp) == 0) {
+			DEBUGP("ip_ct_q931: expect Call Forwarding "
+			       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+			       NIPQUAD(exp->tuple.src.ip),
+			       ntohs(exp->tuple.src.u.tcp.port),
+			       NIPQUAD(exp->tuple.dst.ip),
+			       ntohs(exp->tuple.dst.u.tcp.port));
+		} else
+			ret = -1;
+	}
+
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
 /****************************************************************************/
 static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct,
 			 enum ip_conntrack_info ctinfo,
@@ -878,6 +978,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct,
 
 	DEBUGP("ip_ct_q931: Facility\n");
 
+	if (facility->reason.choice == eFacilityReason_callForwarded) {
+		if (facility->options & eFacility_UUIE_alternativeAddress)
+			return expect_callforwarding(pskb, ct, ctinfo, data,
+						     dataoff,
+						     &facility->
+						     alternativeAddress);
+		return 0;
+	}
+
 	if (facility->options & eFacility_UUIE_h245Address) {
 		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
 				  &facility->h245Address);
@@ -1677,7 +1786,6 @@ static int __init init(void)
 		fini();
 		return ret;
 	}
-
 	DEBUGP("ip_ct_h323: init success\n");
 	return 0;
 }
@@ -1696,6 +1804,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook);
 EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
 EXPORT_SYMBOL_GPL(nat_t120_hook);
 EXPORT_SYMBOL_GPL(nat_h245_hook);
+EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
 EXPORT_SYMBOL_GPL(nat_q931_hook);
 
 MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
index 022c47b9f6c9..4b359618bedd 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
@@ -1,4 +1,4 @@
-/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006
+/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006
  *
  * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
  *
@@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = {	/* SEQUENCE OF */
 
 static field_t _Facility_UUIE[] = {	/* SEQUENCE */
 	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
-	{FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0,
-	 _TransportAddress},
+	{FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Facility_UUIE, alternativeAddress), _TransportAddress},
 	{FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
 	 _Facility_UUIE_alternativeAliasAddress},
 	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
index 7d3ba4302e9e..8ccfe17bb253 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -469,8 +469,8 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 			DEBUGP("%s but no session\n", pptp_msg_name[msg]);
 			break;
 		}
-		if (info->sstate != PPTP_CALL_IN_REP
-		    && info->sstate != PPTP_CALL_IN_CONF) {
+		if (info->cstate != PPTP_CALL_IN_REP
+		    && info->cstate != PPTP_CALL_IN_CONF) {
 			DEBUGP("%s but never sent IN_CALL_REPLY\n",
 				pptp_msg_name[msg]);
 			break;
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 01bd7cab9367..33891bb1fde4 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -399,38 +399,54 @@ nfattr_failure:
 static int ctnetlink_done(struct netlink_callback *cb)
 {
 	DEBUGP("entered %s\n", __FUNCTION__);
+	if (cb->args[1])
+		ip_conntrack_put((struct ip_conntrack *)cb->args[1]);
 	return 0;
 }
 
 static int
 ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct ip_conntrack *ct = NULL;
+	struct ip_conntrack *ct, *last;
 	struct ip_conntrack_tuple_hash *h;
 	struct list_head *i;
-	u_int32_t *id = (u_int32_t *) &cb->args[1];
 
 	DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, 
 			cb->args[0], *id);
 
 	read_lock_bh(&ip_conntrack_lock);
-	for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+	for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) {
+restart:
+		last = (struct ip_conntrack *)cb->args[1];
 		list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
 			h = (struct ip_conntrack_tuple_hash *) i;
 			if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
 				continue;
 			ct = tuplehash_to_ctrack(h);
-			if (ct->id <= *id)
-				continue;
+			if (last != NULL) {
+				if (ct == last) {
+					ip_conntrack_put(last);
+					cb->args[1] = 0;
+					last = NULL;
+				} else
+					continue;
+			}
 			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
 		                        	cb->nlh->nlmsg_seq,
 						IPCTNL_MSG_CT_NEW,
-						1, ct) < 0)
+						1, ct) < 0) {
+				nf_conntrack_get(&ct->ct_general);
+				cb->args[1] = (unsigned long)ct;
 				goto out;
-			*id = ct->id;
+			}
+		}
+		if (last != NULL) {
+			ip_conntrack_put(last);
+			cb->args[1] = 0;
+			goto restart;
 		}
 	}
-out:	
+out:
 	read_unlock_bh(&ip_conntrack_lock);
 
 	DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
@@ -629,7 +645,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
 };
 
 static inline int
-ctnetlink_parse_nat(struct nfattr *cda[],
+ctnetlink_parse_nat(struct nfattr *nat,
 		    const struct ip_conntrack *ct, struct ip_nat_range *range)
 {
 	struct nfattr *tb[CTA_NAT_MAX];
@@ -639,7 +655,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
 
 	memset(range, 0, sizeof(*range));
 	
-	nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+	nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
 
 	if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
 		return -EINVAL;
@@ -854,39 +870,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
 		/* ASSURED bit can only be set */
 		return -EINVAL;
 
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 #ifndef CONFIG_IP_NF_NAT_NEEDED
 		return -EINVAL;
 #else
-		unsigned int hooknum;
 		struct ip_nat_range range;
 
-		if (ctnetlink_parse_nat(cda, ct, &range) < 0)
-			return -EINVAL;
-
-		DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
-		       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
-		       htons(range.min.all), htons(range.max.all));
-		
-		/* This is tricky but it works. ip_nat_setup_info needs the
-		 * hook number as parameter, so let's do the correct 
-		 * conversion and run away */
-		if (status & IPS_SRC_NAT_DONE)
-			hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
-		else if (status & IPS_DST_NAT_DONE)
-			hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
-		else 
-			return -EINVAL; /* Missing NAT flags */
-
-		DEBUGP("NAT status: %lu\n", 
-		       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
-		
-		if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
-			return -EEXIST;
-		ip_nat_setup_info(ct, &range, hooknum);
-
-                DEBUGP("NAT status after setup_info: %lu\n",
-                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+		if (cda[CTA_NAT_DST-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_PRE_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
+		}
+		if (cda[CTA_NAT_SRC-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_POST_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+		}
 #endif
 	}
 
@@ -1106,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	/* implicit 'else' */
 
 	/* we only allow nat config for new conntracks */
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 56794797d55b..21ee124c0463 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km,
 }
 
 /* look up the source key for a given tuple */
-static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t)
+static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t)
 {
 	struct ip_ct_gre_keymap *km;
-	u_int32_t key = 0;
+	__be16 key = 0;
 
 	read_lock_bh(&ip_ct_gre_lock);
 	km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
@@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb,
 			   struct ip_conntrack_tuple *tuple)
 {
 	struct gre_hdr_pptp _pgrehdr, *pgrehdr;
-	u_int32_t srckey;
+	__be16 srckey;
 	struct gre_hdr _grehdr, *grehdr;
 
 	/* first only delinearize old RFC1701 GRE header */
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index d8b14a9010a6..23f1c504586d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) {
 		if (LOG_INVALID(IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 062b252b58ad..c5c2ce5cdeb8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb,
 	 * and moreover root might send raw packets.
 	 */
 	/* FIXME: Source route IP option packets --RR */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 70899868783b..9b2c16b4d2ff 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	 * because the semantic of CHECKSUM_HW is different there 
 	 * and moreover root might send raw packets.
 	 * FIXME: Source route IP option packets --RR */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
 		if (LOG_INVALID(IPPROTO_UDP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c
new file mode 100644
index 000000000000..fc87ce0da40d
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_sip.c
@@ -0,0 +1,471 @@
+/* SIP extension for IP connection tracking.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_conntrack_ftp.c and other modules.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP connection tracking helper");
+
+#define MAX_PORTS	8
+static unsigned short ports[MAX_PORTS];
+static int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of sip servers");
+
+static unsigned int sip_timeout = SIP_TIMEOUT;
+module_param(sip_timeout, uint, 0600);
+MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
+
+unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
+				enum ip_conntrack_info ctinfo,
+				struct ip_conntrack *ct,
+				const char **dptr);
+EXPORT_SYMBOL_GPL(ip_nat_sip_hook);
+
+unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
+				enum ip_conntrack_info ctinfo,
+				struct ip_conntrack_expect *exp,
+				const char *dptr);
+EXPORT_SYMBOL_GPL(ip_nat_sdp_hook);
+
+int ct_sip_get_info(const char *dptr, size_t dlen,
+				unsigned int *matchoff,
+				unsigned int *matchlen,
+				struct sip_header_nfo *hnfo);
+EXPORT_SYMBOL_GPL(ct_sip_get_info);
+
+
+static int digits_len(const char *dptr, const char *limit, int *shift);
+static int epaddr_len(const char *dptr, const char *limit, int *shift);
+static int skp_digits_len(const char *dptr, const char *limit, int *shift);
+static int skp_epaddr_len(const char *dptr, const char *limit, int *shift);
+
+struct sip_header_nfo ct_sip_hdrs[] = {
+	{ 	/* Via header */
+		.lname		= "Via:",
+		.lnlen		= sizeof("Via:") - 1,
+		.sname		= "\r\nv:",
+		.snlen		= sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */
+		.ln_str		= "UDP ",
+		.ln_strlen	= sizeof("UDP ") - 1,
+		.match_len	= epaddr_len,
+	},
+	{ 	/* Contact header */
+		.lname		= "Contact:",
+		.lnlen		= sizeof("Contact:") - 1,
+		.sname		= "\r\nm:",
+		.snlen		= sizeof("\r\nm:") - 1,
+		.ln_str		= "sip:",
+		.ln_strlen	= sizeof("sip:") - 1,
+		.match_len	= skp_epaddr_len
+	},
+	{ 	/* Content length header */
+		.lname		= "Content-Length:",
+		.lnlen		= sizeof("Content-Length:") - 1,
+		.sname		= "\r\nl:",
+		.snlen		= sizeof("\r\nl:") - 1,
+		.ln_str		= ":",
+		.ln_strlen	= sizeof(":") - 1,
+		.match_len	= skp_digits_len
+	},
+	{	/* SDP media info */
+		.lname		= "\nm=",
+		.lnlen		= sizeof("\nm=") - 1,
+		.sname		= "\rm=",
+		.snlen		= sizeof("\rm=") - 1,
+		.ln_str		= "audio ",
+		.ln_strlen	= sizeof("audio ") - 1,
+		.match_len	= digits_len
+	},
+	{ 	/* SDP owner address*/
+		.lname		= "\no=",
+		.lnlen		= sizeof("\no=") - 1,
+		.sname		= "\ro=",
+		.snlen		= sizeof("\ro=") - 1,
+		.ln_str		= "IN IP4 ",
+		.ln_strlen	= sizeof("IN IP4 ") - 1,
+		.match_len	= epaddr_len
+	},
+	{ 	/* SDP connection info */
+		.lname		= "\nc=",
+		.lnlen		= sizeof("\nc=") - 1,
+		.sname		= "\rc=",
+		.snlen		= sizeof("\rc=") - 1,
+		.ln_str		= "IN IP4 ",
+		.ln_strlen	= sizeof("IN IP4 ") - 1,
+		.match_len	= epaddr_len
+	},
+	{ 	/* Requests headers */
+		.lname		= "sip:",
+		.lnlen		= sizeof("sip:") - 1,
+		.sname		= "sip:",
+		.snlen		= sizeof("sip:") - 1, /* yes, i know.. ;) */
+		.ln_str		= "@",
+		.ln_strlen	= sizeof("@") - 1,
+		.match_len	= epaddr_len
+	},
+	{ 	/* SDP version header */
+		.lname		= "\nv=",
+		.lnlen		= sizeof("\nv=") - 1,
+		.sname		= "\rv=",
+		.snlen		= sizeof("\rv=") - 1,
+		.ln_str		= "=",
+		.ln_strlen	= sizeof("=") - 1,
+		.match_len	= digits_len
+	}
+};
+EXPORT_SYMBOL_GPL(ct_sip_hdrs);
+
+/* get line lenght until first CR or LF seen. */
+int ct_sip_lnlen(const char *line, const char *limit)
+{
+	const char *k = line;
+
+	while ((line <= limit) && (*line == '\r' || *line == '\n'))
+		line++;
+
+	while (line <= limit) {
+		if (*line == '\r' || *line == '\n')
+			break;
+		line++;
+	}
+	return line - k;
+}
+EXPORT_SYMBOL_GPL(ct_sip_lnlen);
+
+/* Linear string search, case sensitive. */
+const char *ct_sip_search(const char *needle, const char *haystack,
+                          size_t needle_len, size_t haystack_len)
+{
+	const char *limit = haystack + (haystack_len - needle_len);
+
+	while (haystack <= limit) {
+		if (memcmp(haystack, needle, needle_len) == 0)
+			return haystack;
+		haystack++;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ct_sip_search);
+
+static int digits_len(const char *dptr, const char *limit, int *shift)
+{
+	int len = 0;
+	while (dptr <= limit && isdigit(*dptr)) {
+		dptr++;
+		len++;
+	}
+	return len;
+}
+
+/* get digits lenght, skiping blank spaces. */
+static int skp_digits_len(const char *dptr, const char *limit, int *shift)
+{
+	for (; dptr <= limit && *dptr == ' '; dptr++)
+		(*shift)++;
+
+	return digits_len(dptr, limit, shift);
+}
+
+/* Simple ipaddr parser.. */
+static int parse_ipaddr(const char *cp,	const char **endp,
+			u_int32_t *ipaddr, const char *limit)
+{
+	unsigned long int val;
+	int i, digit = 0;
+
+	for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) {
+		digit = 0;
+		if (!isdigit(*cp))
+			break;
+
+		val = simple_strtoul(cp, (char **)&cp, 10);
+		if (val > 0xFF)
+			return -1;
+
+		((u_int8_t *)ipaddr)[i] = val;
+		digit = 1;
+
+		if (*cp != '.')
+			break;
+		cp++;
+	}
+	if (!digit)
+		return -1;
+
+	if (endp)
+		*endp = cp;
+
+	return 0;
+}
+
+/* skip ip address. returns it lenght. */
+static int epaddr_len(const char *dptr, const char *limit, int *shift)
+{
+	const char *aux = dptr;
+	u_int32_t ip;
+
+	if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) {
+		DEBUGP("ip: %s parse failed.!\n", dptr);
+		return 0;
+	}
+
+	/* Port number */
+	if (*dptr == ':') {
+		dptr++;
+		dptr += digits_len(dptr, limit, shift);
+	}
+	return dptr - aux;
+}
+
+/* get address length, skiping user info. */
+static int skp_epaddr_len(const char *dptr, const char *limit, int *shift)
+{
+	int s = *shift;
+
+	for (; dptr <= limit && *dptr != '@'; dptr++)
+		(*shift)++;
+
+	if (*dptr == '@') {
+		dptr++;
+		(*shift)++;
+	} else
+		*shift = s;
+
+	return epaddr_len(dptr, limit, shift);
+}
+
+/* Returns 0 if not found, -1 error parsing. */
+int ct_sip_get_info(const char *dptr, size_t dlen,
+		    unsigned int *matchoff,
+		    unsigned int *matchlen,
+		    struct sip_header_nfo *hnfo)
+{
+	const char *limit, *aux, *k = dptr;
+	int shift = 0;
+
+	limit = dptr + (dlen - hnfo->lnlen);
+
+	while (dptr <= limit) {
+		if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) &&
+		    (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) {
+			dptr++;
+			continue;
+		}
+		aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen,
+		                    ct_sip_lnlen(dptr, limit));
+		if (!aux) {
+			DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str,
+			       hnfo->lname);
+			return -1;
+		}
+		aux += hnfo->ln_strlen;
+
+		*matchlen = hnfo->match_len(aux, limit, &shift);
+		if (!*matchlen)
+			return -1;
+
+		*matchoff = (aux - k) + shift;
+
+		DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname,
+		       *matchlen);
+		return 1;
+	}
+	DEBUGP("%s header not found.\n", hnfo->lname);
+	return 0;
+}
+
+static int set_expected_rtp(struct sk_buff **pskb,
+			    struct ip_conntrack *ct,
+			    enum ip_conntrack_info ctinfo,
+			    u_int32_t ipaddr, u_int16_t port,
+			    const char *dptr)
+{
+	struct ip_conntrack_expect *exp;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	int ret;
+
+	exp = ip_conntrack_expect_alloc(ct);
+	if (exp == NULL)
+		return NF_DROP;
+
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.udp.port = 0;
+	exp->tuple.dst.ip = ipaddr;
+	exp->tuple.dst.u.udp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_UDP;
+
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.udp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.udp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+
+	exp->expectfn = NULL;
+	exp->flags = 0;
+
+	if (ip_nat_sdp_hook)
+		ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr);
+	else {
+		if (ip_conntrack_expect_related(exp) != 0)
+			ret = NF_DROP;
+		else
+			ret = NF_ACCEPT;
+	}
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
+static int sip_help(struct sk_buff **pskb,
+		    struct ip_conntrack *ct,
+		    enum ip_conntrack_info ctinfo)
+{
+	unsigned int dataoff, datalen;
+	const char *dptr;
+	int ret = NF_ACCEPT;
+	int matchoff, matchlen;
+	u_int32_t ipaddr;
+	u_int16_t port;
+
+	/* No Data ? */
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+	if (dataoff >= (*pskb)->len) {
+		DEBUGP("skb->len = %u\n", (*pskb)->len);
+		return NF_ACCEPT;
+        }
+
+	ip_ct_refresh(ct, *pskb, sip_timeout * HZ);
+
+	if (!skb_is_nonlinear(*pskb))
+		dptr = (*pskb)->data + dataoff;
+	else {
+		DEBUGP("Copy of skbuff not supported yet.\n");
+		goto out;
+	}
+
+	if (ip_nat_sip_hook) {
+		if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) {
+			ret = NF_DROP;
+			goto out;
+		}
+	}
+
+	/* After this point NAT, could have mangled skb, so
+	   we need to recalculate payload lenght. */
+	datalen = (*pskb)->len - dataoff;
+
+	if (datalen < (sizeof("SIP/2.0 200") - 1))
+		goto out;
+
+	/* RTP info only in some SDP pkts */
+	if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 &&
+	    memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) {
+		goto out;
+	}
+	/* Get ip and port address from SDP packet. */
+	if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
+	                    &ct_sip_hdrs[POS_CONNECTION]) > 0) {
+
+		/* We'll drop only if there are parse problems. */
+		if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr,
+		                 dptr + datalen) < 0) {
+			ret = NF_DROP;
+			goto out;
+		}
+		if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
+		                    &ct_sip_hdrs[POS_MEDIA]) > 0) {
+
+			port = simple_strtoul(dptr + matchoff, NULL, 10);
+			if (port < 1024) {
+				ret = NF_DROP;
+				goto out;
+			}
+			ret = set_expected_rtp(pskb, ct, ctinfo,
+					       ipaddr, port, dptr);
+		}
+	}
+out:
+	return ret;
+}
+
+static struct ip_conntrack_helper sip[MAX_PORTS];
+static char sip_names[MAX_PORTS][10];
+
+static void fini(void)
+{
+	int i;
+	for (i = 0; i < ports_c; i++) {
+		DEBUGP("unregistering helper for port %d\n", ports[i]);
+		ip_conntrack_helper_unregister(&sip[i]);
+	}
+}
+
+static int __init init(void)
+{
+	int i, ret;
+	char *tmpname;
+
+	if (ports_c == 0)
+		ports[ports_c++] = SIP_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		/* Create helper structure */
+		memset(&sip[i], 0, sizeof(struct ip_conntrack_helper));
+
+		sip[i].tuple.dst.protonum = IPPROTO_UDP;
+		sip[i].tuple.src.u.udp.port = htons(ports[i]);
+		sip[i].mask.src.u.udp.port = 0xFFFF;
+		sip[i].mask.dst.protonum = 0xFF;
+		sip[i].max_expected = 1;
+		sip[i].timeout = 3 * 60; /* 3 minutes */
+		sip[i].me = THIS_MODULE;
+		sip[i].help = sip_help;
+
+		tmpname = &sip_names[i][0];
+		if (ports[i] == SIP_PORT)
+			sprintf(tmpname, "sip");
+		else
+			sprintf(tmpname, "sip-%d", i);
+		sip[i].name = tmpname;
+
+		DEBUGP("port #%d: %d\n", i, ports[i]);
+
+		ret = ip_conntrack_helper_register(&sip[i]);
+		if (ret) {
+			printk("ERROR registering helper for port %d\n",
+				ports[i]);
+			fini();
+			return ret;
+		}
+	}
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 929d61f7be91..88445aac3f28 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		return -ENOSPC;
 #endif
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+	if (seq_printf(s, "secmark=%u ", conntrack->secmark))
+		return -ENOSPC;
+#endif
+
 	if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
 		return -ENOSPC;
 
@@ -417,7 +422,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum,
 
 	/* This is where we call the helper: as the packet goes out. */
 	ct = ip_conntrack_get(*pskb, &ctinfo);
-	if (ct && ct->helper) {
+	if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) {
 		unsigned int ret;
 		ret = ct->helper->help(pskb, ct, ctinfo);
 		if (ret != NF_ACCEPT)
@@ -564,6 +569,8 @@ extern unsigned int ip_ct_generic_timeout;
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
+int ip_conntrack_checksum = 1;
+
 static struct ctl_table_header *ip_ct_sysctl_header;
 
 static ctl_table ip_ct_sysctl_table[] = {
@@ -592,6 +599,14 @@ static ctl_table ip_ct_sysctl_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_CHECKSUM,
+		.procname	= "ip_conntrack_checksum",
+		.data		= &ip_conntrack_checksum,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
 		.procname	= "ip_conntrack_tcp_timeout_syn_sent",
 		.data		= &ip_ct_tcp_timeout_syn_sent,
@@ -946,6 +961,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
 EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+EXPORT_SYMBOL_GPL(ip_conntrack_checksum);
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c
index d45663d137a7..419b878fb467 100644
--- a/net/ipv4/netfilter/ip_nat_helper_h323.c
+++ b/net/ipv4/netfilter/ip_nat_helper_h323.c
@@ -487,6 +487,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
 }
 
 /****************************************************************************/
+static void ip_nat_callforwarding_expect(struct ip_conntrack *new,
+					 struct ip_conntrack_expect *this)
+{
+	struct ip_nat_range range;
+
+	/* This must be a fresh one. */
+	BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+	/* Change src to where master sends to */
+	range.flags = IP_NAT_RANGE_MAP_IPS;
+	range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
+
+	/* hook doesn't matter, but it has to do source manip */
+	ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
+
+	/* For DST manip, map port here to where it's expected. */
+	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
+	range.min = range.max = this->saved_proto;
+	range.min_ip = range.max_ip = this->saved_ip;
+
+	/* hook doesn't matter, but it has to do destination manip */
+	ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
+
+	ip_conntrack_q931_expect(new, this);
+}
+
+/****************************************************************************/
+static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct,
+			      enum ip_conntrack_info ctinfo,
+			      unsigned char **data, int dataoff,
+			      TransportAddress * addr, u_int16_t port,
+			      struct ip_conntrack_expect *exp)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port;
+
+	/* Set expectations for NAT */
+	exp->saved_ip = exp->tuple.dst.ip;
+	exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_callforwarding_expect;
+	exp->dir = !dir;
+
+	/* Try to get same port: if not, try to change it. */
+	for (nated_port = port; nated_port != 0; nated_port++) {
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			printk("ip_nat_q931: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (!set_h225_addr(pskb, data, dataoff, addr,
+			   ct->tuplehash[!dir].tuple.dst.ip,
+			   nated_port) == 0) {
+		ip_conntrack_unexpect_related(exp);
+		return -1;
+	}
+
+	/* Success */
+	DEBUGP("ip_nat_q931: expect Call Forwarding "
+	       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
+	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
 static int __init init(void)
 {
 	BUG_ON(set_h245_addr_hook != NULL);
@@ -496,6 +570,7 @@ static int __init init(void)
 	BUG_ON(nat_rtp_rtcp_hook != NULL);
 	BUG_ON(nat_t120_hook != NULL);
 	BUG_ON(nat_h245_hook != NULL);
+	BUG_ON(nat_callforwarding_hook != NULL);
 	BUG_ON(nat_q931_hook != NULL);
 
 	set_h245_addr_hook = set_h245_addr;
@@ -505,6 +580,7 @@ static int __init init(void)
 	nat_rtp_rtcp_hook = nat_rtp_rtcp;
 	nat_t120_hook = nat_t120;
 	nat_h245_hook = nat_h245;
+	nat_callforwarding_hook = nat_callforwarding;
 	nat_q931_hook = nat_q931;
 
 	DEBUGP("ip_nat_h323: init success\n");
@@ -521,6 +597,7 @@ static void __exit fini(void)
 	nat_rtp_rtcp_hook = NULL;
 	nat_t120_hook = NULL;
 	nat_h245_hook = NULL;
+	nat_callforwarding_hook = NULL;
 	nat_q931_hook = NULL;
 	synchronize_net();
 }
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c
new file mode 100644
index 000000000000..6ffba63adca2
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_sip.c
@@ -0,0 +1,249 @@
+/* SIP extension for UDP NAT alteration.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_nat_ftp.c and other modules.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP NAT helper");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+extern struct sip_header_nfo ct_sip_hdrs[];
+
+static unsigned int mangle_sip_packet(struct sk_buff **pskb,
+				      enum ip_conntrack_info ctinfo,
+				      struct ip_conntrack *ct,
+				      const char **dptr, size_t dlen,
+				      char *buffer, int bufflen,
+				      struct sip_header_nfo *hnfo)
+{
+	unsigned int matchlen, matchoff;
+
+	if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0)
+		return 0;
+
+	if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
+	                              matchoff, matchlen, buffer, bufflen))
+		return 0;
+
+	/* We need to reload this. Thanks Patrick. */
+	*dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+	return 1;
+}
+
+static unsigned int ip_nat_sip(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       struct ip_conntrack *ct,
+			       const char **dptr)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
+	unsigned int bufflen, dataoff;
+	u_int32_t ip;
+	u_int16_t port;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+
+	ip   = ct->tuplehash[!dir].tuple.dst.ip;
+	port = ct->tuplehash[!dir].tuple.dst.u.udp.port;
+	bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port));
+
+	/* short packet ? */
+	if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1))
+		return 0;
+
+	/* Basic rules: requests and responses. */
+	if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) {
+		const char *aux;
+
+		if ((ctinfo) < IP_CT_IS_REPLY) {
+			mangle_sip_packet(pskb, ctinfo, ct, dptr,
+			                  (*pskb)->len - dataoff,
+			                  buffer, bufflen,
+			                  &ct_sip_hdrs[POS_CONTACT]);
+			return 1;
+		}
+
+		if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
+				       (*pskb)->len - dataoff,
+		                       buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
+			return 0;
+
+		/* This search should ignore case, but later.. */
+		aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1,
+		                    (*pskb)->len - dataoff);
+		if (!aux)
+			return 0;
+
+		if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"),
+		    ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff)))
+			return 1;
+
+		return mangle_sip_packet(pskb, ctinfo, ct, dptr,
+					 (*pskb)->len - dataoff,
+		                         buffer, bufflen,
+					 &ct_sip_hdrs[POS_CONTACT]);
+	}
+	if ((ctinfo) < IP_CT_IS_REPLY) {
+		if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
+				       (*pskb)->len - dataoff,
+		                       buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
+			return 0;
+
+		/* Mangle Contact if exists only. - watch udp_nat_mangle()! */
+		mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff,
+		                  buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]);
+		return 1;
+	}
+	/* This mangle requests headers. */
+	return mangle_sip_packet(pskb, ctinfo, ct, dptr,
+	                         ct_sip_lnlen(*dptr,
+				              *dptr + (*pskb)->len - dataoff),
+	                         buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]);
+}
+
+static int mangle_content_len(struct sk_buff **pskb,
+			      enum ip_conntrack_info ctinfo,
+			      struct ip_conntrack *ct,
+			      const char *dptr)
+{
+	unsigned int dataoff, matchoff, matchlen;
+	char buffer[sizeof("65536")];
+	int bufflen;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+
+	/* Get actual SDP lenght */
+	if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
+	                    &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) {
+
+		/* since ct_sip_get_info() give us a pointer passing 'v='
+		   we need to add 2 bytes in this count. */
+		int c_len = (*pskb)->len - dataoff - matchoff + 2;
+
+		/* Now, update SDP lenght */
+		if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
+		                    &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) {
+
+			bufflen = sprintf(buffer, "%u", c_len);
+
+			return ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
+							matchoff, matchlen,
+							buffer, bufflen);
+		}
+	}
+	return 0;
+}
+
+static unsigned int mangle_sdp(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       struct ip_conntrack *ct,
+			       u_int32_t newip, u_int16_t port,
+			       const char *dptr)
+{
+	char buffer[sizeof("nnn.nnn.nnn.nnn")];
+	unsigned int dataoff, bufflen;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+
+	/* Mangle owner and contact info. */
+	bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip));
+	if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+	                       buffer, bufflen, &ct_sip_hdrs[POS_OWNER]))
+		return 0;
+
+	if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+	                       buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION]))
+		return 0;
+
+	/* Mangle media port. */
+	bufflen = sprintf(buffer, "%u", port);
+	if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+	                       buffer, bufflen, &ct_sip_hdrs[POS_MEDIA]))
+		return 0;
+
+	return mangle_content_len(pskb, ctinfo, ct, dptr);
+}
+
+/* So, this packet has hit the connection tracking matching code.
+   Mangle it, and change the expectation to match the new version. */
+static unsigned int ip_nat_sdp(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       struct ip_conntrack_expect *exp,
+			       const char *dptr)
+{
+	struct ip_conntrack *ct = exp->master;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	u_int32_t newip;
+	u_int16_t port;
+
+	DEBUGP("ip_nat_sdp():\n");
+
+	/* Connection will come from reply */
+	newip = ct->tuplehash[!dir].tuple.dst.ip;
+
+	exp->tuple.dst.ip = newip;
+	exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
+	exp->dir = !dir;
+
+	/* When you see the packet, we need to NAT it the same as the
+	   this one. */
+	exp->expectfn = ip_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) {
+		exp->tuple.dst.u.udp.port = htons(port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (port == 0)
+		return NF_DROP;
+
+	if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) {
+		ip_conntrack_unexpect_related(exp);
+		return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+
+static void __exit fini(void)
+{
+	ip_nat_sip_hook = NULL;
+	ip_nat_sdp_hook = NULL;
+	/* Make sure noone calls it, meanwhile. */
+	synchronize_net();
+}
+
+static int __init init(void)
+{
+	BUG_ON(ip_nat_sip_hook);
+	BUG_ON(ip_nat_sdp_hook);
+	ip_nat_sip_hook = ip_nat_sip;
+	ip_nat_sdp_hook = ip_nat_sdp;
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index c33244263b90..d20d557f915a 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void)
 module_init(ip_nat_snmp_basic_init);
 module_exit(ip_nat_snmp_basic_fini);
 
-module_param(debug, bool, 0600);
+module_param(debug, int, 0600);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index aad9d28c8d71..dbc83c5d7aa6 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
 	struct iphdr *iph = skb->nh.iph;
 	unsigned long hashval;
 	u_int16_t sport, dport;
-	struct tcphdr *th;
-	struct udphdr *uh;
-	struct icmphdr *ih;
+	u_int16_t *ports;
 
 	switch (iph->protocol) {
 	case IPPROTO_TCP:
-		th = (void *)iph+iph->ihl*4;
-		sport = ntohs(th->source);
-		dport = ntohs(th->dest);
-		break;
 	case IPPROTO_UDP:
-		uh = (void *)iph+iph->ihl*4;
-		sport = ntohs(uh->source);
-		dport = ntohs(uh->dest);
-		break;
+	case IPPROTO_SCTP:
+	case IPPROTO_DCCP:
 	case IPPROTO_ICMP:
-		ih = (void *)iph+iph->ihl*4;
-		sport = ntohs(ih->un.echo.id);
-		dport = (ih->type<<8)|ih->code;
+		ports = (void *)iph+iph->ihl*4;
+		sport = ports[0];
+		dport = ports[1];
 		break;
 	default:
 		if (net_ratelimit()) {
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 0bba3c2bb786..431a3ce6f7b7 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	/* This packet will not be the same as the other: clear nf fields */
 	nf_reset(nskb);
 	nskb->nfmark = 0;
+	skb_init_secmark(nskb);
 
 	tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
 
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 7c6836c4646e..92980ab8ce48 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -28,9 +28,6 @@
 #include <linux/jhash.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/sctp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/list.h>
@@ -83,6 +80,7 @@ struct ipt_hashlimit_htable {
 	/* used internally */
 	spinlock_t lock;		/* lock for list_head */
 	u_int32_t rnd;			/* random seed for hash */
+	int rnd_initialized;
 	struct timer_list timer;	/* timer for gc */
 	atomic_t count;			/* number entries in table */
 
@@ -137,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
 
 	/* initialize hash with random val at the time we allocate
 	 * the first hashtable entry */
-	if (!ht->rnd)
+	if (!ht->rnd_initialized) {
 		get_random_bytes(&ht->rnd, 4);
+		ht->rnd_initialized = 1;
+	}
 
 	if (ht->cfg.max &&
 	    atomic_read(&ht->count) >= ht->cfg.max) {
@@ -217,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
 
 	atomic_set(&hinfo->count, 0);
 	atomic_set(&hinfo->use, 1);
-	hinfo->rnd = 0;
+	hinfo->rnd_initialized = 0;
 	spin_lock_init(&hinfo->lock);
 	hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir);
 	if (!hinfo->pde) {
@@ -381,49 +381,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
 		dh->rateinfo.credit = dh->rateinfo.credit_cap;
 }
 
-static inline int get_ports(const struct sk_buff *skb, int offset, 
-			    u16 ports[2])
-{
-	union {
-		struct tcphdr th;
-		struct udphdr uh;
-		sctp_sctphdr_t sctph;
-	} hdr_u, *ptr_u;
-
-	/* Must not be a fragment. */
-	if (offset)
-		return 1;
-
-	/* Must be big enough to read ports (both UDP and TCP have
-	   them at the start). */
-	ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); 
-	if (!ptr_u)
-		return 1;
-
-	switch (skb->nh.iph->protocol) {
-		case IPPROTO_TCP:
-			ports[0] = ptr_u->th.source;
-			ports[1] = ptr_u->th.dest;
-			break;
-		case IPPROTO_UDP:
-			ports[0] = ptr_u->uh.source;
-			ports[1] = ptr_u->uh.dest;
-			break;
-		case IPPROTO_SCTP:
-			ports[0] = ptr_u->sctph.source;
-			ports[1] = ptr_u->sctph.dest;
-			break;
-		default:
-			/* all other protocols don't supprot per-port hash
-			 * buckets */
-			ports[0] = ports[1] = 0;
-			break;
-	}
-
-	return 0;
-}
-
-
 static int
 hashlimit_match(const struct sk_buff *skb,
 		const struct net_device *in,
@@ -449,8 +406,22 @@ hashlimit_match(const struct sk_buff *skb,
 		dst.src_ip = skb->nh.iph->saddr;
 	if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT
 	    ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) {
-		u_int16_t ports[2];
-		if (get_ports(skb, offset, ports)) {
+		u_int16_t _ports[2], *ports;
+
+		switch (skb->nh.iph->protocol) {
+		case IPPROTO_TCP:
+		case IPPROTO_UDP:
+		case IPPROTO_SCTP:
+		case IPPROTO_DCCP:
+			ports = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+						   sizeof(_ports), &_ports);
+			break;
+		default:
+			_ports[0] = _ports[1] = 0;
+			ports = _ports;
+			break;
+		}
+		if (!ports) {
 			/* We've been asked to examine this packet, and we
 		 	  can't.  Hence, no choice but to drop. */
 			*hotdrop = 1;
@@ -561,7 +532,7 @@ static void
 hashlimit_destroy(const struct xt_match *match, void *matchinfo,
 		  unsigned int matchsize)
 {
-	struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo;
+	struct ipt_hashlimit_info *r = matchinfo;
 
 	htable_put(r->hinfo);
 }
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index b847ee409efb..61a2139f9cfd 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -1,1007 +1,499 @@
-/* Kernel module to check if the source address has been seen recently. */
-/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */
-/* Author: Stephen Frost <sfrost@snowman.net> */
-/* Project Page: http://snowman.net/projects/ipt_recent/ */
-/* This software is distributed under the terms of the GPL, Version 2 */
-/* This copyright does not cover user programs that use kernel services
- * by normal system calls. */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This is a replacement of the old ipt_recent module, which carried the
+ * following copyright notice:
+ *
+ * Author: Stephen Frost <sfrost@snowman.net>
+ * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
+ */
+#include <linux/init.h>
+#include <linux/moduleparam.h>
 #include <linux/proc_fs.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <asm/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
 #include <linux/ctype.h>
-#include <linux/ip.h>
-#include <linux/vmalloc.h>
-#include <linux/moduleparam.h>
+#include <linux/list.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
 
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_recent.h>
 
-#undef DEBUG
-#define HASH_LOG 9
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("IP tables recently seen matching module");
+MODULE_LICENSE("GPL");
 
-/* Defaults, these can be overridden on the module command-line. */
 static unsigned int ip_list_tot = 100;
 static unsigned int ip_pkt_list_tot = 20;
 static unsigned int ip_list_hash_size = 0;
 static unsigned int ip_list_perms = 0644;
-#ifdef DEBUG
-static int debug = 1;
-#endif
-
-static char version[] =
-KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>.  http://snowman.net/projects/ipt_recent/\n";
-
-MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>");
-MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER);
-MODULE_LICENSE("GPL");
 module_param(ip_list_tot, uint, 0400);
 module_param(ip_pkt_list_tot, uint, 0400);
 module_param(ip_list_hash_size, uint, 0400);
 module_param(ip_list_perms, uint, 0400);
-#ifdef DEBUG
-module_param(debug, bool, 0600);
-MODULE_PARM_DESC(debug,"enable debugging output");
-#endif
-MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list");
-MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember");
-MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs");
-MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files");
-
-/* Structure of our list of recently seen addresses. */
-struct recent_ip_list {
-	u_int32_t addr;
-	u_int8_t  ttl;
-	unsigned long last_seen;
-	unsigned long *last_pkts;
-	u_int32_t oldest_pkt;
-	u_int32_t hash_entry;
-	u_int32_t time_pos;
-};
-
-struct time_info_list {
-	u_int32_t position;
-	u_int32_t time;
+MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
+MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)");
+MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
+MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
+
+
+struct recent_entry {
+	struct list_head	list;
+	struct list_head	lru_list;
+	u_int32_t		addr;
+	u_int8_t		ttl;
+	u_int8_t		index;
+	u_int16_t		nstamps;
+	unsigned long		stamps[0];
 };
 
-/* Structure of our linked list of tables of recent lists. */
-struct recent_ip_tables {
-	char name[IPT_RECENT_NAME_LEN];
-	int count;
-	int time_pos;
-	struct recent_ip_list *table;
-	struct recent_ip_tables *next;
-	spinlock_t list_lock;
-	int *hash_table;
-	struct time_info_list *time_info;
+struct recent_table {
+	struct list_head	list;
+	char			name[IPT_RECENT_NAME_LEN];
 #ifdef CONFIG_PROC_FS
-	struct proc_dir_entry *status_proc;
-#endif /* CONFIG_PROC_FS */
+	struct proc_dir_entry	*proc;
+#endif
+	unsigned int		refcnt;
+	unsigned int		entries;
+	struct list_head	lru_list;
+	struct list_head	iphash[0];
 };
 
-/* Our current list of addresses we have recently seen.
- * Only added to on a --set, and only updated on --set || --update 
- */
-static struct recent_ip_tables *r_tables = NULL;
-
-/* We protect r_list with this spinlock so two processors are not modifying
- * the list at the same time. 
- */
+static LIST_HEAD(tables);
 static DEFINE_SPINLOCK(recent_lock);
+static DEFINE_MUTEX(recent_mutex);
 
 #ifdef CONFIG_PROC_FS
-/* Our /proc/net/ipt_recent entry */
-static struct proc_dir_entry *proc_net_ipt_recent = NULL;
-#endif
-
-/* Function declaration for later. */
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const struct xt_match *match,
-      const void *matchinfo,
-      int offset,
-      unsigned int protoff,
-      int *hotdrop);
-
-/* Function to hash a given address into the hash table of table_size size */
-static int hash_func(unsigned int addr, int table_size)
-{
-	int result = 0;
-	unsigned int value = addr;
-	do { result ^= value; } while((value >>= HASH_LOG));
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n",
-			 result & (table_size - 1),
-			 addr,
-			 table_size);
+static struct proc_dir_entry	*proc_dir;
+static struct file_operations	recent_fops;
 #endif
 
-	return(result & (table_size - 1));
-}
+static u_int32_t hash_rnd;
+static int hash_rnd_initted;
 
-#ifdef CONFIG_PROC_FS
-/* This is the function which produces the output for our /proc output
- * interface which lists each IP address, the last seen time and the 
- * other recent times the address was seen.
- */
-
-static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
+static unsigned int recent_entry_hash(u_int32_t addr)
 {
-	int len = 0, count, last_len = 0, pkt_count;
-	off_t pos = 0;
-	off_t begin = 0;
-	struct recent_ip_tables *curr_table;
-
-	curr_table = (struct recent_ip_tables*) data;
-
-	spin_lock_bh(&curr_table->list_lock);
-	for(count = 0; count < ip_list_tot; count++) {
-		if(!curr_table->table[count].addr) continue;
-		last_len = len;
-		len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr));
-		len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl);
-		len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen);
-		len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt);
-		len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]);
-		for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) {
-			if(!curr_table->table[count].last_pkts[pkt_count]) break;
-			len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]);
-		}
-		len += sprintf(buffer+len,"\n");
-		pos = begin + len;
-		if(pos < offset) { len = 0; begin = pos; }
-		if(pos > offset + length) { len = last_len; break; }
+	if (!hash_rnd_initted) {
+		get_random_bytes(&hash_rnd, 4);
+		hash_rnd_initted = 1;
 	}
-
-	*start = buffer + (offset - begin);
-	len -= (offset - begin);
-	if(len > length) len = length;
-
-	spin_unlock_bh(&curr_table->list_lock);
-	return len;
+	return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1);
 }
 
-/* ip_recent_ctrl provides an interface for users to modify the table
- * directly.  This allows adding entries, removing entries, and
- * flushing the entire table.
- * This is done by opening up the appropriate table for writing and
- * sending one of:
- * xx.xx.xx.xx   -- Add entry to table with current time
- * +xx.xx.xx.xx  -- Add entry to table with current time
- * -xx.xx.xx.xx  -- Remove entry from table
- * clear         -- Flush table, remove all entries
- */
-
-static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data)
+static struct recent_entry *
+recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl)
 {
-	static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff };
-	u_int32_t val;
-	int base, used = 0;
-	char c, *cp;
-	union iaddr {
-		uint8_t bytes[4];
-		uint32_t word;
-	} res;
-	uint8_t *pp = res.bytes;
-	int digit;
-
-	char buffer[20];
-	int len, check_set = 0, count;
-	u_int32_t addr = 0;
-	struct sk_buff *skb;
-	struct ipt_recent_info *info;
-	struct recent_ip_tables *curr_table;
-
-	curr_table = (struct recent_ip_tables*) data;
-
-	if(size > 20) len = 20; else len = size;
-
-	if(copy_from_user(buffer,input,len)) return -EFAULT;
-
-	if(len < 20) buffer[len] = '\0';
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer);
-#endif
+	struct recent_entry *e;
+	unsigned int h;
+
+	h = recent_entry_hash(addr);
+	list_for_each_entry(e, &table->iphash[h], list)
+		if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl))
+			return e;
+	return NULL;
+}
 
-	cp = buffer;
-	while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; }
+static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
+{
+	list_del(&e->list);
+	list_del(&e->lru_list);
+	kfree(e);
+	t->entries--;
+}
 
-	/* Check if we are asked to flush the entire table */
-	if(!memcmp(cp,"clear",5)) {
-		used += 5;
-		spin_lock_bh(&curr_table->list_lock);
-		curr_table->time_pos = 0;
-		for(count = 0; count < ip_list_hash_size; count++) {
-			curr_table->hash_table[count] = -1;
-		}
-		for(count = 0; count < ip_list_tot; count++) {
-			curr_table->table[count].last_seen = 0;
-			curr_table->table[count].addr = 0;
-			curr_table->table[count].ttl = 0;
-			memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
-			curr_table->table[count].oldest_pkt = 0;
-			curr_table->table[count].time_pos = 0;
-			curr_table->time_info[count].position = count;
-			curr_table->time_info[count].time = 0;
-		}
-		spin_unlock_bh(&curr_table->list_lock);
-		return used;
-	}
+static struct recent_entry *
+recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl)
+{
+	struct recent_entry *e;
 
-        check_set = IPT_RECENT_SET;
-	switch(*cp) {
-		case '+': check_set = IPT_RECENT_SET; cp++; used++; break;
-		case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break;
-		default: if(!isdigit(*cp)) return (used+1); break;
+	if (t->entries >= ip_list_tot) {
+		e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
+		recent_entry_remove(t, e);
 	}
+	e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot,
+		    GFP_ATOMIC);
+	if (e == NULL)
+		return NULL;
+	e->addr      = addr;
+	e->ttl       = ttl;
+	e->stamps[0] = jiffies;
+	e->nstamps   = 1;
+	e->index     = 1;
+	list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]);
+	list_add_tail(&e->lru_list, &t->lru_list);
+	t->entries++;
+	return e;
+}
 
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set);
-#endif
-	/* Get addr (effectively inet_aton()) */
-	/* Shamelessly stolen from libc, a function in the kernel for doing
-	 * this would, of course, be greatly preferred, but our options appear
-	 * to be rather limited, so we will just do it ourselves here.
-	 */
-	res.word = 0;
-
-	c = *cp;
-	for(;;) {
-		if(!isdigit(c)) return used;
-		val = 0; base = 10; digit = 0;
-		if(c == '0') {
-			c = *++cp;
-			if(c == 'x' || c == 'X') base = 16, c = *++cp;
-			else { base = 8; digit = 1; }
-		}
-		for(;;) {
-			if(isascii(c) && isdigit(c)) {
-				if(base == 8 && (c == '8' || c == '0')) return used;
-				val = (val * base) + (c - '0');
-				c = *++cp;
-				digit = 1;
-			} else if(base == 16 && isascii(c) && isxdigit(c)) {
-				val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A'));
-				c = *++cp;
-				digit = 1;
-			} else break;
-		}
-		if(c == '.') {
-			if(pp > res.bytes + 2 || val > 0xff) return used;
-			*pp++ = val;
-			c = *++cp;
-		} else break;
-	}
-	used = cp - buffer;
-	if(c != '\0' && (!isascii(c) || !isspace(c))) return used;
-	if(c == '\n') used++;
-	if(!digit) return used;
+static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
+{
+	e->stamps[e->index++] = jiffies;
+	if (e->index > e->nstamps)
+		e->nstamps = e->index;
+	e->index %= ip_pkt_list_tot;
+	list_move_tail(&e->lru_list, &t->lru_list);
+}
 
-	if(val > max[pp - res.bytes]) return used;
-	addr = res.word | htonl(val);
+static struct recent_table *recent_table_lookup(const char *name)
+{
+	struct recent_table *t;
 
-	if(!addr && check_set == IPT_RECENT_SET) return used;
+	list_for_each_entry(t, &tables, list)
+		if (!strcmp(t->name, name))
+			return t;
+	return NULL;
+}
 
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used);
-#endif
+static void recent_table_flush(struct recent_table *t)
+{
+	struct recent_entry *e, *next;
+	unsigned int i;
 
-	/* Set up and just call match */
-	info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL);
-	if(!info) { return -ENOMEM; }
-	info->seconds = 0;
-	info->hit_count = 0;
-	info->check_set = check_set;
-	info->invert = 0;
-	info->side = IPT_RECENT_SOURCE;
-	strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN);
-	info->name[IPT_RECENT_NAME_LEN-1] = '\0';
-
-	skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL);
-	if (!skb) {
-		used = -ENOMEM;
-		goto out_free_info;
-	}
-	skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL);
-	if (!skb->nh.iph) {
-		used = -ENOMEM;
-		goto out_free_skb;
+	for (i = 0; i < ip_list_hash_size; i++) {
+		list_for_each_entry_safe(e, next, &t->iphash[i], list)
+			recent_entry_remove(t, e);
 	}
-
-	skb->nh.iph->saddr = addr;
-	skb->nh.iph->daddr = 0;
-	/* Clear ttl since we have no way of knowing it */
-	skb->nh.iph->ttl = 0;
-	match(skb,NULL,NULL,NULL,info,0,0,NULL);
-
-	kfree(skb->nh.iph);
-out_free_skb:
-	kfree(skb);
-out_free_info:
-	kfree(info);
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used);
-#endif
-	return used;
 }
 
-#endif /* CONFIG_PROC_FS */
-
-/* 'match' is our primary function, called by the kernel whenever a rule is
- * hit with our module as an option to it.
- * What this function does depends on what was specifically asked of it by
- * the user:
- * --set -- Add or update last seen time of the source address of the packet
- *   -- matchinfo->check_set == IPT_RECENT_SET
- * --rcheck -- Just check if the source address is in the list
- *   -- matchinfo->check_set == IPT_RECENT_CHECK
- * --update -- If the source address is in the list, update last_seen
- *   -- matchinfo->check_set == IPT_RECENT_UPDATE
- * --remove -- If the source address is in the list, remove it
- *   -- matchinfo->check_set == IPT_RECENT_REMOVE
- * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds
- *   -- matchinfo->seconds
- * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times
- *   -- matchinfo->hit_count
- * --seconds and --hitcount can be combined
- */
 static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const struct xt_match *match,
-      const void *matchinfo,
-      int offset,
-      unsigned int protoff,
-      int *hotdrop)
+ipt_recent_match(const struct sk_buff *skb,
+		 const struct net_device *in, const struct net_device *out,
+		 const struct xt_match *match, const void *matchinfo,
+		 int offset, unsigned int protoff, int *hotdrop)
 {
-	int pkt_count, hits_found, ans;
-	unsigned long now;
 	const struct ipt_recent_info *info = matchinfo;
-	u_int32_t addr = 0, time_temp;
-	u_int8_t ttl = skb->nh.iph->ttl;
-	int *hash_table;
-	int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1;
-	struct time_info_list *time_info;
-	struct recent_ip_tables *curr_table;
-	struct recent_ip_tables *last_table;
-	struct recent_ip_list *r_list;
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n");
-#endif
-
-	/* Default is false ^ info->invert */
-	ans = info->invert;
+	struct recent_table *t;
+	struct recent_entry *e;
+	u_int32_t addr;
+	u_int8_t ttl;
+	int ret = info->invert;
 
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name);
-#endif
+	if (info->side == IPT_RECENT_DEST)
+		addr = skb->nh.iph->daddr;
+	else
+		addr = skb->nh.iph->saddr;
 
-	/* if out != NULL then routing has been done and TTL changed.
-	 * We change it back here internally for match what came in before routing. */
-	if(out) ttl++;
+	ttl = skb->nh.iph->ttl;
+	/* use TTL as seen before forwarding */
+	if (out && !skb->sk)
+		ttl++;
 
-	/* Find the right table */
 	spin_lock_bh(&recent_lock);
-	curr_table = r_tables;
-	while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) );
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name);
-#endif
-
-	spin_unlock_bh(&recent_lock);
-
-	/* Table with this name not found, match impossible */
-	if(!curr_table) { return ans; }
-
-	/* Make sure no one is changing the list while we work with it */
-	spin_lock_bh(&curr_table->list_lock);
-
-	r_list = curr_table->table;
-	if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr;
-
-	if(!addr) { 
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr);
-#endif
-		spin_unlock_bh(&curr_table->list_lock);
-		return ans;
-	}
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl);
-#endif
-
-	/* Get jiffies now in case they changed while we were waiting for a lock */
-	now = jiffies;
-	hash_table = curr_table->hash_table;
-	time_info = curr_table->time_info;
-
-	orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size);
-	/* Hash entry at this result used */
-	/* Check for TTL match if requested.  If TTL is zero then a match would never
-	 * happen, so match regardless of existing TTL in that case.  Zero means the
-	 * entry was added via the /proc interface anyway, so we will just use the
-	 * first TTL we get for that IP address. */
-	if(info->check_set & IPT_RECENT_TTL) {
-		while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr &&
-			(!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) {
-			/* Collision in hash table */
-			hash_result = (hash_result + 1) % ip_list_hash_size;
-		}
-	} else {
-		while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) {
-			/* Collision in hash table */
-			hash_result = (hash_result + 1) % ip_list_hash_size;
-		}
-	}
-
-	if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) {
-		/* IP not in list and not asked to SET */
-		spin_unlock_bh(&curr_table->list_lock);
-		return ans;
-	}
-
-	/* Check if we need to handle the collision, do not need to on REMOVE */
-	if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) {
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n",
-				 orig_hash_result,
-				 hash_result,
-				 r_list[hash_table[orig_hash_result]].addr,
-				 addr);
-#endif
-
-		/* We had a collision.
-		 * orig_hash_result is where we started, hash_result is where we ended up.
-		 * So, swap them because we are likely to see the same guy again sooner */
-#ifdef DEBUG
-		if(debug) {
-		  printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]);
-		  printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n",
-				r_list[hash_table[orig_hash_result]].hash_entry);
-		}
-#endif
-
-		r_list[hash_table[orig_hash_result]].hash_entry = hash_result;
-
-
-		temp = hash_table[orig_hash_result];
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]);
-#endif
-		hash_table[orig_hash_result] = hash_table[hash_result];
-		hash_table[hash_result] = temp;
-		temp = hash_result;
-		hash_result = orig_hash_result;
-		orig_hash_result = temp;
-		time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result];
-		if(hash_table[hash_result] != -1) {
-			r_list[hash_table[hash_result]].hash_entry = hash_result;
-			time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
-		}
-
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n");
-#endif
+	t = recent_table_lookup(info->name);
+	e = recent_entry_lookup(t, addr,
+				info->check_set & IPT_RECENT_TTL ? ttl : 0);
+	if (e == NULL) {
+		if (!(info->check_set & IPT_RECENT_SET))
+			goto out;
+		e = recent_entry_init(t, addr, ttl);
+		if (e == NULL)
+			*hotdrop = 1;
+		ret ^= 1;
+		goto out;
 	}
 
-	if(hash_table[hash_result] == -1) {
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n",
-				 hash_result, addr);
-#endif
-
-		/* New item found and IPT_RECENT_SET, so we need to add it */
-		location = time_info[curr_table->time_pos].position;
-		hash_table[r_list[location].hash_entry] = -1;
-		hash_table[hash_result] = location;
-		memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
-		r_list[location].time_pos = curr_table->time_pos;
-		r_list[location].addr = addr;
-		r_list[location].ttl = ttl;
-		r_list[location].last_seen = now;
-		r_list[location].oldest_pkt = 1;
-		r_list[location].last_pkts[0] = now;
-		r_list[location].hash_entry = hash_result;
-		time_info[curr_table->time_pos].time = r_list[location].last_seen;
-		curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot;
-
-		ans = !info->invert;
-	} else {
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n",
-				 hash_result,
-				 addr);
-#endif
-
-		/* Existing item found */
-		location = hash_table[hash_result];
-		/* We have a match on address, now to make sure it meets all requirements for a
-		 * full match. */
-		if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) {
-			if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert;
-			if(info->seconds && !info->hit_count) {
-				if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert;
-			}
-			if(info->seconds && info->hit_count) {
-				for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
-					if(r_list[location].last_pkts[pkt_count] == 0) break;
-					if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
-				}
-				if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
-			}
-			if(info->hit_count && !info->seconds) {
-				for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
-					if(r_list[location].last_pkts[pkt_count] == 0) break;
-					hits_found++;
-				}
-				if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
+	if (info->check_set & IPT_RECENT_SET)
+		ret ^= 1;
+	else if (info->check_set & IPT_RECENT_REMOVE) {
+		recent_entry_remove(t, e);
+		ret ^= 1;
+	} else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) {
+		unsigned long t = jiffies - info->seconds * HZ;
+		unsigned int i, hits = 0;
+
+		for (i = 0; i < e->nstamps; i++) {
+			if (info->seconds && time_after(t, e->stamps[i]))
+				continue;
+			if (++hits >= info->hit_count) {
+				ret ^= 1;
+				break;
 			}
 		}
-#ifdef DEBUG
-		if(debug) {
-			if(ans)
-				printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr);
-			else
-				printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr);
-		}
-#endif
-
-		/* If and only if we have been asked to SET, or to UPDATE (on match) do we add the
-		 * current timestamp to the last_seen. */
-		if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) {
-#ifdef DEBUG
-			if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n");
-#endif
-			/* Have to update our time info */
-			time_loc = r_list[location].time_pos;
-			time_info[time_loc].time = now;
-			time_info[time_loc].position = location;
-			while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
-				time_temp = time_info[time_loc].time;
-				time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
-				time_info[(time_loc+1)%ip_list_tot].time = time_temp;
-				time_temp = time_info[time_loc].position;
-				time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
-				time_info[(time_loc+1)%ip_list_tot].position = time_temp;
-				r_list[time_info[time_loc].position].time_pos = time_loc;
-				r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
-				time_loc = (time_loc+1) % ip_list_tot;
-			}
-			r_list[location].time_pos = time_loc;
-			r_list[location].ttl = ttl;
-			r_list[location].last_pkts[r_list[location].oldest_pkt] = now;
-			r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot;
-			r_list[location].last_seen = now;
-		}
-		/* If we have been asked to remove the entry from the list, just set it to 0 */
-		if(info->check_set & IPT_RECENT_REMOVE) {
-#ifdef DEBUG
-			if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result);
-#endif
-			/* Check if this is part of a collision chain */
-			while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) {
-				orig_hash_result++;
-				if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) {
-					/* Found collision chain, how deep does this rabbit hole go? */
-#ifdef DEBUG
-					if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n");
-#endif
-					end_collision_chain = orig_hash_result;
-				}
-			}
-			if(end_collision_chain != -1) {
-#ifdef DEBUG
-				if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n");
-#endif
-				/* Part of a collision chain, swap it with the end of the chain
-				 * before removing. */
-				r_list[hash_table[end_collision_chain]].hash_entry = hash_result;
-				temp = hash_table[end_collision_chain];
-				hash_table[end_collision_chain] = hash_table[hash_result];
-				hash_table[hash_result] = temp;
-				time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
-				hash_result = end_collision_chain;
-				r_list[hash_table[hash_result]].hash_entry = hash_result;
-				time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
-			}
-			location = hash_table[hash_result];
-			hash_table[r_list[location].hash_entry] = -1;
-			time_loc = r_list[location].time_pos;
-			time_info[time_loc].time = 0;
-			time_info[time_loc].position = location;
-			while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
-				time_temp = time_info[time_loc].time;
-				time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
-				time_info[(time_loc+1)%ip_list_tot].time = time_temp;
-				time_temp = time_info[time_loc].position;
-				time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
-				time_info[(time_loc+1)%ip_list_tot].position = time_temp;
-				r_list[time_info[time_loc].position].time_pos = time_loc;
-				r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
-				time_loc = (time_loc+1) % ip_list_tot;
-			}
-			r_list[location].time_pos = time_loc;
-			r_list[location].last_seen = 0;
-			r_list[location].addr = 0;
-			r_list[location].ttl = 0;
-			memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
-			r_list[location].oldest_pkt = 0;
-			ans = !info->invert;
-		}
-		spin_unlock_bh(&curr_table->list_lock);
-		return ans;
 	}
 
-	spin_unlock_bh(&curr_table->list_lock);
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n");
-#endif
-	return ans;
+	if (info->check_set & IPT_RECENT_SET ||
+	    (info->check_set & IPT_RECENT_UPDATE && ret)) {
+		recent_entry_update(t, e);
+		e->ttl = ttl;
+	}
+out:
+	spin_unlock_bh(&recent_lock);
+	return ret;
 }
 
-/* This function is to verify that the rule given during the userspace iptables
- * command is correct.
- * If the command is valid then we check if the table name referred to by the
- * rule exists, if not it is created.
- */
 static int
-checkentry(const char *tablename,
-           const void *ip,
-	   const struct xt_match *match,
-           void *matchinfo,
-           unsigned int matchsize,
-           unsigned int hook_mask)
+ipt_recent_checkentry(const char *tablename, const void *ip,
+		      const struct xt_match *match, void *matchinfo,
+		      unsigned int matchsize, unsigned int hook_mask)
 {
-	int flag = 0, c;
-	unsigned long *hold;
 	const struct ipt_recent_info *info = matchinfo;
-	struct recent_ip_tables *curr_table, *find_table, *last_table;
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n");
-#endif
-
-	/* seconds and hit_count only valid for CHECK/UPDATE */
-	if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; }
-	if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; }
-	if(info->check_set & IPT_RECENT_CHECK) flag++;
-	if(info->check_set & IPT_RECENT_UPDATE) flag++;
-
-	/* One and only one of these should ever be set */
-	if(flag != 1) return 0;
-
-	/* Name must be set to something */
-	if(!info->name || !info->name[0]) return 0;
+	struct recent_table *t;
+	unsigned i;
+	int ret = 0;
 
-	/* Things look good, create a list for this if it does not exist */
-	/* Lock the linked list while we play with it */
-	spin_lock_bh(&recent_lock);
-
-	/* Look for an entry with this name already created */
-	/* Finds the end of the list and the entry before the end if current name does not exist */
-	find_table = r_tables;
-	while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
+	if (hweight8(info->check_set &
+		     (IPT_RECENT_SET | IPT_RECENT_REMOVE |
+		      IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1)
+		return 0;
+	if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) &&
+	    (info->seconds || info->hit_count))
+		return 0;
+	if (info->name[0] == '\0' ||
+	    strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
+		return 0;
 
-	/* If a table already exists just increment the count on that table and return */
-	if(find_table) { 
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name);
-#endif
-		find_table->count++;
-		spin_unlock_bh(&recent_lock);
-		return 1;
+	mutex_lock(&recent_mutex);
+	t = recent_table_lookup(info->name);
+	if (t != NULL) {
+		t->refcnt++;
+		ret = 1;
+		goto out;
 	}
 
-	spin_unlock_bh(&recent_lock);
-
-	/* Table with this name not found */
-	/* Allocate memory for new linked list item */
-
-#ifdef DEBUG
-	if(debug) {
-		printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name);
-		printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables));
+	t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
+		    GFP_KERNEL);
+	if (t == NULL)
+		goto out;
+	t->refcnt = 1;
+	strcpy(t->name, info->name);
+	INIT_LIST_HEAD(&t->lru_list);
+	for (i = 0; i < ip_list_hash_size; i++)
+		INIT_LIST_HEAD(&t->iphash[i]);
+#ifdef CONFIG_PROC_FS
+	t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir);
+	if (t->proc == NULL) {
+		kfree(t);
+		goto out;
 	}
+	t->proc->proc_fops = &recent_fops;
+	t->proc->data      = t;
 #endif
+	spin_lock_bh(&recent_lock);
+	list_add_tail(&t->list, &tables);
+	spin_unlock_bh(&recent_lock);
+	ret = 1;
+out:
+	mutex_unlock(&recent_mutex);
+	return ret;
+}
 
-	curr_table = vmalloc(sizeof(struct recent_ip_tables));
-	if(curr_table == NULL) return 0;
-
-	spin_lock_init(&curr_table->list_lock);
-	curr_table->next = NULL;
-	curr_table->count = 1;
-	curr_table->time_pos = 0;
-	strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN);
-	curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0';
-
-	/* Allocate memory for this table and the list of packets in each entry. */
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n",
-			sizeof(struct recent_ip_list)*ip_list_tot,
-			info->name);
-#endif
-
-	curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot);
-	if(curr_table->table == NULL) { vfree(curr_table); return 0; }
-	memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
-			sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
-#endif
-
-	hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
-#endif
-	if(hold == NULL) { 
-		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n");
-		vfree(curr_table->table); 
-		vfree(curr_table);
-		return 0;
-	}
-	for(c = 0; c < ip_list_tot; c++) {
-		curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot;
-	}
+static void
+ipt_recent_destroy(const struct xt_match *match, void *matchinfo,
+		   unsigned int matchsize)
+{
+	const struct ipt_recent_info *info = matchinfo;
+	struct recent_table *t;
 
-	/* Allocate memory for the hash table */
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n",
-			sizeof(int)*ip_list_hash_size);
+	mutex_lock(&recent_mutex);
+	t = recent_table_lookup(info->name);
+	if (--t->refcnt == 0) {
+		spin_lock_bh(&recent_lock);
+		list_del(&t->list);
+		spin_unlock_bh(&recent_lock);
+		recent_table_flush(t);
+#ifdef CONFIG_PROC_FS
+		remove_proc_entry(t->name, proc_dir);
 #endif
-
-	curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size);
-	if(!curr_table->hash_table) {
-		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n");
-		vfree(hold);
-		vfree(curr_table->table); 
-		vfree(curr_table);
-		return 0;
-	}
-
-	for(c = 0; c < ip_list_hash_size; c++) {
-		curr_table->hash_table[c] = -1;
+		kfree(t);
 	}
+	mutex_unlock(&recent_mutex);
+}
 
-	/* Allocate memory for the time info */
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n",
-			sizeof(struct time_info_list)*ip_list_tot);
-#endif
+#ifdef CONFIG_PROC_FS
+struct recent_iter_state {
+	struct recent_table	*table;
+	unsigned int		bucket;
+};
 
-	curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot);
-	if(!curr_table->time_info) {
-		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n");
-		vfree(curr_table->hash_table);
-		vfree(hold);
-		vfree(curr_table->table); 
-		vfree(curr_table);
-		return 0;
-	}
-	for(c = 0; c < ip_list_tot; c++) {
-		curr_table->time_info[c].position = c;
-		curr_table->time_info[c].time = 0;
-	}
+static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct recent_iter_state *st = seq->private;
+	struct recent_table *t = st->table;
+	struct recent_entry *e;
+	loff_t p = *pos;
 
-	/* Put the new table in place */
 	spin_lock_bh(&recent_lock);
-	find_table = r_tables;
-	while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
-
-	/* If a table already exists just increment the count on that table and return */
-	if(find_table) { 
-		find_table->count++;	
-		spin_unlock_bh(&recent_lock);
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name);
-#endif
-		vfree(curr_table->time_info);
-		vfree(curr_table->hash_table);
-		vfree(hold);
-		vfree(curr_table->table);
-		vfree(curr_table);
-		return 1;
-	}
-	if(!last_table) r_tables = curr_table; else last_table->next = curr_table;
-
-	spin_unlock_bh(&recent_lock);
 
-#ifdef CONFIG_PROC_FS
-	/* Create our proc 'status' entry. */
-	curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent);
-	if (!curr_table->status_proc) {
-		vfree(hold);
-		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n");
-		/* Destroy the created table */
-		spin_lock_bh(&recent_lock);
-		last_table = NULL;
-		curr_table = r_tables;
-		if(!curr_table) {
-#ifdef DEBUG
-			if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n");
-#endif
-			spin_unlock_bh(&recent_lock);
-			return 0;
-		}
-		while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
-		if(!curr_table) {
-#ifdef DEBUG
-			if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n");
-#endif
-			spin_unlock_bh(&recent_lock);
-			return 0;
+	for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) {
+		list_for_each_entry(e, &t->iphash[st->bucket], list) {
+			if (p-- == 0)
+				return e;
 		}
-		if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
-		spin_unlock_bh(&recent_lock);
-		vfree(curr_table->time_info);
-		vfree(curr_table->hash_table);
-		vfree(curr_table->table);
-		vfree(curr_table);
-		return 0;
 	}
-	
-	curr_table->status_proc->owner = THIS_MODULE;
-	curr_table->status_proc->data = curr_table;
-	wmb();
-	curr_table->status_proc->read_proc = ip_recent_get_info;
-	curr_table->status_proc->write_proc = ip_recent_ctrl;
-#endif /* CONFIG_PROC_FS */
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n");
-#endif
+	return NULL;
+}
 
-	return 1;
+static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct recent_iter_state *st = seq->private;
+	struct recent_table *t = st->table;
+	struct recent_entry *e = v;
+	struct list_head *head = e->list.next;
+
+	while (head == &t->iphash[st->bucket]) {
+		if (++st->bucket >= ip_list_hash_size)
+			return NULL;
+		head = t->iphash[st->bucket].next;
+	}
+	(*pos)++;
+	return list_entry(head, struct recent_entry, list);
 }
 
-/* This function is called in the event that a rule matching this module is
- * removed.
- * When this happens we need to check if there are no other rules matching
- * the table given.  If that is the case then we remove the table and clean
- * up its memory.
- */
-static void
-destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize)
+static void recent_seq_stop(struct seq_file *s, void *v)
 {
-	const struct ipt_recent_info *info = matchinfo;
-	struct recent_ip_tables *curr_table, *last_table;
+	spin_unlock_bh(&recent_lock);
+}
 
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n");
-#endif
+static int recent_seq_show(struct seq_file *seq, void *v)
+{
+	struct recent_entry *e = v;
+	unsigned int i;
+
+	i = (e->index - 1) % ip_pkt_list_tot;
+	seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u",
+		   NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index);
+	for (i = 0; i < e->nstamps; i++)
+		seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
+	seq_printf(seq, "\n");
+	return 0;
+}
 
-	if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return;
+static struct seq_operations recent_seq_ops = {
+	.start		= recent_seq_start,
+	.next		= recent_seq_next,
+	.stop		= recent_seq_stop,
+	.show		= recent_seq_show,
+};
 
-	/* Lock the linked list while we play with it */
-	spin_lock_bh(&recent_lock);
+static int recent_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	struct seq_file *seq;
+	struct recent_iter_state *st;
+	int ret;
+
+	st = kzalloc(sizeof(*st), GFP_KERNEL);
+	if (st == NULL)
+		return -ENOMEM;
+	ret = seq_open(file, &recent_seq_ops);
+	if (ret)
+		kfree(st);
+	st->table    = pde->data;
+	seq          = file->private_data;
+	seq->private = st;
+	return ret;
+}
 
-	/* Look for an entry with this name already created */
-	/* Finds the end of the list and the entry before the end if current name does not exist */
-	last_table = NULL;
-	curr_table = r_tables;
-	if(!curr_table) { 
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n");
-#endif
+static ssize_t recent_proc_write(struct file *file, const char __user *input,
+				 size_t size, loff_t *loff)
+{
+	struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
+	struct recent_table *t = pde->data;
+	struct recent_entry *e;
+	char buf[sizeof("+255.255.255.255")], *c = buf;
+	u_int32_t addr;
+	int add;
+
+	if (size > sizeof(buf))
+		size = sizeof(buf);
+	if (copy_from_user(buf, input, size))
+		return -EFAULT;
+	while (isspace(*c))
+		c++;
+
+	if (size - (c - buf) < 5)
+		return c - buf;
+	if (!strncmp(c, "clear", 5)) {
+		c += 5;
+		spin_lock_bh(&recent_lock);
+		recent_table_flush(t);
 		spin_unlock_bh(&recent_lock);
-		return;
+		return c - buf;
 	}
-	while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
 
-	/* If a table does not exist then do nothing and return */
-	if(!curr_table) { 
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n");
-#endif
-		spin_unlock_bh(&recent_lock);
-		return;
+	switch (*c) {
+	case '-':
+		add = 0;
+		c++;
+		break;
+	case '+':
+		c++;
+	default:
+		add = 1;
+		break;
 	}
+	addr = in_aton(c);
 
-	curr_table->count--;
-
-	/* If count is still non-zero then there are still rules referenceing it so we do nothing */
-	if(curr_table->count) { 
-#ifdef DEBUG
-		if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n");
-#endif
-		spin_unlock_bh(&recent_lock);
-		return;
+	spin_lock_bh(&recent_lock);
+	e = recent_entry_lookup(t, addr, 0);
+	if (e == NULL) {
+		if (add)
+			recent_entry_init(t, addr, 0);
+	} else {
+		if (add)
+			recent_entry_update(t, e);
+		else
+			recent_entry_remove(t, e);
 	}
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n");
-#endif
-
-	/* Count must be zero so we remove this table from the list */
-	if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
-
 	spin_unlock_bh(&recent_lock);
+	return size;
+}
 
-	/* lock to make sure any late-runners still using this after we removed it from
-	 * the list finish up then remove everything */
-	spin_lock_bh(&curr_table->list_lock);
-	spin_unlock_bh(&curr_table->list_lock);
-
-#ifdef CONFIG_PROC_FS
-	if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent);
+static struct file_operations recent_fops = {
+	.open		= recent_seq_open,
+	.read		= seq_read,
+	.write		= recent_proc_write,
+	.release	= seq_release_private,
+	.owner		= THIS_MODULE,
+};
 #endif /* CONFIG_PROC_FS */
-	vfree(curr_table->table[0].last_pkts);
-	vfree(curr_table->table);
-	vfree(curr_table->hash_table);
-	vfree(curr_table->time_info);
-	vfree(curr_table);
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n");
-#endif
 
-	return;
-}
-
-/* This is the structure we pass to ipt_register to register our
- * module with iptables.
- */
 static struct ipt_match recent_match = {
 	.name		= "recent",
-	.match		= match,
+	.match		= ipt_recent_match,
 	.matchsize	= sizeof(struct ipt_recent_info),
-	.checkentry	= checkentry,
-	.destroy	= destroy,
-	.me		= THIS_MODULE
+	.checkentry	= ipt_recent_checkentry,
+	.destroy	= ipt_recent_destroy,
+	.me		= THIS_MODULE,
 };
 
-/* Kernel module initialization. */
 static int __init ipt_recent_init(void)
 {
-	int err, count;
+	int err;
 
-	printk(version);
-#ifdef CONFIG_PROC_FS
-	proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net);
-	if(!proc_net_ipt_recent) return -ENOMEM;
-#endif
-
-	if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) {
-	  printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n");
-	  ip_list_hash_size = 0;
-	}
-
-	if(!ip_list_hash_size) {
-		ip_list_hash_size = ip_list_tot*3;
-		count = 2*2;
-		while(ip_list_hash_size > count) count = count*2;
-		ip_list_hash_size = count;
-	}
-
-#ifdef DEBUG
-	if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size);
-#endif
+	if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
+		return -EINVAL;
+	ip_list_hash_size = 1 << fls(ip_list_tot);
 
 	err = ipt_register_match(&recent_match);
+#ifdef CONFIG_PROC_FS
 	if (err)
-		remove_proc_entry("ipt_recent", proc_net);
+		return err;
+	proc_dir = proc_mkdir("ipt_recent", proc_net);
+	if (proc_dir == NULL) {
+		ipt_unregister_match(&recent_match);
+		err = -ENOMEM;
+	}
+#endif
 	return err;
 }
 
-/* Kernel module destruction. */
-static void __exit ipt_recent_fini(void)
+static void __exit ipt_recent_exit(void)
 {
+	BUG_ON(!list_empty(&tables));
 	ipt_unregister_match(&recent_match);
-
-	remove_proc_entry("ipt_recent",proc_net);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("ipt_recent", proc_net);
+#endif
 }
 
-/* Register our module with the kernel. */
 module_init(ipt_recent_init);
-module_exit(ipt_recent_fini);
+module_exit(ipt_recent_exit);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5bc9f64d7b5b..8cc8e1b36778 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum,
 
 	/* This is where we call the helper: as the packet goes out. */
 	ct = nf_ct_get(*pskb, &ctinfo);
-	if (!ct)
+	if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
 		return NF_ACCEPT;
 
 	help = nfct_help(ct);
@@ -348,6 +348,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 			.tuple.dst.u.tcp.port;
 		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
 			.tuple.dst.u3.ip;
+		memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
 
 		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
 		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4b0d361cc6e6..663a73ee3f2f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {
 		if (LOG_INVALID(IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index fc2562415555..bd221ec3f81e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk)
 }
 
 struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
-			     unsigned long raddr, unsigned long laddr,
+			     __be32 raddr, __be32 laddr,
 			     int dif)
 {
 	struct hlist_node *node;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3adfcf00..ce4cd5f35511 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -182,14 +182,6 @@ ctl_table ipv4_table[] = {
 		.strategy	= &ipv4_doint_and_flush_strategy,
 	},
         {
-		.ctl_name	= NET_IPV4_AUTOCONFIG,
-		.procname	= "ip_autoconfig",
-		.data		= &ipv4_config.autoconfig,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-        {
 		.ctl_name	= NET_IPV4_NO_PMTU_DISC,
 		.procname	= "ip_no_pmtu_disc",
 		.data		= &ipv4_config.no_pmtu_disc,
@@ -688,6 +680,24 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#ifdef CONFIG_NET_DMA
+	{
+		.ctl_name	= NET_TCP_DMA_COPYBREAK,
+		.procname	= "tcp_dma_copybreak",
+		.data		= &sysctl_tcp_dma_copybreak,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
+	{
+		.ctl_name	= NET_TCP_SLOW_START_AFTER_IDLE,
+		.procname	= "tcp_slow_start_after_idle",
+		.data		= &sysctl_tcp_slow_start_after_idle,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e2b7b8055037..74998f250071 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -263,7 +263,7 @@
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
-
+#include <net/netdma.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 	ssize_t res;
 	struct sock *sk = sock->sk;
 
-#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
-
 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
-	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
+	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
 		return sock_no_sendpage(sock, page, offset, size, flags);
 
-#undef TCP_ZC_CSUM_FLAGS
-
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
@@ -726,9 +722,7 @@ new_segment:
 				/*
 				 * Check whether we can use HW checksum.
 				 */
-				if (sk->sk_route_caps &
-				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
-				     NETIF_F_HW_CSUM))
+				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
 					skb->ip_summed = CHECKSUM_HW;
 
 				skb_entail(sk, tp, skb);
@@ -937,7 +931,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo,
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time_to_ack = 0;
@@ -1072,11 +1066,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				break;
 		}
 		if (skb->h.th->fin) {
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 			++seq;
 			break;
 		}
-		sk_eat_skb(sk, skb);
+		sk_eat_skb(sk, skb, 0);
 		if (!desc->count)
 			break;
 	}
@@ -1086,7 +1080,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 	/* Clean up data we have read: This will do ACK frames. */
 	if (copied)
-		cleanup_rbuf(sk, copied);
+		tcp_cleanup_rbuf(sk, copied);
 	return copied;
 }
 
@@ -1110,6 +1104,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	int target;		/* Read at least this many bytes */
 	long timeo;
 	struct task_struct *user_recv = NULL;
+	int copied_early = 0;
 
 	lock_sock(sk);
 
@@ -1133,6 +1128,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
+#ifdef CONFIG_NET_DMA
+	tp->ucopy.dma_chan = NULL;
+	preempt_disable();
+	if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
+	    !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) {
+		preempt_enable_no_resched();
+		tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
+	} else
+		preempt_enable_no_resched();
+#endif
+
 	do {
 		struct sk_buff *skb;
 		u32 offset;
@@ -1220,7 +1226,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			}
 		}
 
-		cleanup_rbuf(sk, copied);
+		tcp_cleanup_rbuf(sk, copied);
 
 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
 			/* Install new reader */
@@ -1274,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		} else
 			sk_wait_data(sk, &timeo);
 
+#ifdef CONFIG_NET_DMA
+		tp->ucopy.wakeup = 0;
+#endif
+
 		if (user_recv) {
 			int chunk;
 
@@ -1329,13 +1339,39 @@ do_prequeue:
 		}
 
 		if (!(flags & MSG_TRUNC)) {
-			err = skb_copy_datagram_iovec(skb, offset,
-						      msg->msg_iov, used);
-			if (err) {
-				/* Exception. Bailout! */
-				if (!copied)
-					copied = -EFAULT;
-				break;
+#ifdef CONFIG_NET_DMA
+			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+				tp->ucopy.dma_chan = get_softnet_dma();
+
+			if (tp->ucopy.dma_chan) {
+				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
+					tp->ucopy.dma_chan, skb, offset,
+					msg->msg_iov, used,
+					tp->ucopy.pinned_list);
+
+				if (tp->ucopy.dma_cookie < 0) {
+
+					printk(KERN_ALERT "dma_cookie < 0\n");
+
+					/* Exception. Bailout! */
+					if (!copied)
+						copied = -EFAULT;
+					break;
+				}
+				if ((offset + used) == skb->len)
+					copied_early = 1;
+
+			} else
+#endif
+			{
+				err = skb_copy_datagram_iovec(skb, offset,
+						msg->msg_iov, used);
+				if (err) {
+					/* Exception. Bailout! */
+					if (!copied)
+						copied = -EFAULT;
+					break;
+				}
 			}
 		}
 
@@ -1355,15 +1391,19 @@ skip_copy:
 
 		if (skb->h.th->fin)
 			goto found_fin_ok;
-		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+		if (!(flags & MSG_PEEK)) {
+			sk_eat_skb(sk, skb, copied_early);
+			copied_early = 0;
+		}
 		continue;
 
 	found_fin_ok:
 		/* Process the FIN. */
 		++*seq;
-		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+		if (!(flags & MSG_PEEK)) {
+			sk_eat_skb(sk, skb, copied_early);
+			copied_early = 0;
+		}
 		break;
 	} while (len > 0);
 
@@ -1386,12 +1426,42 @@ skip_copy:
 		tp->ucopy.len = 0;
 	}
 
+#ifdef CONFIG_NET_DMA
+	if (tp->ucopy.dma_chan) {
+		struct sk_buff *skb;
+		dma_cookie_t done, used;
+
+		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+		                                 tp->ucopy.dma_cookie, &done,
+		                                 &used) == DMA_IN_PROGRESS) {
+			/* do partial cleanup of sk_async_wait_queue */
+			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+			       (dma_async_is_complete(skb->dma_cookie, done,
+			                              used) == DMA_SUCCESS)) {
+				__skb_dequeue(&sk->sk_async_wait_queue);
+				kfree_skb(skb);
+			}
+		}
+
+		/* Safe to free early-copied skbs now */
+		__skb_queue_purge(&sk->sk_async_wait_queue);
+		dma_chan_put(tp->ucopy.dma_chan);
+		tp->ucopy.dma_chan = NULL;
+	}
+	if (tp->ucopy.pinned_list) {
+		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
+		tp->ucopy.pinned_list = NULL;
+	}
+#endif
+
 	/* According to UNIX98, msg_name/msg_namelen are ignored
 	 * on connected socket. I was just happy when found this 8) --ANK
 	 */
 
 	/* Clean up data we have read: This will do ACK frames. */
-	cleanup_rbuf(sk, copied);
+	tcp_cleanup_rbuf(sk, copied);
 
 	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
@@ -1658,6 +1728,9 @@ int tcp_disconnect(struct sock *sk, int flags)
 	__skb_queue_purge(&sk->sk_receive_queue);
 	sk_stream_writequeue_purge(sk);
 	__skb_queue_purge(&tp->out_of_order_queue);
+#ifdef CONFIG_NET_DMA
+	__skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
 
 	inet->dport = 0;
 
@@ -1858,7 +1931,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
 			    inet_csk_ack_scheduled(sk)) {
 				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
-				cleanup_rbuf(sk, 1);
+				tcp_cleanup_rbuf(sk, 1);
 				if (!(val & 1))
 					icsk->icsk_ack.pingpong = 1;
 			}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 035f2092d73a..b2d9021ad22b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
 	return max(tp->snd_cwnd, ca->last_max_cwnd);
 }
 
-static u32 bictcp_min_cwnd(struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh;
-}
-
 static void bictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss)
@@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = {
 	.cong_avoid	= bictcp_cong_avoid,
 	.set_state	= bictcp_state,
 	.undo_cwnd	= bictcp_undo_cwnd,
-	.min_cwnd	= bictcp_min_cwnd,
 	.pkts_acked     = bictcp_acked,
 	.owner		= THIS_MODULE,
 	.name		= "bic",
diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c
new file mode 100644
index 000000000000..bc54f7e9aea9
--- /dev/null
+++ b/net/ipv4/tcp_compound.c
@@ -0,0 +1,448 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ *
+ *
+ *   TCP Compound based on TCP Vegas
+ *
+ *   further details can be found here:
+ *      ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+
+#define TCP_COMPOUND_ALPHA          3U
+#define TCP_COMPOUND_BETA           1U
+#define TCP_COMPOUND_GAMMA         30
+#define TCP_COMPOUND_ZETA           1
+
+/* TCP compound variables */
+struct compound {
+	u32 beg_snd_nxt;	/* right edge during last RTT */
+	u32 beg_snd_una;	/* left edge  during last RTT */
+	u32 beg_snd_cwnd;	/* saves the size of the cwnd */
+	u8 doing_vegas_now;	/* if true, do vegas for this RTT */
+	u16 cntRTT;		/* # of RTTs measured within last RTT */
+	u32 minRTT;		/* min of RTTs measured within last RTT (in usec) */
+	u32 baseRTT;		/* the min of all Vegas RTT measurements seen (in usec) */
+
+	u32 cwnd;
+	u32 dwnd;
+};
+
+/* There are several situations when we must "re-start" Vegas:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static inline void vegas_enable(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct compound *vegas = inet_csk_ca(sk);
+
+	/* Begin taking Vegas samples next time we send something. */
+	vegas->doing_vegas_now = 1;
+
+	/* Set the beginning of the next send window. */
+	vegas->beg_snd_nxt = tp->snd_nxt;
+
+	vegas->cntRTT = 0;
+	vegas->minRTT = 0x7fffffff;
+}
+
+/* Stop taking Vegas samples for now. */
+static inline void vegas_disable(struct sock *sk)
+{
+	struct compound *vegas = inet_csk_ca(sk);
+
+	vegas->doing_vegas_now = 0;
+}
+
+static void tcp_compound_init(struct sock *sk)
+{
+	struct compound *vegas = inet_csk_ca(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	vegas->baseRTT = 0x7fffffff;
+	vegas_enable(sk);
+
+	vegas->dwnd = 0;
+	vegas->cwnd = tp->snd_cwnd;
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt)
+{
+	struct compound *vegas = inet_csk_ca(sk);
+	u32 vrtt = usrtt + 1;	/* Never allow zero rtt or baseRTT */
+
+	/* Filter to find propagation delay: */
+	if (vrtt < vegas->baseRTT)
+		vegas->baseRTT = vrtt;
+
+	/* Find the min RTT during the last RTT to find
+	 * the current prop. delay + queuing delay:
+	 */
+
+	vegas->minRTT = min(vegas->minRTT, vrtt);
+	vegas->cntRTT++;
+}
+
+static void tcp_compound_state(struct sock *sk, u8 ca_state)
+{
+
+	if (ca_state == TCP_CA_Open)
+		vegas_enable(sk);
+	else
+		vegas_disable(sk);
+}
+
+
+/* 64bit divisor, dividend and result. dynamic precision */
+static inline u64 div64_64(u64 dividend, u64 divisor)
+{
+	u32 d = divisor;
+
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	/* avoid 64 bit division if possible */
+	if (dividend >> 32)
+		do_div(dividend, d);
+	else
+		dividend = (u32) dividend / d;
+
+	return dividend;
+}
+
+/* calculate the quartic root of "a" using Newton-Raphson */
+static u32 qroot(u64 a)
+{
+	u32 x, x1;
+
+	/* Initial estimate is based on:
+	 * qrt(x) = exp(log(x) / 4)
+	 */
+	x = 1u << (fls64(a) >> 2);
+
+	/*
+	 * Iteration based on:
+	 *                         3
+	 * x    = ( 3 * x  +  a / x  ) / 4
+	 *  k+1          k         k
+	 */
+	do {
+		u64 x3 = x;
+
+		x1 = x;
+		x3 *= x;
+		x3 *= x;
+
+		x = (3 * x + (u32) div64_64(a, x3)) / 4;
+	} while (abs(x1 - x) > 1);
+
+	return x;
+}
+
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples.  So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START)
+		tcp_compound_init(sk);
+}
+
+static void tcp_compound_cong_avoid(struct sock *sk, u32 ack,
+				    u32 seq_rtt, u32 in_flight, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct compound *vegas = inet_csk_ca(sk);
+	u8 inc = 0;
+
+	if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) {
+		if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) {
+			vegas->cwnd = tp->snd_cwnd;
+			vegas->dwnd = 0;
+		} else
+			vegas->cwnd = tp->snd_cwnd - vegas->dwnd;
+
+	}
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	if (vegas->cwnd <= tp->snd_ssthresh)
+		inc = 1;
+	else if (tp->snd_cwnd_cnt < tp->snd_cwnd)
+		tp->snd_cwnd_cnt++;
+
+	if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+		inc = 1;
+		tp->snd_cwnd_cnt = 0;
+	}
+
+	if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp)
+		vegas->cwnd++;
+
+	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
+	 *
+	 * These are so named because they represent the approximate values
+	 * of snd_una and snd_nxt at the beginning of the current RTT. More
+	 * precisely, they represent the amount of data sent during the RTT.
+	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+	 * bytes of data have been ACKed during the course of the RTT, giving
+	 * an "actual" rate of:
+	 *
+	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+	 *
+	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+	 * because delayed ACKs can cover more than one segment, so they
+	 * don't line up nicely with the boundaries of RTTs.
+	 *
+	 * Another unfortunate fact of life is that delayed ACKs delay the
+	 * advance of the left edge of our send window, so that the number
+	 * of bytes we send in an RTT is often less than our cwnd will allow.
+	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+	 */
+
+	if (after(ack, vegas->beg_snd_nxt)) {
+		/* Do the Vegas once-per-RTT cwnd adjustment. */
+		u32 old_wnd, old_snd_cwnd;
+
+		/* Here old_wnd is essentially the window of data that was
+		 * sent during the previous RTT, and has all
+		 * been acknowledged in the course of the RTT that ended
+		 * with the ACK we just received. Likewise, old_snd_cwnd
+		 * is the cwnd during the previous RTT.
+		 */
+		if (!tp->mss_cache)
+			return;
+
+		old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
+		    tp->mss_cache;
+		old_snd_cwnd = vegas->beg_snd_cwnd;
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		vegas->beg_snd_una = vegas->beg_snd_nxt;
+		vegas->beg_snd_nxt = tp->snd_nxt;
+		vegas->beg_snd_cwnd = tp->snd_cwnd;
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If  we have 3 samples, we should be OK.
+		 */
+
+		if (vegas->cntRTT > 2) {
+			u32 rtt, target_cwnd, diff;
+			u32 brtt, dwnd;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * algorithm, we determine if we should increase or
+			 * decrease cwnd, and by how much.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = vegas->minRTT;
+
+			/* Calculate the cwnd we should have, if we weren't
+			 * going too fast.
+			 *
+			 * This is:
+			 *     (actual rate in segments) * baseRTT
+			 * We keep it as a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary point.
+			 */
+			if (!rtt)
+				return;
+
+			brtt = vegas->baseRTT;
+			target_cwnd = ((old_wnd * brtt)
+				       << V_PARAM_SHIFT) / rtt;
+
+			/* Calculate the difference between the window we had,
+			 * and the window we would like to have. This quantity
+			 * is the "Diff" from the Arizona Vegas papers.
+			 *
+			 * Again, this is a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary
+			 * point.
+			 */
+
+			diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+
+			dwnd = vegas->dwnd;
+
+			if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) {
+				u64 v;
+				u32 x;
+
+				/*
+				 * The TCP Compound paper describes the choice
+				 * of "k" determines the agressiveness,
+				 * ie. slope of the response function.
+				 *
+				 * For same value as HSTCP would be 0.8
+				 * but for computaional reasons, both the
+				 * original authors and this implementation
+				 * use 0.75.
+				 */
+				v = old_wnd;
+				x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA;
+				if (x > 1)
+					dwnd = x - 1;
+				else
+					dwnd = 0;
+
+				dwnd += vegas->dwnd;
+
+			} else if ((dwnd << V_PARAM_SHIFT) <
+				   (diff * TCP_COMPOUND_BETA))
+				dwnd = 0;
+			else
+				dwnd =
+				    ((dwnd << V_PARAM_SHIFT) -
+				     (diff *
+				      TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT;
+
+			vegas->dwnd = dwnd;
+
+		}
+
+		/* Wipe the slate clean for the next RTT. */
+		vegas->cntRTT = 0;
+		vegas->minRTT = 0x7fffffff;
+	}
+
+	tp->snd_cwnd = vegas->cwnd + vegas->dwnd;
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+{
+	const struct compound *ca = inet_csk_ca(sk);
+	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+		struct tcpvegas_info *info;
+
+		info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
+					  sizeof(*info)));
+
+		info->tcpv_enabled = ca->doing_vegas_now;
+		info->tcpv_rttcnt = ca->cntRTT;
+		info->tcpv_rtt = ca->baseRTT;
+		info->tcpv_minrtt = ca->minRTT;
+	rtattr_failure:;
+	}
+}
+
+static struct tcp_congestion_ops tcp_compound = {
+	.init		= tcp_compound_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_compound_cong_avoid,
+	.rtt_sample	= tcp_compound_rtt_calc,
+	.set_state	= tcp_compound_state,
+	.cwnd_event	= tcp_compound_cwnd_event,
+	.get_info	= tcp_compound_get_info,
+
+	.owner		= THIS_MODULE,
+	.name		= "compound",
+};
+
+static int __init tcp_compound_register(void)
+{
+	BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE);
+	tcp_register_congestion_control(&tcp_compound);
+	return 0;
+}
+
+static void __exit tcp_compound_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_compound);
+}
+
+module_init(tcp_compound_register);
+module_exit(tcp_compound_unregister);
+
+MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Compound");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 91c2f41c7f58..857eefc52aab 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 	int ret = 0;
 
 	/* all algorithms must implement ssthresh and cong_avoid ops */
-	if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
+	if (!ca->ssthresh || !ca->cong_avoid) {
 		printk(KERN_ERR "TCP %s does not implement required ops\n",
 		       ca->name);
 		return -EINVAL;
@@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
 
-/* Lower bound on congestion window. */
-u32 tcp_reno_min_cwnd(struct sock *sk)
+/* Lower bound on congestion window with halving. */
+u32 tcp_reno_min_cwnd(const struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	return tp->snd_ssthresh/2;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 31a4986dfbf7..78b7a6b9e4de 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
 	return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
 }
 
-static u32 bictcp_min_cwnd(struct sock *sk)
-{
-	return tcp_sk(sk)->snd_ssthresh;
-}
-
 static void bictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss)
@@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = {
 	.cong_avoid	= bictcp_cong_avoid,
 	.set_state	= bictcp_state,
 	.undo_cwnd	= bictcp_undo_cwnd,
-	.min_cwnd	= bictcp_min_cwnd,
 	.pkts_acked     = bictcp_acked,
 	.owner		= THIS_MODULE,
 	.name		= "cubic",
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index b72fa55dfb84..1120245b2373 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,6 +98,10 @@ struct hstcp {
 	u32	ai;
 };
 
+static int max_ssthresh = 100;
+module_param(max_ssthresh, int, 0644);
+MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)");
+
 static void hstcp_init(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
 	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp);
-	else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		/* RFC3742: limited slow start
+		 * the window is increased by 1/K MSS for each arriving ACK,
+		 * for K = int(cwnd/(0.5 max_ssthresh))
+		 */
+		if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) {
+			u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U);
+			if (++tp->snd_cwnd_cnt >= k) {
+				if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+					tp->snd_cwnd++;
+				tp->snd_cwnd_cnt = 0;
+			}
+		} else {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+		}
+	} else {
 		/* Update AIMD parameters */
 		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
 			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
@@ -135,7 +153,8 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
 
 		/* Do additive increase */
 		if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
-			tp->snd_cwnd_cnt += ca->ai;
+			/* cwnd = cwnd + a(w) / cwnd */
+			tp->snd_cwnd_cnt += ca->ai + 1;
 			if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
 				tp->snd_cwnd_cnt -= tp->snd_cwnd;
 				tp->snd_cwnd++;
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 1b2ff53f98ed..3d92c1859267 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 	}
 }
 
-/* Lower bound on congestion window. */
-static u32 htcp_min_cwnd(struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh;
-}
-
-
 static void htcp_init(struct sock *sk)
 {
 	struct htcp *ca = inet_csk_ca(sk);
@@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state)
 static struct tcp_congestion_ops htcp = {
 	.init		= htcp_init,
 	.ssthresh	= htcp_recalc_ssthresh,
-	.min_cwnd	= htcp_min_cwnd,
 	.cong_avoid	= htcp_cong_avoid,
 	.set_state	= htcp_state,
 	.undo_cwnd	= htcp_cwnd_undo,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4a538bc1683d..e08245bdda3a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -71,6 +71,7 @@
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
+#include <net/netdma.h>
 
 int sysctl_tcp_timestamps = 1;
 int sysctl_tcp_window_scaling = 1;
@@ -1649,7 +1650,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
 	 * Hence, we can detect timed out packets during fast
 	 * retransmit without falling to slow start.
 	 */
-	if (tcp_head_timedout(sk, tp)) {
+	if (!IsReno(tp) && tcp_head_timedout(sk, tp)) {
 		struct sk_buff *skb;
 
 		skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
@@ -1662,8 +1663,6 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
 			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
 				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 				tp->lost_out += tcp_skb_pcount(skb);
-				if (IsReno(tp))
-					tcp_remove_reno_sacks(sk, tp, tcp_skb_pcount(skb) + 1);
 
 				/* clear xmit_retrans hint */
 				if (tp->retransmit_skb_hint &&
@@ -1690,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+/* Lower bound on congestion window is slow start threshold
+ * unless congestion avoidance choice decides to overide it.
+ */
+static inline u32 tcp_cwnd_min(const struct sock *sk)
+{
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
+	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
+}
+
 /* Decrease cwnd each second ack. */
 static void tcp_cwnd_down(struct sock *sk)
 {
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int decr = tp->snd_cwnd_cnt + 1;
 
 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;
 
-	if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
+	if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
 		tp->snd_cwnd -= decr;
 
 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -3787,6 +3795,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk
 		__tcp_checksum_complete_user(sk, skb);
 }
 
+#ifdef CONFIG_NET_DMA
+static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int chunk = skb->len - hlen;
+	int dma_cookie;
+	int copied_early = 0;
+
+	if (tp->ucopy.wakeup)
+          	return 0;
+
+	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+		tp->ucopy.dma_chan = get_softnet_dma();
+
+	if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
+
+		dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
+			skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
+
+		if (dma_cookie < 0)
+			goto out;
+
+		tp->ucopy.dma_cookie = dma_cookie;
+		copied_early = 1;
+
+		tp->ucopy.len -= chunk;
+		tp->copied_seq += chunk;
+		tcp_rcv_space_adjust(sk);
+
+		if ((tp->ucopy.len == 0) ||
+		    (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
+		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
+			tp->ucopy.wakeup = 1;
+			sk->sk_data_ready(sk, 0);
+		}
+	} else if (chunk > 0) {
+		tp->ucopy.wakeup = 1;
+		sk->sk_data_ready(sk, 0);
+	}
+out:
+	return copied_early;
+}
+#endif /* CONFIG_NET_DMA */
+
 /*
  *	TCP receive function for the ESTABLISHED state. 
  *
@@ -3888,8 +3940,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				    tp->rcv_nxt == tp->rcv_wup)
 					tcp_store_ts_recent(tp);
 
-				tcp_rcv_rtt_measure_ts(sk, skb);
-
 				/* We know that such packets are checksummed
 				 * on entry.
 				 */
@@ -3903,14 +3953,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			}
 		} else {
 			int eaten = 0;
+			int copied_early = 0;
 
-			if (tp->ucopy.task == current &&
-			    tp->copied_seq == tp->rcv_nxt &&
-			    len - tcp_header_len <= tp->ucopy.len &&
-			    sock_owned_by_user(sk)) {
-				__set_current_state(TASK_RUNNING);
+			if (tp->copied_seq == tp->rcv_nxt &&
+			    len - tcp_header_len <= tp->ucopy.len) {
+#ifdef CONFIG_NET_DMA
+				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
+					copied_early = 1;
+					eaten = 1;
+				}
+#endif
+				if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
+					__set_current_state(TASK_RUNNING);
 
-				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+					if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
+						eaten = 1;
+				}
+				if (eaten) {
 					/* Predicted packet is in window by definition.
 					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 					 * Hence, check seq<=rcv_wup reduces to:
@@ -3926,8 +3985,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 					__skb_pull(skb, tcp_header_len);
 					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 					NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
-					eaten = 1;
 				}
+				if (copied_early)
+					tcp_cleanup_rbuf(sk, skb->len);
 			}
 			if (!eaten) {
 				if (tcp_checksum_complete_user(sk, skb))
@@ -3968,6 +4028,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
 			__tcp_ack_snd_check(sk, 0);
 no_ack:
+#ifdef CONFIG_NET_DMA
+			if (copied_early)
+				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
+			else
+#endif
 			if (eaten)
 				__kfree_skb(skb);
 			else
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 672950e54c49..25ecc6e2478b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -71,6 +71,7 @@
 #include <net/inet_common.h>
 #include <net/timewait_sock.h>
 #include <net/xfrm.h>
+#include <net/netdma.h>
 
 #include <linux/inet.h>
 #include <linux/ipv6.h>
@@ -1091,8 +1092,18 @@ process:
 	bh_lock_sock(sk);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-		if (!tcp_prequeue(sk, skb))
+#ifdef CONFIG_NET_DMA
+		struct tcp_sock *tp = tcp_sk(sk);
+		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+			tp->ucopy.dma_chan = get_softnet_dma();
+		if (tp->ucopy.dma_chan)
 			ret = tcp_v4_do_rcv(sk, skb);
+		else
+#endif
+		{
+			if (!tcp_prequeue(sk, skb))
+			ret = tcp_v4_do_rcv(sk, skb);
+		}
 	} else
 		sk_add_backlog(sk, skb);
 	bh_unlock_sock(sk);
@@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk)
 	/* Cleans up our, hopefully empty, out_of_order_queue. */
   	__skb_queue_purge(&tp->out_of_order_queue);
 
+#ifdef CONFIG_NET_DMA
+	/* Cleans up our sk_async_wait_queue */
+  	__skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
+
 	/* Clean prequeue, it must be empty really */
 	__skb_queue_purge(&tp->ucopy.prequeue);
 
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
new file mode 100644
index 000000000000..1f977b6ee9a1
--- /dev/null
+++ b/net/ipv4/tcp_lp.c
@@ -0,0 +1,338 @@
+/*
+ * TCP Low Priority (TCP-LP)
+ *
+ * TCP Low Priority is a distributed algorithm whose goal is to utilize only
+ *   the excess network bandwidth as compared to the ``fair share`` of
+ *   bandwidth as targeted by TCP. Available from:
+ *     http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ *
+ * Original Author:
+ *   Aleksandar Kuzmanovic <akuzma@northwestern.edu>
+ *
+ * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation.
+ * As of 2.6.13, Linux supports pluggable congestion control algorithms.
+ * Due to the limitation of the API, we take the following changes from
+ * the original TCP-LP implementation:
+ *   o We use newReno in most core CA handling. Only add some checking
+ *     within cong_avoid.
+ *   o Error correcting in remote HZ, therefore remote HZ will be keeped
+ *     on checking and updating.
+ *   o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne
+ *     OWD have a similar meaning as RTT. Also correct the buggy formular.
+ *   o Handle reaction for Early Congestion Indication (ECI) within
+ *     pkts_acked, as mentioned within pseudo code.
+ *   o OWD is handled in relative format, where local time stamp will in
+ *     tcp_time_stamp format.
+ *
+ * Port from 2.4.19 to 2.6.16 as module by:
+ *   Wong Hoi Sing Edison <hswong3i@gmail.com>
+ *   Hung Hing Lun <hlhung3i@gmail.com>
+ *
+ * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* resolution of owd */
+#define LP_RESOL       1000
+
+/**
+ * enum tcp_lp_state
+ * @LP_VALID_RHZ: is remote HZ valid?
+ * @LP_VALID_OWD: is OWD valid?
+ * @LP_WITHIN_THR: are we within threshold?
+ * @LP_WITHIN_INF: are we within inference?
+ *
+ * TCP-LP's state flags.
+ * We create this set of state flag mainly for debugging.
+ */
+enum tcp_lp_state {
+	LP_VALID_RHZ = (1 << 0),
+	LP_VALID_OWD = (1 << 1),
+	LP_WITHIN_THR = (1 << 3),
+	LP_WITHIN_INF = (1 << 4),
+};
+
+/**
+ * struct lp
+ * @flag: TCP-LP state flag
+ * @sowd: smoothed OWD << 3
+ * @owd_min: min OWD
+ * @owd_max: max OWD
+ * @owd_max_rsv: resrved max owd
+ * @remote_hz: estimated remote HZ
+ * @remote_ref_time: remote reference time
+ * @local_ref_time: local reference time
+ * @last_drop: time for last active drop
+ * @inference: current inference
+ *
+ * TCP-LP's private struct.
+ * We get the idea from original TCP-LP implementation where only left those we
+ * found are really useful.
+ */
+struct lp {
+	u32 flag;
+	u32 sowd;
+	u32 owd_min;
+	u32 owd_max;
+	u32 owd_max_rsv;
+	u32 remote_hz;
+	u32 remote_ref_time;
+	u32 local_ref_time;
+	u32 last_drop;
+	u32 inference;
+};
+
+/**
+ * tcp_lp_init
+ *
+ * Init all required variables.
+ * Clone the handling from Vegas module implementation.
+ */
+static void tcp_lp_init(struct sock *sk)
+{
+	struct lp *lp = inet_csk_ca(sk);
+
+	lp->flag = 0;
+	lp->sowd = 0;
+	lp->owd_min = 0xffffffff;
+	lp->owd_max = 0;
+	lp->owd_max_rsv = 0;
+	lp->remote_hz = 0;
+	lp->remote_ref_time = 0;
+	lp->local_ref_time = 0;
+	lp->last_drop = 0;
+	lp->inference = 0;
+}
+
+/**
+ * tcp_lp_cong_avoid
+ *
+ * Implementation of cong_avoid.
+ * Will only call newReno CA when away from inference.
+ * From TCP-LP's paper, this will be handled in additive increasement.
+ */
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
+			      int flag)
+{
+	struct lp *lp = inet_csk_ca(sk);
+
+	if (!(lp->flag & LP_WITHIN_INF))
+		tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
+}
+
+/**
+ * tcp_lp_remote_hz_estimator
+ *
+ * Estimate remote HZ.
+ * We keep on updating the estimated value, where original TCP-LP
+ * implementation only guest it for once and use forever.
+ */
+static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct lp *lp = inet_csk_ca(sk);
+	s64 rhz = lp->remote_hz << 6;	/* remote HZ << 6 */
+	s64 m = 0;
+
+	/* not yet record reference time
+	 * go away!! record it before come back!! */
+	if (lp->remote_ref_time == 0 || lp->local_ref_time == 0)
+		goto out;
+
+	/* we can't calc remote HZ with no different!! */
+	if (tp->rx_opt.rcv_tsval == lp->remote_ref_time
+	    || tp->rx_opt.rcv_tsecr == lp->local_ref_time)
+		goto out;
+
+	m = HZ * (tp->rx_opt.rcv_tsval -
+		  lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr -
+					  lp->local_ref_time);
+	if (m < 0)
+		m = -m;
+
+	if (rhz != 0) {
+		m -= rhz >> 6;	/* m is now error in remote HZ est */
+		rhz += m;	/* 63/64 old + 1/64 new */
+	} else
+		rhz = m << 6;
+
+	/* record time for successful remote HZ calc */
+	lp->flag |= LP_VALID_RHZ;
+
+ out:
+	/* record reference time stamp */
+	lp->remote_ref_time = tp->rx_opt.rcv_tsval;
+	lp->local_ref_time = tp->rx_opt.rcv_tsecr;
+
+	return rhz >> 6;
+}
+
+/**
+ * tcp_lp_owd_calculator
+ *
+ * Calculate one way delay (in relative format).
+ * Original implement OWD as minus of remote time difference to local time
+ * difference directly. As this time difference just simply equal to RTT, when
+ * the network status is stable, remote RTT will equal to local RTT, and result
+ * OWD into zero.
+ * It seems to be a bug and so we fixed it.
+ */
+static u32 tcp_lp_owd_calculator(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct lp *lp = inet_csk_ca(sk);
+	s64 owd = 0;
+
+	lp->remote_hz = tcp_lp_remote_hz_estimator(sk);
+
+	if (lp->flag & LP_VALID_RHZ) {
+		owd =
+		    tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
+		    tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ);
+		if (owd < 0)
+			owd = -owd;
+	}
+
+	if (owd > 0)
+		lp->flag |= LP_VALID_OWD;
+	else
+		lp->flag &= ~LP_VALID_OWD;
+
+	return owd;
+}
+
+/**
+ * tcp_lp_rtt_sample
+ *
+ * Implementation or rtt_sample.
+ * Will take the following action,
+ *   1. calc OWD,
+ *   2. record the min/max OWD,
+ *   3. calc smoothed OWD (SOWD).
+ * Most ideas come from the original TCP-LP implementation.
+ */
+static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt)
+{
+	struct lp *lp = inet_csk_ca(sk);
+	s64 mowd = tcp_lp_owd_calculator(sk);
+
+	/* sorry that we don't have valid data */
+	if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD))
+		return;
+
+	/* record the next min owd */
+	if (mowd < lp->owd_min)
+		lp->owd_min = mowd;
+
+	/* always forget the max of the max
+	 * we just set owd_max as one below it */
+	if (mowd > lp->owd_max) {
+		if (mowd > lp->owd_max_rsv) {
+			if (lp->owd_max_rsv == 0)
+				lp->owd_max = mowd;
+			else
+				lp->owd_max = lp->owd_max_rsv;
+			lp->owd_max_rsv = mowd;
+		} else
+			lp->owd_max = mowd;
+	}
+
+	/* calc for smoothed owd */
+	if (lp->sowd != 0) {
+		mowd -= lp->sowd >> 3;	/* m is now error in owd est */
+		lp->sowd += mowd;	/* owd = 7/8 owd + 1/8 new */
+	} else
+		lp->sowd = mowd << 3;	/* take the measured time be owd */
+}
+
+/**
+ * tcp_lp_pkts_acked
+ *
+ * Implementation of pkts_acked.
+ * Deal with active drop under Early Congestion Indication.
+ * Only drop to half and 1 will be handle, because we hope to use back
+ * newReno in increase case.
+ * We work it out by following the idea from TCP-LP's paper directly
+ */
+static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct lp *lp = inet_csk_ca(sk);
+
+	/* calc inference */
+	if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
+		lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr);
+
+	/* test if within inference */
+	if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference))
+		lp->flag |= LP_WITHIN_INF;
+	else
+		lp->flag &= ~LP_WITHIN_INF;
+
+	/* test if within threshold */
+	if (lp->sowd >> 3 <
+	    lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100)
+		lp->flag |= LP_WITHIN_THR;
+	else
+		lp->flag &= ~LP_WITHIN_THR;
+
+	pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
+		 tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
+		 lp->sowd >> 3);
+
+	if (lp->flag & LP_WITHIN_THR)
+		return;
+
+	/* FIXME: try to reset owd_min and owd_max here
+	 * so decrease the chance the min/max is no longer suitable
+	 * and will usually within threshold when whithin inference */
+	lp->owd_min = lp->sowd >> 3;
+	lp->owd_max = lp->sowd >> 2;
+	lp->owd_max_rsv = lp->sowd >> 2;
+
+	/* happened within inference
+	 * drop snd_cwnd into 1 */
+	if (lp->flag & LP_WITHIN_INF)
+		tp->snd_cwnd = 1U;
+
+	/* happened after inference
+	 * cut snd_cwnd into half */
+	else
+		tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
+
+	/* record this drop time */
+	lp->last_drop = tcp_time_stamp;
+}
+
+static struct tcp_congestion_ops tcp_lp = {
+	.init = tcp_lp_init,
+	.ssthresh = tcp_reno_ssthresh,
+	.cong_avoid = tcp_lp_cong_avoid,
+	.min_cwnd = tcp_reno_min_cwnd,
+	.rtt_sample = tcp_lp_rtt_sample,
+	.pkts_acked = tcp_lp_pkts_acked,
+
+	.owner = THIS_MODULE,
+	.name = "lp"
+};
+
+static int __init tcp_lp_register(void)
+{
+	BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_lp);
+}
+
+static void __exit tcp_lp_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_lp);
+}
+
+module_init(tcp_lp_register);
+module_exit(tcp_lp_unregister);
+
+MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Low Priority");
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 743016baa048..07bb5a2b375e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3;
 int sysctl_tcp_mtu_probing = 0;
 int sysctl_tcp_base_mss = 512;
 
+/* By default, RFC2861 behavior.  */
+int sysctl_tcp_slow_start_after_idle = 1;
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 			     struct sk_buff *skb)
 {
@@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const u32 now = tcp_time_stamp;
 
-	if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+	if (sysctl_tcp_slow_start_after_idle &&
+	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
 		tcp_cwnd_restart(sk, __sk_dst_get(sk));
 
 	tp->lsndtime = now;
@@ -642,7 +646,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
  * eventually). The difference is that pulled data not copied, but
  * immediately discarded.
  */
-static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
+static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
 	int i, k, eat;
 
@@ -667,7 +671,6 @@ static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
 	skb->tail = skb->data;
 	skb->data_len -= len;
 	skb->len = skb->data_len;
-	return skb->tail;
 }
 
 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
@@ -676,12 +679,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 		return -ENOMEM;
 
-	if (len <= skb_headlen(skb)) {
+	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
+	if (unlikely(len < skb_headlen(skb)))
 		__skb_pull(skb, len);
-	} else {
-		if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
-			return -ENOMEM;
-	}
+	else
+		__pskb_trim_head(skb, len - skb_headlen(skb));
 
 	TCP_SKB_CB(skb)->seq += len;
 	skb->ip_summed = CHECKSUM_HW;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
new file mode 100644
index 000000000000..d7d517a3a238
--- /dev/null
+++ b/net/ipv4/tcp_probe.c
@@ -0,0 +1,181 @@
+/*
+ * tcpprobe - Observe the TCP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/vmalloc.h>
+
+#include <net/tcp.h>
+
+MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>");
+MODULE_DESCRIPTION("TCP cwnd snooper");
+MODULE_LICENSE("GPL");
+
+static int port = 0;
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+
+static int bufsize = 64*1024;
+MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
+module_param(bufsize, int, 0);
+
+static const char procname[] = "tcpprobe";
+
+struct {
+	struct kfifo  *fifo;
+	spinlock_t    lock;
+	wait_queue_head_t wait;
+	struct timeval tstart;
+} tcpw;
+
+static void printl(const char *fmt, ...)
+{
+	va_list args;
+	int len;
+	struct timeval now;
+	char tbuf[256];
+
+	va_start(args, fmt);
+	do_gettimeofday(&now);
+
+	now.tv_sec -= tcpw.tstart.tv_sec;
+	now.tv_usec -= tcpw.tstart.tv_usec;
+	if (now.tv_usec < 0) {
+		--now.tv_sec;
+		now.tv_usec += 1000000;
+	}
+
+	len = sprintf(tbuf, "%lu.%06lu ",
+		      (unsigned long) now.tv_sec,
+		      (unsigned long) now.tv_usec);
+	len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
+	va_end(args);
+
+	kfifo_put(tcpw.fifo, tbuf, len);
+	wake_up(&tcpw.wait);
+}
+
+static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk,
+			struct msghdr *msg, size_t size)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
+
+	if (port == 0 || ntohs(inet->dport) == port ||
+	    ntohs(inet->sport) == port) {
+		printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n",
+		       NIPQUAD(inet->saddr), ntohs(inet->sport),
+		       NIPQUAD(inet->daddr), ntohs(inet->dport),
+		       size, tp->snd_nxt, tp->snd_una,
+		       tp->snd_cwnd, tcp_current_ssthresh(sk),
+		       tp->snd_wnd);
+	}
+
+	jprobe_return();
+	return 0;
+}
+
+static struct jprobe tcp_send_probe = {
+	.kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, },
+	.entry = (kprobe_opcode_t *) &jtcp_sendmsg,
+};
+
+
+static int tcpprobe_open(struct inode * inode, struct file * file)
+{
+	kfifo_reset(tcpw.fifo);
+	do_gettimeofday(&tcpw.tstart);
+	return 0;
+}
+
+static ssize_t tcpprobe_read(struct file *file, char __user *buf,
+			     size_t len, loff_t *ppos)
+{
+	int error = 0, cnt;
+	unsigned char *tbuf;
+
+	if (!buf || len < 0)
+		return -EINVAL;
+
+	if (len == 0)
+		return 0;
+
+	tbuf = vmalloc(len);
+	if (!tbuf)
+		return -ENOMEM;
+
+	error = wait_event_interruptible(tcpw.wait,
+					 __kfifo_len(tcpw.fifo) != 0);
+	if (error)
+		return error;
+
+	cnt = kfifo_get(tcpw.fifo, tbuf, len);
+	error = copy_to_user(buf, tbuf, cnt);
+
+	vfree(tbuf);
+
+	return error ? error : cnt;
+}
+
+static struct file_operations tcpprobe_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = tcpprobe_open,
+	.read    = tcpprobe_read,
+};
+
+static __init int tcpprobe_init(void)
+{
+	int ret = -ENOMEM;
+
+	init_waitqueue_head(&tcpw.wait);
+	spin_lock_init(&tcpw.lock);
+	tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock);
+
+	if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops))
+		goto err0;
+
+	ret = register_jprobe(&tcp_send_probe);
+	if (ret)
+		goto err1;
+
+	pr_info("TCP watch registered (port=%d)\n", port);
+	return 0;
+ err1:
+	proc_net_remove(procname);
+ err0:
+	kfifo_free(tcpw.fifo);
+	return ret;
+}
+module_init(tcpprobe_init);
+
+static __exit void tcpprobe_exit(void)
+{
+	kfifo_free(tcpw.fifo);
+	proc_net_remove(procname);
+	unregister_jprobe(&tcp_send_probe);
+
+}
+module_exit(tcpprobe_exit);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
new file mode 100644
index 000000000000..11b42a7135c1
--- /dev/null
+++ b/net/ipv4/tcp_veno.c
@@ -0,0 +1,231 @@
+/*
+ * TCP Veno congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    C. P. Fu, S. C. Liew.
+ *    "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
+ *    IEEE Journal on Selected Areas in Communication,
+ *    Feb. 2003.
+ * 	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Veno variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static const int beta = 3 << V_PARAM_SHIFT;
+
+/* Veno variables */
+struct veno {
+	u8 doing_veno_now;	/* if true, do veno for this rtt */
+	u16 cntrtt;		/* # of rtts measured within last rtt */
+	u32 minrtt;		/* min of rtts measured within last rtt (in usec) */
+	u32 basertt;		/* the min of all Veno rtt measurements seen (in usec) */
+	u32 inc;		/* decide whether to increase cwnd */
+	u32 diff;		/* calculate the diff rate */
+};
+
+/* There are several situations when we must "re-start" Veno:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ */
+static inline void veno_enable(struct sock *sk)
+{
+	struct veno *veno = inet_csk_ca(sk);
+
+	/* turn on Veno */
+	veno->doing_veno_now = 1;
+
+	veno->minrtt = 0x7fffffff;
+}
+
+static inline void veno_disable(struct sock *sk)
+{
+	struct veno *veno = inet_csk_ca(sk);
+
+	/* turn off Veno */
+	veno->doing_veno_now = 0;
+}
+
+static void tcp_veno_init(struct sock *sk)
+{
+	struct veno *veno = inet_csk_ca(sk);
+
+	veno->basertt = 0x7fffffff;
+	veno->inc = 1;
+	veno_enable(sk);
+}
+
+/* Do rtt sampling needed for Veno. */
+static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt)
+{
+	struct veno *veno = inet_csk_ca(sk);
+	u32 vrtt = usrtt + 1;	/* Never allow zero rtt or basertt */
+
+	/* Filter to find propagation delay: */
+	if (vrtt < veno->basertt)
+		veno->basertt = vrtt;
+
+	/* Find the min rtt during the last rtt to find
+	 * the current prop. delay + queuing delay:
+	 */
+	veno->minrtt = min(veno->minrtt, vrtt);
+	veno->cntrtt++;
+}
+
+static void tcp_veno_state(struct sock *sk, u8 ca_state)
+{
+	if (ca_state == TCP_CA_Open)
+		veno_enable(sk);
+	else
+		veno_disable(sk);
+}
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Veno calculations
+ * until we get fresh rtt samples.  So when we
+ * restart, we reset our Veno state to a clean
+ * state. After we get acks for this flight of
+ * packets, _then_ we can make Veno calculations
+ * again.
+ */
+static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START)
+		tcp_veno_init(sk);
+}
+
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack,
+				u32 seq_rtt, u32 in_flight, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct veno *veno = inet_csk_ca(sk);
+
+	if (!veno->doing_veno_now)
+		return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
+
+	/* limited by applications */
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	/* We do the Veno calculations only if we got enough rtt samples */
+	if (veno->cntrtt <= 2) {
+		/* We don't have enough rtt samples to do the Veno
+		 * calculation, so we'll behave like Reno.
+		 */
+		tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
+	} else {
+		u32 rtt, target_cwnd;
+
+		/* We have enough rtt samples, so, using the Veno
+		 * algorithm, we determine the state of the network.
+		 */
+
+		rtt = veno->minrtt;
+
+		target_cwnd = ((tp->snd_cwnd * veno->basertt)
+			       << V_PARAM_SHIFT) / rtt;
+
+		veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
+
+		if (tp->snd_cwnd <= tp->snd_ssthresh) {
+			/* Slow start.  */
+			tcp_slow_start(tp);
+		} else {
+			/* Congestion avoidance. */
+			if (veno->diff < beta) {
+				/* In the "non-congestive state", increase cwnd
+				 *  every rtt.
+				 */
+				if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+					if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+						tp->snd_cwnd++;
+					tp->snd_cwnd_cnt = 0;
+				} else
+					tp->snd_cwnd_cnt++;
+			} else {
+				/* In the "congestive state", increase cwnd
+				 * every other rtt.
+				 */
+				if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+					if (veno->inc
+					    && tp->snd_cwnd <
+					    tp->snd_cwnd_clamp) {
+						tp->snd_cwnd++;
+						veno->inc = 0;
+					} else
+						veno->inc = 1;
+					tp->snd_cwnd_cnt = 0;
+				} else
+					tp->snd_cwnd_cnt++;
+			}
+
+		}
+		if (tp->snd_cwnd < 2)
+			tp->snd_cwnd = 2;
+		else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+			tp->snd_cwnd = tp->snd_cwnd_clamp;
+	}
+	/* Wipe the slate clean for the next rtt. */
+	/* veno->cntrtt = 0; */
+	veno->minrtt = 0x7fffffff;
+}
+
+/* Veno MD phase */
+static u32 tcp_veno_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct veno *veno = inet_csk_ca(sk);
+
+	if (veno->diff < beta)
+		/* in "non-congestive state", cut cwnd by 1/5 */
+		return max(tp->snd_cwnd * 4 / 5, 2U);
+	else
+		/* in "congestive state", cut cwnd by 1/2 */
+		return max(tp->snd_cwnd >> 1U, 2U);
+}
+
+static struct tcp_congestion_ops tcp_veno = {
+	.init		= tcp_veno_init,
+	.ssthresh	= tcp_veno_ssthresh,
+	.cong_avoid	= tcp_veno_cong_avoid,
+	.rtt_sample	= tcp_veno_rtt_calc,
+	.set_state	= tcp_veno_state,
+	.cwnd_event	= tcp_veno_cwnd_event,
+
+	.owner		= THIS_MODULE,
+	.name		= "veno",
+};
+
+static int __init tcp_veno_register(void)
+{
+	BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE);
+	tcp_register_congestion_control(&tcp_veno);
+	return 0;
+}
+
+static void __exit tcp_veno_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_veno);
+}
+
+module_init(tcp_veno_register);
+module_exit(tcp_veno_unregister);
+
+MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Veno");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 0c340c3756c2..4247da1384bf 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -1,7 +1,24 @@
 /*
- * TCP Westwood+
+ * TCP Westwood+: end-to-end bandwidth estimation for TCP
  *
- *	Angelo Dell'Aera:	TCP Westwood+ support
+ *      Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4
+ *
+ * Support at http://c3lab.poliba.it/index.php/Westwood
+ * Main references in literature:
+ *
+ * - Mascolo S, Casetti, M. Gerla et al.
+ *   "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001
+ *
+ * - A. Grieco, s. Mascolo
+ *   "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer
+ *     Comm. Review, 2004
+ *
+ * - A. Dell'Aera, L. Grieco, S. Mascolo.
+ *   "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving :
+ *    A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004
+ *
+ * Westwood+ employs end-to-end bandwidth measurement to set cwnd and
+ * ssthresh after packet loss. The probing phase is as the original Reno.
  */
 
 #include <linux/config.h>
@@ -22,6 +39,8 @@ struct westwood {
 	u32    accounted;
 	u32    rtt;
 	u32    rtt_min;          /* minimum observed RTT */
+	u8     first_ack;        /* flag which infers that this is the first ack */
+	u8     reset_rtt_min;    /* Reset RTT min to next RTT sample*/
 };
 
 
@@ -49,9 +68,11 @@ static void tcp_westwood_init(struct sock *sk)
         w->bw_est = 0;
         w->accounted = 0;
         w->cumul_ack = 0;
+	w->reset_rtt_min = 1;
 	w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
 	w->rtt_win_sx = tcp_time_stamp;
 	w->snd_una = tcp_sk(sk)->snd_una;
+	w->first_ack = 1;
 }
 
 /*
@@ -63,10 +84,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b)
 	return (((7 * a) + b) >> 3);
 }
 
-static inline void westwood_filter(struct westwood *w, u32 delta)
+static void westwood_filter(struct westwood *w, u32 delta)
 {
-	w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
-	w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+	/* If the filter is empty fill it with the first sample of bandwidth  */
+	if (w->bw_ns_est == 0 && w->bw_est == 0) {
+		w->bw_ns_est = w->bk / delta;
+		w->bw_est = w->bw_ns_est;
+	} else {
+		w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
+		w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+	}
 }
 
 /*
@@ -91,6 +118,15 @@ static void westwood_update_window(struct sock *sk)
 	struct westwood *w = inet_csk_ca(sk);
 	s32 delta = tcp_time_stamp - w->rtt_win_sx;
 
+	/* Initialize w->snd_una with the first acked sequence number in order
+	 * to fix mismatch between tp->snd_una and w->snd_una for the first
+	 * bandwidth sample
+	 */
+        if (w->first_ack) {
+		w->snd_una = tcp_sk(sk)->snd_una;
+		w->first_ack = 0;
+	}
+
 	/*
 	 * See if a RTT-window has passed.
 	 * Be careful since if RTT is less than
@@ -108,6 +144,16 @@ static void westwood_update_window(struct sock *sk)
 	}
 }
 
+static inline void update_rtt_min(struct westwood *w)
+{
+	if (w->reset_rtt_min) {
+		w->rtt_min = w->rtt;
+		w->reset_rtt_min = 0;	
+	} else
+		w->rtt_min = min(w->rtt, w->rtt_min);
+}
+
+
 /*
  * @westwood_fast_bw
  * It is called when we are in fast path. In particular it is called when
@@ -123,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk)
 
 	w->bk += tp->snd_una - w->snd_una;
 	w->snd_una = tp->snd_una;
-	w->rtt_min = min(w->rtt, w->rtt_min);
+	update_rtt_min(w);
 }
 
 /*
@@ -162,12 +208,6 @@ static inline u32 westwood_acked_count(struct sock *sk)
 	return w->cumul_ack;
 }
 
-static inline u32 westwood_bw_rttmin(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct westwood *w = inet_csk_ca(sk);
-	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
-}
 
 /*
  * TCP Westwood
@@ -175,9 +215,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk)
  * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
  * so avoids ever returning 0.
  */
-static u32 tcp_westwood_cwnd_min(struct sock *sk)
+static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
 {
-	return westwood_bw_rttmin(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct westwood *w = inet_csk_ca(sk);
+	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
 }
 
 static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
@@ -191,17 +233,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 		break;
 
 	case CA_EVENT_COMPLETE_CWR:
-		tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
+		tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		break;
 
 	case CA_EVENT_FRTO:
-		tp->snd_ssthresh = westwood_bw_rttmin(sk);
+		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ 		/* Update RTT_min when next ack arrives */
+		w->reset_rtt_min = 1;
 		break;
 
 	case CA_EVENT_SLOW_ACK:
 		westwood_update_window(sk);
 		w->bk += westwood_acked_count(sk);
-		w->rtt_min = min(w->rtt, w->rtt_min);
+		update_rtt_min(w);
 		break;
 
 	default:
@@ -235,7 +279,7 @@ static struct tcp_congestion_ops tcp_westwood = {
 	.init		= tcp_westwood_init,
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
-	.min_cwnd	= tcp_westwood_cwnd_min,
+	.min_cwnd	= tcp_westwood_bw_rttmin,
 	.cwnd_event	= tcp_westwood_event,
 	.get_info	= tcp_westwood_info,
 	.pkts_acked	= tcp_westwood_pkts_acked,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 3e174c83bfe7..817ed84511a6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -13,7 +13,6 @@
 #include <linux/string.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 
@@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb)
 
 EXPORT_SYMBOL(xfrm4_rcv);
 
-static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
-{
-	struct iphdr *outer_iph = skb->nh.iph;
-	struct iphdr *inner_iph = skb->h.ipiph;
-
-	if (INET_ECN_is_ce(outer_iph->tos))
-		IP_ECN_set_ce(inner_iph);
-}
-
 static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
 {
 	switch (nexthdr) {
@@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
 
 		xfrm_vec[xfrm_nr++] = x;
 
-		iph = skb->nh.iph;
+		if (x->mode->input(x, skb))
+			goto drop;
 
 		if (x->props.mode) {
-			if (iph->protocol != IPPROTO_IPIP)
-				goto drop;
-			if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-				goto drop;
-			if (skb_cloned(skb) &&
-			    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-				goto drop;
-			if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-				ipv4_copy_dscp(iph, skb->h.ipiph);
-			if (!(x->props.flags & XFRM_STATE_NOECN))
-				ipip_ecn_decapsulate(skb);
-			skb->mac.raw = memmove(skb->data - skb->mac_len,
-					       skb->mac.raw, skb->mac_len);
-			skb->nh.raw = skb->data;
-			memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
 			decaps = 1;
 			break;
 		}
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 000000000000..a9e6b3dd19c9
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,83 @@
+/*
+ * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header will be moved forward to make space for the encapsulation
+ * header.
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_transport_output(struct sk_buff *skb)
+{
+	struct xfrm_state *x;
+	struct iphdr *iph;
+	int ihl;
+
+	iph = skb->nh.iph;
+	skb->h.ipiph = iph;
+
+	ihl = iph->ihl * 4;
+	skb->h.raw += ihl;
+
+	x = skb->dst->xfrm;
+	skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
+	return 0;
+}
+
+/* Remove encapsulation header.
+ *
+ * The IP header will be moved over the top of the encapsulation header.
+ *
+ * On entry, skb->h shall point to where the IP header should be and skb->nh
+ * shall be set to where the IP header currently is.  skb->data shall point
+ * to the start of the payload.
+ */
+static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int ihl = skb->data - skb->h.raw;
+
+	if (skb->h.raw != skb->nh.raw)
+		skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl);
+	skb->nh.iph->tot_len = htons(skb->len + ihl);
+	skb->h.raw = skb->data;
+	return 0;
+}
+
+static struct xfrm_mode xfrm4_transport_mode = {
+	.input = xfrm4_transport_input,
+	.output = xfrm4_transport_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm4_transport_init(void)
+{
+	return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
+}
+
+static void __exit xfrm4_transport_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
+	BUG_ON(err);
+}
+
+module_init(xfrm4_transport_init);
+module_exit(xfrm4_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 000000000000..f8d880beb12f
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,125 @@
+/*
+ * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+	struct iphdr *outer_iph = skb->nh.iph;
+	struct iphdr *inner_iph = skb->h.ipiph;
+
+	if (INET_ECN_is_ce(outer_iph->tos))
+		IP_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.  The following fields
+ * in it shall be filled in by x->type->output:
+ *      tot_len
+ *      check
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_tunnel_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct xfrm_state *x = dst->xfrm;
+	struct iphdr *iph, *top_iph;
+	int flags;
+
+	iph = skb->nh.iph;
+	skb->h.ipiph = iph;
+
+	skb->nh.raw = skb_push(skb, x->props.header_len);
+	top_iph = skb->nh.iph;
+
+	top_iph->ihl = 5;
+	top_iph->version = 4;
+
+	/* DS disclosed */
+	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
+
+	flags = x->props.flags;
+	if (flags & XFRM_STATE_NOECN)
+		IP_ECN_clear(top_iph);
+
+	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+		0 : (iph->frag_off & htons(IP_DF));
+	if (!top_iph->frag_off)
+		__ip_select_ident(top_iph, dst->child, 0);
+
+	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
+
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+	top_iph->protocol = IPPROTO_IPIP;
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	return 0;
+}
+
+static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	int err = -EINVAL;
+
+	if (iph->protocol != IPPROTO_IPIP)
+		goto out;
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+
+	if (skb_cloned(skb) &&
+	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		goto out;
+
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		ipv4_copy_dscp(iph, skb->h.ipiph);
+	if (!(x->props.flags & XFRM_STATE_NOECN))
+		ipip_ecn_decapsulate(skb);
+	skb->mac.raw = memmove(skb->data - skb->mac_len,
+			       skb->mac.raw, skb->mac_len);
+	skb->nh.raw = skb->data;
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	err = 0;
+
+out:
+	return err;
+}
+
+static struct xfrm_mode xfrm4_tunnel_mode = {
+	.input = xfrm4_tunnel_input,
+	.output = xfrm4_tunnel_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TUNNEL,
+};
+
+static int __init xfrm4_tunnel_init(void)
+{
+	return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
+}
+
+static void __exit xfrm4_tunnel_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
+	BUG_ON(err);
+}
+
+module_init(xfrm4_tunnel_init);
+module_exit(xfrm4_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 4ef8efaf6a67..ac9d91d4bb05 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -12,67 +12,10 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 #include <net/icmp.h>
 
-/* Add encapsulation header.
- *
- * In transport mode, the IP header will be moved forward to make space
- * for the encapsulation header.
- *
- * In tunnel mode, the top IP header will be constructed per RFC 2401.
- * The following fields in it shall be filled in by x->type->output:
- *	tot_len
- *	check
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
- */
-static void xfrm4_encap(struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb->dst;
-	struct xfrm_state *x = dst->xfrm;
-	struct iphdr *iph, *top_iph;
-	int flags;
-
-	iph = skb->nh.iph;
-	skb->h.ipiph = iph;
-
-	skb->nh.raw = skb_push(skb, x->props.header_len);
-	top_iph = skb->nh.iph;
-
-	if (!x->props.mode) {
-		skb->h.raw += iph->ihl*4;
-		memmove(top_iph, iph, iph->ihl*4);
-		return;
-	}
-
-	top_iph->ihl = 5;
-	top_iph->version = 4;
-
-	/* DS disclosed */
-	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
-
-	flags = x->props.flags;
-	if (flags & XFRM_STATE_NOECN)
-		IP_ECN_clear(top_iph);
-
-	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
-		0 : (iph->frag_off & htons(IP_DF));
-	if (!top_iph->frag_off)
-		__ip_select_ident(top_iph, dst->child, 0);
-
-	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
-
-	top_iph->saddr = x->props.saddr.a4;
-	top_iph->daddr = x->id.daddr.a4;
-	top_iph->protocol = IPPROTO_IPIP;
-
-	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
-}
-
 static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 {
 	int mtu, ret = 0;
@@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb)
 		if (err)
 			goto error;
 
-		xfrm4_encap(skb);
+		err = x->mode->output(skb);
+		if (err)
+			goto error;
 
 		err = x->type->output(x, skb);
 		if (err)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8604c747bca5..c0465284dfac 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
 static struct dst_ops xfrm4_dst_ops;
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
 
-static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
-
 static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 {
 	return __ip_route_output_key((struct rtable**)dst, fl);
@@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
 
 static inline int xfrm4_garbage_collect(void)
 {
-	read_lock(&xfrm4_policy_afinfo.lock);
 	xfrm4_policy_afinfo.garbage_collect();
-	read_unlock(&xfrm4_policy_afinfo.lock);
 	return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
 }
 
@@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = {
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.family = 		AF_INET,
-	.lock = 		RW_LOCK_UNLOCKED,
-	.type_map = 		&xfrm4_type_map,
 	.dst_ops =		&xfrm4_dst_ops,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.find_bundle = 		__xfrm4_find_bundle,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index dbabf81a9b7b..81e1751c966e 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
 
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.family			= AF_INET,
-	.lock			= RW_LOCK_UNLOCKED,
 	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
 	.state_lookup		= __xfrm4_state_lookup,