From af2681828af5f2b42e12e8b16ba0cf113cf486c8 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 3 Apr 2008 12:52:19 -0700
Subject: [ICMP]: Ensure that ICMP relookup maintains status quo

The ICMP relookup path is only meant to modify behaviour when
appropriate IPsec policies are in place and marked as requiring
relookups.  It is certainly not meant to modify behaviour when
IPsec policies don't exist at all.

However, due to an oversight on the error paths existing behaviour
may in fact change should one of the relookup steps fail.

This patch corrects this by redirecting all errors on relookup
failures to the previous code path.  That is, if the initial
xfrm_lookup let the packet pass, we will stand by that decision
should the relookup fail due to an error.

This should be safe from a security point-of-view because compliant
systems must install a default deny policy so the packet would'nt
have passed in that case.

Many thanks to Julian Anastasov for pointing out this error.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 24 +++++++++++++-----------
 net/ipv6/icmp.c | 22 ++++++++++++----------
 2 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a944e8053e28..40508babad8c 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -591,7 +591,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 
 		if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
-			goto ende;
+			goto relookup_failed;
 
 		if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
 			err = __ip_route_output_key(net, &rt2, &fl);
@@ -601,7 +601,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 
 			fl2.fl4_dst = fl.fl4_src;
 			if (ip_route_output_key(net, &rt2, &fl2))
-				goto ende;
+				goto relookup_failed;
 
 			/* Ugh! */
 			odst = skb_in->dst;
@@ -614,21 +614,23 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 
 		if (err)
-			goto ende;
+			goto relookup_failed;
 
 		err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL,
 				  XFRM_LOOKUP_ICMP);
-		if (err == -ENOENT) {
+		switch (err) {
+		case 0:
+			dst_release(&rt->u.dst);
+			rt = rt2;
+			break;
+		case -EPERM:
+			goto ende;
+		default:
+relookup_failed:
 			if (!rt)
 				goto out_unlock;
-			goto route_done;
+			break;
 		}
-
-		dst_release(&rt->u.dst);
-		rt = rt2;
-
-		if (err)
-			goto out_unlock;
 	}
 
 route_done:
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index f204a7275a0d..893287ecc628 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -436,24 +436,26 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
 	}
 
 	if (xfrm_decode_session_reverse(skb, &fl2, AF_INET6))
-		goto out_dst_release;
+		goto relookup_failed;
 
 	if (ip6_dst_lookup(sk, &dst2, &fl))
-		goto out_dst_release;
+		goto relookup_failed;
 
 	err = xfrm_lookup(&dst2, &fl, sk, XFRM_LOOKUP_ICMP);
-	if (err == -ENOENT) {
+	switch (err) {
+	case 0:
+		dst_release(dst);
+		dst = dst2;
+		break;
+	case -EPERM:
+		goto out_dst_release;
+	default:
+relookup_failed:
 		if (!dst)
 			goto out;
-		goto route_done;
+		break;
 	}
 
-	dst_release(dst);
-	dst = dst2;
-
-	if (err)
-		goto out;
-
 route_done:
 	if (ipv6_addr_is_multicast(&fl.fl6_dst))
 		hlimit = np->mcast_hops;
-- 
cgit v1.2.1


From 439e23857a21c3a953826eed23c818697a97de1a Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Thu, 3 Apr 2008 13:30:17 -0700
Subject: [IPV6]: Event type in addrconf_ifdown is mis-used.

addrconf_ifdown is broken in respect to the usage of how
parameter. This function is called with (event != NETDEV_DOWN) and (2)
on the IPv6 stop.  It the latter case inet6_dev from loopback device
should be destroyed.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e7a1882db048..4fa9da0be19a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2469,7 +2469,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 	/* Step 1: remove reference to ipv6 device from parent device.
 		   Do not dev_put!
 	 */
-	if (how == 1) {
+	if (how) {
 		idev->dead = 1;
 
 		/* protected by rtnl_lock */
@@ -2501,12 +2501,12 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 	write_lock_bh(&idev->lock);
 
 	/* Step 3: clear flags for stateless addrconf */
-	if (how != 1)
+	if (!how)
 		idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
 
 	/* Step 4: clear address list */
 #ifdef CONFIG_IPV6_PRIVACY
-	if (how == 1 && del_timer(&idev->regen_timer))
+	if (how && del_timer(&idev->regen_timer))
 		in6_dev_put(idev);
 
 	/* clear tempaddr list */
@@ -2543,7 +2543,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	/* Step 5: Discard multicast list */
 
-	if (how == 1)
+	if (how)
 		ipv6_mc_destroy_dev(idev);
 	else
 		ipv6_mc_down(idev);
@@ -2552,7 +2552,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	/* Shot the device (if unregistered) */
 
-	if (how == 1) {
+	if (how) {
 		addrconf_sysctl_unregister(idev);
 		neigh_parms_release(&nd_tbl, idev->nd_parms);
 		neigh_ifdown(&nd_tbl, dev);
-- 
cgit v1.2.1


From eb867579311a9c1e998d6911af056772c400122a Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Thu, 3 Apr 2008 13:31:53 -0700
Subject: [IPV6]: inet6_dev on loopback should be kept until namespace stop.

In the other case it will be destroyed when last address will be removed
from lo inside a namespace. This will break IPv6 in several places. The
most obvious one is ip6_dst_ifdown.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4fa9da0be19a..a65935a9afd9 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2456,7 +2456,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	ASSERT_RTNL();
 
-	if (dev == init_net.loopback_dev && how == 1)
+	if ((dev->flags & IFF_LOOPBACK) && how == 1)
 		how = 0;
 
 	rt6_ifdown(dev);
-- 
cgit v1.2.1


From 84f59370c519449c70dcc813b050f5cbbf0098e7 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Thu, 3 Apr 2008 13:33:00 -0700
Subject: [IPV6]: Fix refcounting for anycast dst entries.

Anycast DST entries allocated inside ipv6_dev_ac_inc are leaked when
network device is stopped without removing IPv6 addresses from it. The
bug has been observed in the reality on 2.6.18-rhel5 kernel.

In the above case addrconf_ifdown marks all entries as obsolete and
ip6_del_rt called from __ipv6_dev_ac_dec returns ENOENT. The
referrence is not dropped.

The fix is simple. DST entry should not keep referrence when stored in
the FIB6 tree.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/anycast.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 9c7f83fbc3a1..e5f56c953b58 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -334,9 +334,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
 	idev->ac_list = aca;
 	write_unlock_bh(&idev->lock);
 
-	dst_hold(&rt->u.dst);
-	if (ip6_ins_rt(rt))
-		dst_release(&rt->u.dst);
+	ip6_ins_rt(rt);
 
 	addrconf_join_solict(dev, &aca->aca_addr);
 
@@ -378,10 +376,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
 	addrconf_leave_solict(idev, &aca->aca_addr);
 
 	dst_hold(&aca->aca_rt->u.dst);
-	if (ip6_del_rt(aca->aca_rt))
-		dst_free(&aca->aca_rt->u.dst);
-	else
-		dst_release(&aca->aca_rt->u.dst);
+	ip6_del_rt(aca->aca_rt);
 
 	aca_put(aca);
 	return 0;
-- 
cgit v1.2.1


From 23556323b22fef35bdc36465b7e7439ba3748c9f Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 4 Apr 2008 12:45:12 -0700
Subject: [VLAN]: Fix egress priority mappings leak.

These entries are allocated in vlan_dev_set_egress_priority,
but are never released and leaks on vlan device removal.

Drop these in vlan's ->uninit callback - after the device is
brought down and everyone is notified about it is going to
be unregistered.

Found during testing vlan netnsization patchset.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_dev.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'net')

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 480ea90e7dcd..41a76a05e6fd 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -692,6 +692,20 @@ static int vlan_dev_init(struct net_device *dev)
 	return 0;
 }
 
+static void vlan_dev_uninit(struct net_device *dev)
+{
+	struct vlan_priority_tci_mapping *pm;
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
+		while ((pm = vlan->egress_priority_map[i]) != NULL) {
+			vlan->egress_priority_map[i] = pm->next;
+			kfree(pm);
+		}
+	}
+}
+
 void vlan_setup(struct net_device *dev)
 {
 	ether_setup(dev);
@@ -701,6 +715,7 @@ void vlan_setup(struct net_device *dev)
 
 	dev->change_mtu		= vlan_dev_change_mtu;
 	dev->init		= vlan_dev_init;
+	dev->uninit		= vlan_dev_uninit;
 	dev->open		= vlan_dev_open;
 	dev->stop		= vlan_dev_stop;
 	dev->set_mac_address	= vlan_dev_set_mac_address;
-- 
cgit v1.2.1


From 16f2e85d3151efa643879fa5aa87c9d77d60f57e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 7 Apr 2008 14:35:46 +0200
Subject: nl80211: fix STA AID bug

This fixes the STA AID setting and actually makes hostapd/mac80211
work properly in presence of power-saving stations.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/nl80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e3a214f63f91..f68a5c8f2147 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -945,7 +945,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
 	params.listen_interval =
 		nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
-	params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
+	params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
 
 	if (parse_station_flags(info->attrs[NL80211_ATTR_STA_FLAGS],
 				&params.station_flags))
-- 
cgit v1.2.1


From 1b69d745397eac12b3f8a2eb6b799cd476aef282 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Mon, 7 Apr 2008 22:31:38 -0700
Subject: [TCP]: Restore 2.6.24 mark_head_lost behavior for newreno/fack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fast retransmission can be forced locally to the rfc3517
branch in tcp_update_scoreboard instead of making such fragile
constructs deeper in tcp_mark_head_lost.

This is necessary for the next patch which must not have
loopholes for cnt > packets check. As one can notice,
readability got some improvements too because of this :-).

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7facdb0f6960..5573202f0861 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2134,7 +2134,7 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 /* Mark head of queue up as lost. With RFC3517 SACK, the packets is
  * is against sacked "cnt", otherwise it's against facked "cnt"
  */
-static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit)
+static void tcp_mark_head_lost(struct sock *sk, int packets)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -2161,7 +2161,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit)
 		    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 			cnt += tcp_skb_pcount(skb);
 
-		if (((!fast_rexmit || (tp->lost_out > 0)) && (cnt > packets)) ||
+		if ((cnt > packets) ||
 		    after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
 			break;
 		if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) {
@@ -2180,17 +2180,17 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_is_reno(tp)) {
-		tcp_mark_head_lost(sk, 1, fast_rexmit);
+		tcp_mark_head_lost(sk, 1);
 	} else if (tcp_is_fack(tp)) {
 		int lost = tp->fackets_out - tp->reordering;
 		if (lost <= 0)
 			lost = 1;
-		tcp_mark_head_lost(sk, lost, fast_rexmit);
+		tcp_mark_head_lost(sk, lost);
 	} else {
 		int sacked_upto = tp->sacked_out - tp->reordering;
-		if (sacked_upto < 0)
-			sacked_upto = 0;
-		tcp_mark_head_lost(sk, sacked_upto, fast_rexmit);
+		if (sacked_upto < fast_rexmit)
+			sacked_upto = fast_rexmit;
+		tcp_mark_head_lost(sk, sacked_upto);
 	}
 
 	/* New heuristics: it is possible only after we switched
@@ -2524,7 +2524,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 	    before(tp->snd_una, tp->high_seq) &&
 	    icsk->icsk_ca_state != TCP_CA_Open &&
 	    tp->fackets_out > tp->reordering) {
-		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
+		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
 		NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
 	}
 
-- 
cgit v1.2.1


From c137f3dda04b0aee1bc6889cdc69185f53df8a82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Mon, 7 Apr 2008 22:32:38 -0700
Subject: [TCP]: Fix NewReno's fast rexmit/recovery problems with GSOed skb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes a long-standing bug which makes NewReno recovery crippled.
With GSO the whole head skb was marked as LOST which is in
violation of NewReno procedure that only wants to mark one packet
and ended up breaking our TCP code by causing counter overflow
because our code was built on top of assumption about valid
NewReno procedure. This manifested as triggering a WARN_ON for
the overflow in a number of places.

It seems relatively safe alternative to just do nothing if
tcp_fragment fails due to oom because another duplicate ACK is
likely to be received soon and the fragmentation will be retried.

Special thanks goes to Soeren Sonnenburg <kernel@nn7.de> who was
lucky enough to be able to reproduce this so that the warning
for the overflow was hit. It's not as easy task as it seems even
if this bug happens quite often because the amount of outstanding
data is pretty significant for the mismarkings to lead to an
overflow.

Because it's very late in 2.6.25-rc cycle (if this even makes in
time), I didn't want to touch anything with SACK enabled here.
Fragmenting might be useful for it as well but it's more or less
a policy decision rather than mandatory fix. Thus there's no need
to rush and we can postpone considering tcp_fragment with SACK
for 2.6.26.

In 2.6.24 and earlier, this very same bug existed but the effect
is slightly different because of a small changes in the if
conditions that fit to the patch's context. With them nothing
got lost marker and thus no retransmissions happened.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5573202f0861..7d0958785bfb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2138,7 +2138,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	int cnt;
+	int cnt, oldcnt;
+	int err;
+	unsigned int mss;
 
 	BUG_TRAP(packets <= tp->packets_out);
 	if (tp->lost_skb_hint) {
@@ -2157,13 +2159,25 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 		tp->lost_skb_hint = skb;
 		tp->lost_cnt_hint = cnt;
 
+		if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
+			break;
+
+		oldcnt = cnt;
 		if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
 		    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 			cnt += tcp_skb_pcount(skb);
 
-		if ((cnt > packets) ||
-		    after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
-			break;
+		if (cnt > packets) {
+			if (tcp_is_sack(tp) || (oldcnt >= packets))
+				break;
+
+			mss = skb_shinfo(skb)->gso_size;
+			err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
+			if (err < 0)
+				break;
+			cnt = packets;
+		}
+
 		if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) {
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
-- 
cgit v1.2.1


From 882bebaaca4bb1484078d44ef011f918c0e1e14e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Mon, 7 Apr 2008 22:33:07 -0700
Subject: [TCP]: tcp_simple_retransmit can cause S+L
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes Bugzilla #10384

tcp_simple_retransmit does L increment without any checking
whatsoever for overflowing S+L when Reno is in use.

The simplest scenario I can currently think of is rather
complex in practice (there might be some more straightforward
cases though). Ie., if mss is reduced during mtu probing, it
may end up marking everything lost and if some duplicate ACKs
arrived prior to that sacked_out will be non-zero as well,
leading to S+L > packets_out, tcp_clean_rtx_queue on the next
cumulative ACK or tcp_fastretrans_alert on the next duplicate
ACK will fix the S counter.

More straightforward (but questionable) solution would be to
just call tcp_reset_reno_sack() in tcp_simple_retransmit but
it would negatively impact the probe's retransmission, ie.,
the retransmissions would not occur if some duplicate ACKs
had arrived.

So I had to add reno sacked_out reseting to CA_Loss state
when the first cumulative ACK arrives (this stale sacked_out
might actually be the explanation for the reports of left_out
overflows in kernel prior to 2.6.23 and S+L overflow reports
of 2.6.24). However, this alone won't be enough to fix kernel
before 2.6.24 because it is building on top of the commit
1b6d427bb7e ([TCP]: Reduce sacked_out with reno when purging
write_queue) to keep the sacked_out from overflowing.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Reported-by: Alessandro Suardi <alessandro.suardi@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c  | 24 ++++++++++++++++++------
 net/ipv4/tcp_output.c |  3 +++
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7d0958785bfb..b4812c3cbbcf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1625,13 +1625,11 @@ out:
 	return flag;
 }
 
-/* If we receive more dupacks than we expected counting segments
- * in assumption of absent reordering, interpret this as reordering.
- * The only another reason could be bug in receiver TCP.
+/* Limits sacked_out so that sum with lost_out isn't ever larger than
+ * packets_out. Returns zero if sacked_out adjustement wasn't necessary.
  */
-static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+int tcp_limit_reno_sacked(struct tcp_sock *tp)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	u32 holes;
 
 	holes = max(tp->lost_out, 1U);
@@ -1639,8 +1637,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 
 	if ((tp->sacked_out + holes) > tp->packets_out) {
 		tp->sacked_out = tp->packets_out - holes;
-		tcp_update_reordering(sk, tp->packets_out + addend, 0);
+		return 1;
 	}
+	return 0;
+}
+
+/* If we receive more dupacks than we expected counting segments
+ * in assumption of absent reordering, interpret this as reordering.
+ * The only another reason could be bug in receiver TCP.
+ */
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	if (tcp_limit_reno_sacked(tp))
+		tcp_update_reordering(sk, tp->packets_out + addend, 0);
 }
 
 /* Emulate SACKs for SACKless connection: account for a new dupack. */
@@ -2600,6 +2610,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 	case TCP_CA_Loss:
 		if (flag & FLAG_DATA_ACKED)
 			icsk->icsk_retransmits = 0;
+		if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
+			tcp_reset_reno_sack(tp);
 		if (!tcp_try_undo_loss(sk)) {
 			tcp_moderate_cwnd(tp);
 			tcp_xmit_retransmit_queue(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 72b9350006fe..d29ef79c00ca 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1808,6 +1808,9 @@ void tcp_simple_retransmit(struct sock *sk)
 	if (!lost)
 		return;
 
+	if (tcp_is_reno(tp))
+		tcp_limit_reno_sacked(tp);
+
 	tcp_verify_left_out(tp);
 
 	/* Don't muck with the congestion window here.
-- 
cgit v1.2.1


From 6adb4f733e9996b4fd68a6db50dd51bd2463ccac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Mon, 7 Apr 2008 22:33:57 -0700
Subject: [TCP]: Don't allow FRTO to take place while MTU is being probed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MTU probe can cause some remedies for FRTO because the normal
packet ordering may be violated allowing FRTO to make a wrong
decision (it might not be that serious threat for anything
though). Thus it's safer to not run FRTO while MTU probe is
underway.

It seems that the basic FRTO variant should also look for an
skb at probe_seq.start to check if that's retransmitted one
but I didn't implement it now (plain seqno in window check
isn't robust against wraparounds).

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b4812c3cbbcf..5119856017ab 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1691,11 +1691,16 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
 int tcp_use_frto(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb;
 
 	if (!sysctl_tcp_frto)
 		return 0;
 
+	/* MTU probe and F-RTO won't really play nicely along currently */
+	if (icsk->icsk_mtup.probe_size)
+		return 0;
+
 	if (IsSackFrto())
 		return 1;
 
-- 
cgit v1.2.1


From 21f644f3eabde637f255f75ad05d0821a7a36b7f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 8 Apr 2008 16:50:44 -0700
Subject: [NET]: Undo code bloat in hot paths due to print_mac().

If print_mac() is used inside of a pr_debug() the compiler
can't see that the call is redundant so still performs it
even of pr_debug() ends up being a nop.

So don't use print_mac() in such cases in hot code paths,
use MAC_FMT et al. instead.

As noted by Joe Perches, pr_debug() could be modified to
handle this better, but that is a change to an interface
used by the entire kernel and thus needs to be validated
carefully.  This here is thus the less risky fix for
2.6.25

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/atm/lec.c                | 29 +++++++++++++++++++----------
 net/ieee80211/ieee80211_rx.c | 43 +++++++++++++++++++++++++++++--------------
 2 files changed, 48 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/atm/lec.c b/net/atm/lec.c
index a2efa7ff41f1..3235c57615e4 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -266,7 +266,6 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	char buf[300];
 	int i = 0;
 #endif /* DUMP_PACKETS >0 */
-	DECLARE_MAC_BUF(mac);
 
 	pr_debug("lec_start_xmit called\n");
 	if (!priv->lecd) {
@@ -374,15 +373,19 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (entry && (entry->tx_wait.qlen < LEC_UNRES_QUE_LEN)) {
 			pr_debug("%s:lec_start_xmit: queuing packet, ",
 				dev->name);
-			pr_debug("MAC address %s\n",
-				 print_mac(mac, lec_h->h_dest));
+			pr_debug("MAC address " MAC_FMT "\n",
+				 lec_h->h_dest[0], lec_h->h_dest[1],
+				 lec_h->h_dest[2], lec_h->h_dest[3],
+				 lec_h->h_dest[4], lec_h->h_dest[5]);
 			skb_queue_tail(&entry->tx_wait, skb);
 		} else {
 			pr_debug
 			    ("%s:lec_start_xmit: tx queue full or no arp entry, dropping, ",
 			     dev->name);
-			pr_debug("MAC address %s\n",
-				 print_mac(mac, lec_h->h_dest));
+			pr_debug("MAC address " MAC_FMT "\n",
+				 lec_h->h_dest[0], lec_h->h_dest[1],
+				 lec_h->h_dest[2], lec_h->h_dest[3],
+				 lec_h->h_dest[4], lec_h->h_dest[5]);
 			priv->stats.tx_dropped++;
 			dev_kfree_skb(skb);
 		}
@@ -394,8 +397,10 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	while (entry && (skb2 = skb_dequeue(&entry->tx_wait))) {
 		pr_debug("lec.c: emptying tx queue, ");
-		pr_debug("MAC address %s\n",
-			 print_mac(mac, lec_h->h_dest));
+		pr_debug("MAC address " MAC_FMT "\n",
+			 lec_h->h_dest[0], lec_h->h_dest[1],
+			 lec_h->h_dest[2], lec_h->h_dest[3],
+			 lec_h->h_dest[4], lec_h->h_dest[5]);
 		lec_send(vcc, skb2, priv);
 	}
 
@@ -449,7 +454,6 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
 	struct lec_arp_table *entry;
 	int i;
 	char *tmp;		/* FIXME */
-	DECLARE_MAC_BUF(mac);
 
 	atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
 	mesg = (struct atmlec_msg *)skb->data;
@@ -536,9 +540,14 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
 			struct net_bridge_fdb_entry *f;
 
 			pr_debug
-			    ("%s: bridge zeppelin asks about %s\n",
+			    ("%s: bridge zeppelin asks about " MAC_FMT "\n",
 			     dev->name,
-			     print_mac(mac, mesg->content.proxy.mac_addr));
+			     mesg->content.proxy.mac_addr[0],
+			     mesg->content.proxy.mac_addr[1],
+			     mesg->content.proxy.mac_addr[2],
+			     mesg->content.proxy.mac_addr[3],
+			     mesg->content.proxy.mac_addr[4],
+			     mesg->content.proxy.mac_addr[5]);
 
 			if (br_fdb_get_hook == NULL || dev->br_port == NULL)
 				break;
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index 1e3f87c8c012..200ee1e63728 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -271,7 +271,6 @@ ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb,
 {
 	struct ieee80211_hdr_3addr *hdr;
 	int res, hdrlen;
-	DECLARE_MAC_BUF(mac);
 
 	if (crypt == NULL || crypt->ops->decrypt_mpdu == NULL)
 		return 0;
@@ -283,8 +282,12 @@ ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb,
 	res = crypt->ops->decrypt_mpdu(skb, hdrlen, crypt->priv);
 	atomic_dec(&crypt->refcnt);
 	if (res < 0) {
-		IEEE80211_DEBUG_DROP("decryption failed (SA=%s"
-				     ") res=%d\n", print_mac(mac, hdr->addr2), res);
+		IEEE80211_DEBUG_DROP("decryption failed (SA=" MAC_FMT
+				     ") res=%d\n",
+				     hdr->addr2[0], hdr->addr2[1],
+				     hdr->addr2[2], hdr->addr2[3],
+				     hdr->addr2[4], hdr->addr2[5],
+				     res);
 		if (res == -2)
 			IEEE80211_DEBUG_DROP("Decryption failed ICV "
 					     "mismatch (key %d)\n",
@@ -304,7 +307,6 @@ ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device *ieee,
 {
 	struct ieee80211_hdr_3addr *hdr;
 	int res, hdrlen;
-	DECLARE_MAC_BUF(mac);
 
 	if (crypt == NULL || crypt->ops->decrypt_msdu == NULL)
 		return 0;
@@ -317,8 +319,12 @@ ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device *ieee,
 	atomic_dec(&crypt->refcnt);
 	if (res < 0) {
 		printk(KERN_DEBUG "%s: MSDU decryption/MIC verification failed"
-		       " (SA=%s keyidx=%d)\n",
-		       ieee->dev->name, print_mac(mac, hdr->addr2), keyidx);
+		       " (SA=" MAC_FMT " keyidx=%d)\n",
+		       ieee->dev->name,
+		       hdr->addr2[0], hdr->addr2[1],
+		       hdr->addr2[2], hdr->addr2[3],
+		       hdr->addr2[4], hdr->addr2[5],
+		       keyidx);
 		return -1;
 	}
 
@@ -462,8 +468,10 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
 			 * frames silently instead of filling system log with
 			 * these reports. */
 			IEEE80211_DEBUG_DROP("Decryption failed (not set)"
-					     " (SA=%s)\n",
-					     print_mac(mac, hdr->addr2));
+					     " (SA=" MAC_FMT ")\n",
+					     hdr->addr2[0], hdr->addr2[1],
+					     hdr->addr2[2], hdr->addr2[3],
+					     hdr->addr2[4], hdr->addr2[5]);
 			ieee->ieee_stats.rx_discards_undecryptable++;
 			goto rx_dropped;
 		}
@@ -474,8 +482,10 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
 		    fc & IEEE80211_FCTL_PROTECTED && ieee->host_decrypt &&
 		    (keyidx = hostap_rx_frame_decrypt(ieee, skb, crypt)) < 0) {
 			printk(KERN_DEBUG "%s: failed to decrypt mgmt::auth "
-			       "from %s\n", dev->name,
-			       print_mac(mac, hdr->addr2));
+			       "from " MAC_FMT "\n", dev->name,
+			       hdr->addr2[0], hdr->addr2[1],
+			       hdr->addr2[2], hdr->addr2[3],
+			       hdr->addr2[4], hdr->addr2[5]);
 			/* TODO: could inform hostapd about this so that it
 			 * could send auth failure report */
 			goto rx_dropped;
@@ -653,8 +663,11 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
 			 * configured */
 		} else {
 			IEEE80211_DEBUG_DROP("encryption configured, but RX "
-					     "frame not encrypted (SA=%s"
-					     ")\n", print_mac(mac, hdr->addr2));
+					     "frame not encrypted (SA="
+					     MAC_FMT ")\n",
+					     hdr->addr2[0], hdr->addr2[1],
+					     hdr->addr2[2], hdr->addr2[3],
+					     hdr->addr2[4], hdr->addr2[5]);
 			goto rx_dropped;
 		}
 	}
@@ -662,9 +675,11 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
 	if (crypt && !(fc & IEEE80211_FCTL_PROTECTED) && !ieee->open_wep &&
 	    !ieee80211_is_eapol_frame(ieee, skb)) {
 		IEEE80211_DEBUG_DROP("dropped unencrypted RX data "
-				     "frame from %s"
+				     "frame from " MAC_FMT
 				     " (drop_unencrypted=1)\n",
-				     print_mac(mac, hdr->addr2));
+				     hdr->addr2[0], hdr->addr2[1],
+				     hdr->addr2[2], hdr->addr2[3],
+				     hdr->addr2[4], hdr->addr2[5]);
 		goto rx_dropped;
 	}
 
-- 
cgit v1.2.1


From 216bce90b811a35eb5cd2ed8216bdbb1753e9b2b Mon Sep 17 00:00:00 2001
From: Vladimir Koutny <vlado@work.ksp.sk>
Date: Mon, 31 Mar 2008 17:05:10 +0200
Subject: mac80211: use short_preamble mode from capability if ERP IE not
 present

When associating to a b-only AP where there is no ERP IE, short preamble
mode is left at previous state (probably also protection mode). In this
case, disable protection and use short preamble mode as specified in
capability field. The same is done if capability field is changed on-the-fly.

Signed-off-by: Vladimir Koutny <vlado@ksp.sk>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ieee80211_sta.c | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_sta.c b/net/mac80211/ieee80211_sta.c
index e0c72d04584b..c1706855460a 100644
--- a/net/mac80211/ieee80211_sta.c
+++ b/net/mac80211/ieee80211_sta.c
@@ -312,14 +312,12 @@ static void ieee80211_sta_wmm_params(struct net_device *dev,
 	}
 }
 
-
-static u32 ieee80211_handle_erp_ie(struct ieee80211_sub_if_data *sdata,
-				   u8 erp_value)
+static u32 ieee80211_handle_protect_preamb(struct ieee80211_sub_if_data *sdata,
+					   bool use_protection,
+					   bool use_short_preamble)
 {
 	struct ieee80211_bss_conf *bss_conf = &sdata->bss_conf;
 	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-	bool use_protection = (erp_value & WLAN_ERP_USE_PROTECTION) != 0;
-	bool use_short_preamble = (erp_value & WLAN_ERP_BARKER_PREAMBLE) == 0;
 	DECLARE_MAC_BUF(mac);
 	u32 changed = 0;
 
@@ -350,6 +348,32 @@ static u32 ieee80211_handle_erp_ie(struct ieee80211_sub_if_data *sdata,
 	return changed;
 }
 
+static u32 ieee80211_handle_erp_ie(struct ieee80211_sub_if_data *sdata,
+				   u8 erp_value)
+{
+	bool use_protection = (erp_value & WLAN_ERP_USE_PROTECTION) != 0;
+	bool use_short_preamble = (erp_value & WLAN_ERP_BARKER_PREAMBLE) == 0;
+
+	return ieee80211_handle_protect_preamb(sdata,
+			use_protection, use_short_preamble);
+}
+
+static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
+					   struct ieee80211_sta_bss *bss)
+{
+	u32 changed = 0;
+
+	if (bss->has_erp_value)
+		changed |= ieee80211_handle_erp_ie(sdata, bss->erp_value);
+	else {
+		u16 capab = bss->capability;
+		changed |= ieee80211_handle_protect_preamb(sdata, false,
+				(capab & WLAN_CAPABILITY_SHORT_PREAMBLE) != 0);
+	}
+
+	return changed;
+}
+
 int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie,
 				   struct ieee80211_ht_info *ht_info)
 {
@@ -468,9 +492,7 @@ static void ieee80211_set_associated(struct net_device *dev,
 					   local->hw.conf.channel,
 					   ifsta->ssid, ifsta->ssid_len);
 		if (bss) {
-			if (bss->has_erp_value)
-				changed |= ieee80211_handle_erp_ie(
-						sdata, bss->erp_value);
+			changed |= ieee80211_handle_bss_capability(sdata, bss);
 			ieee80211_rx_bss_put(dev, bss);
 		}
 
@@ -2116,6 +2138,11 @@ static void ieee80211_rx_mgmt_beacon(struct net_device *dev,
 
 	if (elems.erp_info && elems.erp_info_len >= 1)
 		changed |= ieee80211_handle_erp_ie(sdata, elems.erp_info[0]);
+	else {
+		u16 capab = le16_to_cpu(mgmt->u.beacon.capab_info);
+		changed |= ieee80211_handle_protect_preamb(sdata, false,
+				(capab & WLAN_CAPABILITY_SHORT_PREAMBLE) != 0);
+	}
 
 	if (elems.ht_cap_elem && elems.ht_info_elem &&
 	    elems.wmm_param && local->ops->conf_ht &&
-- 
cgit v1.2.1


From bcf0dda8d2408fe1c1040cdec5a98e5fcad2ac72 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 9 Apr 2008 15:08:24 -0700
Subject: [XFRM]: xfrm_user: fix selector family initialization

Commit df9dcb45 ([IPSEC]: Fix inter address family IPsec tunnel handling)
broke openswan by removing the selector initialization for tunnel mode
in case it is uninitialized.

This patch restores the initialization, fixing openswan, but probably
breaking inter-family tunnels again (unknown since the patch author
disappeared). The correct thing for inter-family tunnels is probably
to simply initialize the selector family explicitly.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 5d96f2728dc6..019d21de19b3 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -288,7 +288,7 @@ static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *
 	memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr));
 	x->props.flags = p->flags;
 
-	if (x->props.mode == XFRM_MODE_TRANSPORT)
+	if (!x->sel.family)
 		x->sel.family = p->family;
 
 }
-- 
cgit v1.2.1


From 1b9b70ea2ebaab26c3e4fed385dfab6fc16359ed Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 9 Apr 2008 15:14:18 -0700
Subject: [NETFILTER]: xt_hashlimit: fix mask calculation

Shifts larger than the data type are undefined, don't try to shift
an u32 by 32. Also remove some special-casing of bitmasks divisible
by 32.

Based on patch by Jan Engelhardt <jengelh@computergmbh.de>.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/xt_hashlimit.c | 23 +++++------------------
 1 file changed, 5 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index dc29007c52cd..40d344b21453 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -466,38 +466,25 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
 
 static inline __be32 maskl(__be32 a, unsigned int l)
 {
-	return htonl(ntohl(a) & ~(~(u_int32_t)0 >> l));
+	return l ? htonl(ntohl(a) & ~0 << (32 - l)) : 0;
 }
 
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 static void hashlimit_ipv6_mask(__be32 *i, unsigned int p)
 {
 	switch (p) {
-	case 0:
-		i[0] = i[1] = 0;
-		i[2] = i[3] = 0;
-		break;
-	case 1 ... 31:
+	case 0 ... 31:
 		i[0] = maskl(i[0], p);
 		i[1] = i[2] = i[3] = 0;
 		break;
-	case 32:
-		i[1] = i[2] = i[3] = 0;
-		break;
-	case 33 ... 63:
+	case 32 ... 63:
 		i[1] = maskl(i[1], p - 32);
 		i[2] = i[3] = 0;
 		break;
-	case 64:
-		i[2] = i[3] = 0;
-		break;
-	case 65 ... 95:
+	case 64 ... 95:
 		i[2] = maskl(i[2], p - 64);
 		i[3] = 0;
-	case 96:
-		i[3] = 0;
-		break;
-	case 97 ... 127:
+	case 96 ... 127:
 		i[3] = maskl(i[3], p - 96);
 		break;
 	case 128:
-- 
cgit v1.2.1


From 475959d4773e53a2700e523dd30acebbd47556a5 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@computergmbh.de>
Date: Wed, 9 Apr 2008 15:14:58 -0700
Subject: [NETFILTER]: nf_nat: autoload IPv4 connection tracking

Without this patch, the generic L3 tracker would kick in
if nf_conntrack_ipv4 was not loaded before nf_nat, which
would lead to translation problems with ICMP errors.

NAT does not make sense without IPv4 connection tracking
anyway, so just add a call to need_ipv4_conntrack().

Signed-off-by: Jan Engelhardt <jengelh@computergmbh.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/nf_nat_core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 0d5fa3a54d04..36b4e3bb056f 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -629,6 +629,8 @@ static int __init nf_nat_init(void)
 	size_t i;
 	int ret;
 
+	need_ipv4_conntrack();
+
 	ret = nf_ct_extend_register(&nat_extend);
 	if (ret < 0) {
 		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
-- 
cgit v1.2.1


From 951e07c930f5f66b676eaa4c32a1b0d8e2d7d06a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 10 Apr 2008 01:29:36 -0700
Subject: [IPV4]: Fix byte value boundary check in do_ip_getsockopt().

This fixes kernel bugzilla 10371.

As reported by M.Piechaczek@osmosys.tv, if we try to grab a
char sized socket option value, as in:

  unsigned char ttl = 255;
  socklen_t     len = sizeof(ttl);
  setsockopt(socket, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, &len);

  getsockopt(socket, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, &len);

The ttl returned will be wrong on big-endian, and on both little-
endian and big-endian the next three bytes in userspace are written
with garbage.

It's because of this test in do_ip_getsockopt():

	if (len < sizeof(int) && len > 0 && val>=0 && val<255) {

It should allow a 'val' of 255 to pass here, but it doesn't so it
copies a full 'int' back to userspace.

On little-endian that will write the correct value into the location
but it spams on the next three bytes in userspace.  On big endian it
writes the wrong value into the location and spams the next three
bytes.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f72457b4b0a7..c2921d01e925 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1132,7 +1132,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	}
 	release_sock(sk);
 
-	if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
+	if (len < sizeof(int) && len > 0 && val>=0 && val<=255) {
 		unsigned char ucval = (unsigned char)val;
 		len = 1;
 		if (put_user(len, optlen))
-- 
cgit v1.2.1


From 7951f0b03a63d657c72c7d54d306ef3357e7e604 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <dlezcano@fr.ibm.com>
Date: Thu, 10 Apr 2008 20:53:10 -0700
Subject: [NETNS][IPV6] tcp - assign the netns for timewait sockets

Copy the network namespace from the socket to the timewait socket.

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Mark Lord <mlord@pobox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_timewait_sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 876169f3a528..717c411a5c6b 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -124,6 +124,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_hash	    = sk->sk_hash;
 		tw->tw_ipv6only	    = 0;
 		tw->tw_prot	    = sk->sk_prot_creator;
+		tw->tw_net          = sk->sk_net;
 		atomic_set(&tw->tw_refcnt, 1);
 		inet_twsk_dead_node_init(tw);
 		__module_get(tw->tw_prot->owner);
-- 
cgit v1.2.1


From ae1b6a31b1f9ef2c7ba5ef89799f210a9ba6937c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Sat, 12 Apr 2008 18:33:50 -0700
Subject: [AX25]: Potential ax25_uid_assoc-s leaks on module unload.

The ax25_uid_free call walks the ax25_uid_list and releases entries
from it. The problem is that after the fisrt call to hlist_del_init
the hlist_for_each_entry (which hides behind the ax25_uid_for_each)
will consider the current position to be the last and will return.

Thus, the whole list will be left not freed.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ax25/ax25_uid.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index 5f4eb73fb9d3..57aeba729bae 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -218,9 +218,11 @@ void __exit ax25_uid_free(void)
 	struct hlist_node *node;
 
 	write_lock(&ax25_uid_lock);
+again:
 	ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
 		hlist_del_init(&ax25_uid->uid_node);
 		ax25_uid_put(ax25_uid);
+		goto again;
 	}
 	write_unlock(&ax25_uid_lock);
 }
-- 
cgit v1.2.1


From 028b027524b162eef90839a92ba4b8bddf23e06c Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 12 Apr 2008 18:35:41 -0700
Subject: [DCCP]: Fix skb->cb conflicts with IP

dev_queue_xmit() and the other IP output functions expect to get a skb
with clear or properly initialized skb->cb. Unlike TCP and UDP, the
dccp_skb_cb doesn't contain a struct inet_skb_parm at the beginning,
so the DCCP-specific data is interpreted by the IP output functions.
This can cause false negatives for the conditional POST_ROUTING hook
invocation, making the packet bypass the hook.

Add a inet_skb_parm/inet6_skb_parm union to the beginning of
dccp_skb_cb to avoid clashes. Also add a BUILD_BUG_ON to make
sure it fits in the cb.

[ Combined with patch from Gerrit Renker to remove two now unnecessary
  memsets of IPCB(skb)->opt ]

Signed-off-by: Patrick McHardy <kaber@trash.net>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/dccp.h   | 6 ++++++
 net/dccp/ipv4.c   | 1 -
 net/dccp/output.c | 1 -
 net/dccp/proto.c  | 3 +++
 4 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 287a62bc2e0f..ba2ef94a2302 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -325,6 +325,12 @@ static inline int dccp_bad_service_code(const struct sock *sk,
  * This is used for transmission as well as for reception.
  */
 struct dccp_skb_cb {
+	union {
+		struct inet_skb_parm	h4;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		struct inet6_skb_parm	h6;
+#endif
+	} header;
 	__u8  dccpd_type:4;
 	__u8  dccpd_ccval:4;
 	__u8  dccpd_reset_code,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 474075adbde4..b33704415555 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -489,7 +489,6 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
 
 		dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
 							      ireq->rmt_addr);
-		memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 					    ireq->rmt_addr,
 					    ireq->opt);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 3b763db3d863..3d7d628d870d 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -126,7 +126,6 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 
 		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
 
-		memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 		err = icsk->icsk_af_ops->queue_xmit(skb, 0);
 		return net_xmit_eval(err);
 	}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index e3f5d37b84be..c91d3c1fd30d 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1057,6 +1057,9 @@ static int __init dccp_init(void)
 	int ehash_order, bhash_order, i;
 	int rc = -ENOBUFS;
 
+	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
+		     FIELD_SIZEOF(struct sk_buff, cb));
+
 	dccp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("dccp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-- 
cgit v1.2.1


From e56cfad132f2ae269082359d279c17230c987e74 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sat, 12 Apr 2008 18:37:13 -0700
Subject: [NET_SCHED] cls_u32: refcounting fix for u32_delete()

Deleting of nonroot hnodes mostly doesn't work in u32_delete():
refcnt == 1 is expected, but such hnodes' refcnts are initialized
with 0 and charged only with "link" nodes. Now they'll start with
1 like usual. Thanks to Patrick McHardy for an improving suggestion.

Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Acked-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index c5c16b4b6e98..4d755444c449 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -411,8 +411,10 @@ static void u32_destroy(struct tcf_proto *tp)
 			}
 		}
 
-		for (ht=tp_c->hlist; ht; ht = ht->next)
+		for (ht = tp_c->hlist; ht; ht = ht->next) {
+			ht->refcnt--;
 			u32_clear_hnode(tp, ht);
+		}
 
 		while ((ht = tp_c->hlist) != NULL) {
 			tp_c->hlist = ht->next;
@@ -441,8 +443,12 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
 	if (tp->root == ht)
 		return -EINVAL;
 
-	if (--ht->refcnt == 0)
+	if (ht->refcnt == 1) {
+		ht->refcnt--;
 		u32_destroy_hnode(tp, ht);
+	} else {
+		return -EBUSY;
+	}
 
 	return 0;
 }
@@ -568,7 +574,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
 		if (ht == NULL)
 			return -ENOBUFS;
 		ht->tp_c = tp_c;
-		ht->refcnt = 0;
+		ht->refcnt = 1;
 		ht->divisor = divisor;
 		ht->handle = handle;
 		ht->prio = tp->prio;
-- 
cgit v1.2.1


From 72da7b3860cabf427590b4982bc880bafab4d5c8 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Sat, 12 Apr 2008 18:39:19 -0700
Subject: [SCTP]: Add check for hmac_algo parameter in sctp_verify_param()

RFC 4890 has the following text:

  The HMAC algorithm based on SHA-1 MUST be supported and
  included in the HMAC-ALGO parameter.

As a result, we need to check in sctp_verify_param() that HMAC_SHA1 is
present in the list.  If not, we should probably treat this as a
protocol violation.

It should also be a protocol violation if the HMAC parameter is empty.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 578630e8e00d..36ebb392472e 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1982,7 +1982,10 @@ static sctp_ierror_t sctp_verify_param(const struct sctp_association *asoc,
 					struct sctp_chunk *chunk,
 					struct sctp_chunk **err_chunk)
 {
+	struct sctp_hmac_algo_param *hmacs;
 	int retval = SCTP_IERROR_NO_ERROR;
+	__u16 n_elt, id = 0;
+	int i;
 
 	/* FIXME - This routine is not looking at each parameter per the
 	 * chunk type, i.e., unrecognized parameters should be further
@@ -2056,9 +2059,29 @@ static sctp_ierror_t sctp_verify_param(const struct sctp_association *asoc,
 		break;
 
 	case SCTP_PARAM_HMAC_ALGO:
-		if (sctp_auth_enable)
-			break;
-		/* Fall Through */
+		if (!sctp_auth_enable)
+			goto fallthrough;
+
+		hmacs = (struct sctp_hmac_algo_param *)param.p;
+		n_elt = (ntohs(param.p->length) - sizeof(sctp_paramhdr_t)) >> 1;
+
+		/* SCTP-AUTH: Section 6.1
+		 * The HMAC algorithm based on SHA-1 MUST be supported and
+		 * included in the HMAC-ALGO parameter.
+		 */
+		for (i = 0; i < n_elt; i++) {
+			id = ntohs(hmacs->hmac_ids[i]);
+
+			if (id == SCTP_AUTH_HMAC_ID_SHA1)
+				break;
+		}
+
+		if (id != SCTP_AUTH_HMAC_ID_SHA1) {
+			sctp_process_inv_paramlength(asoc, param.p, chunk,
+						     err_chunk);
+			retval = SCTP_IERROR_ABORT;
+		}
+		break;
 fallthrough:
 	default:
 		SCTP_DEBUG_PRINTK("Unrecognized param: %d for chunk %d.\n",
-- 
cgit v1.2.1


From f4ad85ca3ef8a1ede76c5020a28a8f4057b4d24f Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Sat, 12 Apr 2008 18:39:34 -0700
Subject: [SCTP]: Fix protocol violation when receiving an error lenght
 INIT-ACK

When receiving an error length INIT-ACK during COOKIE-WAIT,
a 0-vtag ABORT will be responsed. This action violates the
protocol apparently. This patch achieves the following things.
1 If the INIT-ACK contains all the fixed parameters, use init-tag
  recorded from INIT-ACK as vtag.
2 If the INIT-ACK doesn't contain all the fixed parameters,
  just reflect its vtag.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c      |  3 +++
 net/sctp/sm_sideeffect.c |  3 +++
 net/sctp/sm_statefuns.c  | 18 ++++++++++++++++++
 3 files changed, 24 insertions(+)

(limited to 'net')

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 1bb3c5c35d2a..c0714469233c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -793,6 +793,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			break;
 
 		case SCTP_CID_ABORT:
+			if (sctp_test_T_bit(chunk)) {
+				packet->vtag = asoc->c.my_vtag;
+			}
 		case SCTP_CID_SACK:
 		case SCTP_CID_HEARTBEAT:
 		case SCTP_CID_HEARTBEAT_ACK:
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 28eb38eb6083..a4763fd24fd8 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1536,6 +1536,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 			error = sctp_auth_asoc_init_active_key(asoc,
 						GFP_ATOMIC);
 			break;
+		case SCTP_CMD_UPDATE_INITTAG:
+			asoc->peer.i.init_tag = cmd->obj.u32;
+			break;
 
 		default:
 			printk(KERN_WARNING "Impossible command: %u, %p\n",
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index f2ed6473feef..3ef97499df0d 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4144,6 +4144,24 @@ static sctp_disposition_t sctp_sf_abort_violation(
 		goto nomem;
 
 	if (asoc) {
+		/* Treat INIT-ACK as a special case during COOKIE-WAIT. */
+		if (chunk->chunk_hdr->type == SCTP_CID_INIT_ACK &&
+		    !asoc->peer.i.init_tag) {
+			sctp_initack_chunk_t *initack;
+
+			initack = (sctp_initack_chunk_t *)chunk->chunk_hdr;
+			if (!sctp_chunk_length_valid(chunk,
+						     sizeof(sctp_initack_chunk_t)))
+				abort->chunk_hdr->flags |= SCTP_CHUNK_FLAG_T;
+			else {
+				unsigned int inittag;
+
+				inittag = ntohl(initack->init_hdr.init_tag);
+				sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_INITTAG,
+						SCTP_U32(inittag));
+			}
+		}
+
 		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
 		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
 
-- 
cgit v1.2.1


From ab38fb04c9f8928cfaf6f4966633d783419906a1 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Sat, 12 Apr 2008 18:40:06 -0700
Subject: [SCTP]: Fix compiler warning about const qualifiers

Fix 3 warnings about discarding const qualifiers:

net/sctp/ulpevent.c:862: warning: passing argument 1 of 'sctp_event2skb' discards qualifiers from pointer target type
net/sctp/sm_statefuns.c:4393: warning: passing argument 1 of 'SCTP_ASOC' discards qualifiers from pointer target type
net/sctp/socket.c:5874: warning: passing argument 1 of 'cmsg_nxthdr' discards qualifiers from pointer target type

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_statefuns.c | 5 +++--
 net/sctp/socket.c       | 5 +++--
 net/sctp/ulpevent.c     | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 3ef97499df0d..07194c2a32df 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4367,6 +4367,7 @@ sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep,
 				       sctp_cmd_seq_t *commands)
 {
 	struct sctp_chunk *repl;
+	struct sctp_association* my_asoc;
 
 	/* The comment below says that we enter COOKIE-WAIT AFTER
 	 * sending the INIT, but that doesn't actually work in our
@@ -4390,8 +4391,8 @@ sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep,
 	/* Cast away the const modifier, as we want to just
 	 * rerun it through as a sideffect.
 	 */
-	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC,
-			SCTP_ASOC((struct sctp_association *) asoc));
+	my_asoc = (struct sctp_association *)asoc;
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(my_asoc));
 
 	/* Choose transport for INIT. */
 	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d994d822900d..998e63a31311 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5868,11 +5868,12 @@ SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *msg,
 				  sctp_cmsgs_t *cmsgs)
 {
 	struct cmsghdr *cmsg;
+	struct msghdr *my_msg = (struct msghdr *)msg;
 
 	for (cmsg = CMSG_FIRSTHDR(msg);
 	     cmsg != NULL;
-	     cmsg = CMSG_NXTHDR((struct msghdr*)msg, cmsg)) {
-		if (!CMSG_OK(msg, cmsg))
+	     cmsg = CMSG_NXTHDR(my_msg, cmsg)) {
+		if (!CMSG_OK(my_msg, cmsg))
 			return -EINVAL;
 
 		/* Should we parse this header or ignore?  */
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index b43f1f110f87..ce6cda6b6994 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -859,7 +859,7 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event)
 	union sctp_notification *notification;
 	struct sk_buff *skb;
 
-	skb = sctp_event2skb((struct sctp_ulpevent *)event);
+	skb = sctp_event2skb(event);
 	notification = (union sctp_notification *) skb->data;
 	return notification->sn_header.sn_type;
 }
-- 
cgit v1.2.1


From a40a7d15ba602b547f56b7b19e0282fe4fc3dee3 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Sat, 12 Apr 2008 18:40:38 -0700
Subject: [SCTP]: IPv4 vs IPv6 addresses mess in sctp_inet[6]addr_event.

All IP addresses that are present in a system are duplicated on
struct sctp_sockaddr_entry. They are linked in the global list
called sctp_local_addr_list. And this struct unions IPv4 and IPv6
addresses.

So, there can be rare case, when a sockaddr_in.sin_addr coincides
with the corresponding part of the sockaddr_in6 and the notifier
for IPv4 will carry away an IPv6 entry.

The fix is to check the family before comparing the addresses.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c     | 5 +++--
 net/sctp/protocol.c | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index b1e05d719f9b..85f1495e0edc 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -110,8 +110,9 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
 		spin_lock_bh(&sctp_local_addr_lock);
 		list_for_each_entry_safe(addr, temp,
 					&sctp_local_addr_list, list) {
-			if (ipv6_addr_equal(&addr->a.v6.sin6_addr,
-					     &ifa->addr)) {
+			if (addr->a.sa.sa_family == AF_INET6 &&
+					ipv6_addr_equal(&addr->a.v6.sin6_addr,
+						&ifa->addr)) {
 				found = 1;
 				addr->valid = 0;
 				list_del_rcu(&addr->list);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f90091a1b9ce..c2dd65d9f38d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -647,7 +647,9 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
 		spin_lock_bh(&sctp_local_addr_lock);
 		list_for_each_entry_safe(addr, temp,
 					&sctp_local_addr_list, list) {
-			if (addr->a.v4.sin_addr.s_addr == ifa->ifa_local) {
+			if (addr->a.sa.sa_family == AF_INET &&
+					addr->a.v4.sin_addr.s_addr ==
+					ifa->ifa_local) {
 				found = 1;
 				addr->valid = 0;
 				list_del_rcu(&addr->list);
-- 
cgit v1.2.1


From f37f0afb2916ccf287428983026261db78c7661a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 13 Apr 2008 21:39:26 -0700
Subject: [SOCK] sk_stamp: should be initialized to ktime_set(-1L, 0)

Problem spotted by Andrew Brampton

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 2654c147c004..7a0567b4b2c9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1725,7 +1725,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
 
-	sk->sk_stamp = ktime_set(-1L, -1L);
+	sk->sk_stamp = ktime_set(-1L, 0);
 
 	atomic_set(&sk->sk_refcnt, 1);
 	atomic_set(&sk->sk_drops, 0);
-- 
cgit v1.2.1


From 4c821d753d5c097babd6609bcd85f08e254a3505 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Sun, 13 Apr 2008 21:52:48 -0700
Subject: [NET]: Fix kernel-doc for skb_segment

The kernel-doc comment for skb_segment is clearly wrong.  This states
what it actually does.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0d0fd28a9041..608701339620 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2131,8 +2131,8 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
  *	@features: features for the output path (see dev->features)
  *
  *	This function performs segmentation on the given skb.  It returns
- *	the segment at the given position.  It returns NULL if there are
- *	no more segments to generate, or when an error is encountered.
+ *	a pointer to the first in a list of new skbs for the segments.
+ *	In case of error it returns ERR_PTR(err).
  */
 struct sk_buff *skb_segment(struct sk_buff *skb, int features)
 {
-- 
cgit v1.2.1


From 2ed9926e16094ad143b96b09c64cba8bcba05ee1 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 13 Apr 2008 22:45:40 -0700
Subject: [NET]: Return more appropriate error from eth_validate_addr().

Paul Bolle wrote:
> http://bugzilla.kernel.org/show_bug.cgi?id=9923 would have been much easier to
> track down if eth_validate_addr() would somehow complain aloud if an address
> is invalid. Shouldn't it make at least some noise?

I guess it should return -EADDRNOTAVAIL similar to eth_mac_addr()
when validation fails.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ethernet/eth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index a7b417523e9b..a80839b02e3f 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -301,7 +301,7 @@ static int eth_change_mtu(struct net_device *dev, int new_mtu)
 static int eth_validate_addr(struct net_device *dev)
 {
 	if (!is_valid_ether_addr(dev->dev_addr))
-		return -EINVAL;
+		return -EADDRNOTAVAIL;
 
 	return 0;
 }
-- 
cgit v1.2.1


From b45e9189c058bfa495073951ff461ee0eea968be Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 13 Apr 2008 23:14:15 -0700
Subject: [IPV6]: Fix ipv6 address fetching in raw6_icmp_error().

Fixes kernel bugzilla 10437

Based almost entirely upon a patch by Dmitry Butskoy.

When deciding what raw sockets to deliver the ICMPv6
to, we should use the addresses in the ICMPv6 quoted
IPV6 header, not the top-level one.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 8897ccf8086a..0a6fbc1d1a50 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -372,8 +372,10 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
 	read_lock(&raw_v6_hashinfo.lock);
 	sk = sk_head(&raw_v6_hashinfo.ht[hash]);
 	if (sk != NULL) {
-		saddr = &ipv6_hdr(skb)->saddr;
-		daddr = &ipv6_hdr(skb)->daddr;
+		struct ipv6hdr *hdr = (struct ipv6hdr *) skb->data;
+
+		saddr = &hdr->saddr;
+		daddr = &hdr->daddr;
 		net = skb->dev->nd_net;
 
 		while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr,
-- 
cgit v1.2.1


From b077d7ababdb5433aef18c62bf1f785e8729f49a Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Sun, 13 Apr 2008 23:42:18 -0700
Subject: [IPV6] ADDRCONF: Ensure disabling multicast RS even if privacy
 extensions are disabled.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a65935a9afd9..b9eeb4f51d48 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -371,6 +371,15 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
 	 */
 	in6_dev_hold(ndev);
 
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+	if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) {
+		printk(KERN_INFO
+		       "%s: Disabled Multicast RS\n",
+		       dev->name);
+		ndev->cnf.rtr_solicits = 0;
+	}
+#endif
+
 #ifdef CONFIG_IPV6_PRIVACY
 	setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev);
 	if ((dev->flags&IFF_LOOPBACK) ||
@@ -383,13 +392,6 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
 		       "%s: Disabled Privacy Extensions\n",
 		       dev->name);
 		ndev->cnf.use_tempaddr = -1;
-
-		if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) {
-			printk(KERN_INFO
-			       "%s: Disabled Multicast RS\n",
-			       dev->name);
-			ndev->cnf.rtr_solicits = 0;
-		}
 	} else {
 		in6_dev_hold(ndev);
 		ipv6_regen_rndid((unsigned long) ndev);
-- 
cgit v1.2.1


From 9625ed72e8bd619c3984f3024bd37143b7f0c7b0 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Sun, 13 Apr 2008 23:47:11 -0700
Subject: [IPV6] ADDRCONF: Don't generate temporary address for ip6-ip6
 interface.

As far as I can remember, I was going to disable privacy extensions
on all "tunnel" interfaces.  Disable it on ip6-ip6 interface as well.

Also, just remove ifdefs for SIT for simplicity.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b9eeb4f51d48..e08955baedff 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -384,9 +384,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
 	setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev);
 	if ((dev->flags&IFF_LOOPBACK) ||
 	    dev->type == ARPHRD_TUNNEL ||
-#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+	    dev->type == ARPHRD_TUNNEL6 ||
 	    dev->type == ARPHRD_SIT ||
-#endif
 	    dev->type == ARPHRD_NONE) {
 		printk(KERN_INFO
 		       "%s: Disabled Privacy Extensions\n",
-- 
cgit v1.2.1


From 4dee959723e2bf3a0f9343a46841cd2f0029d424 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 14 Apr 2008 00:44:52 -0700
Subject: [NETFILTER]: ipt_CLUSTERIP: fix race between
 clusterip_config_find_get and _entry_put

Consider we are putting a clusterip_config entry with the "entries"
count == 1, and on the other CPU there's a clusterip_config_find_get
in progress:

CPU1:							CPU2:
clusterip_config_entry_put:				clusterip_config_find_get:
if (atomic_dec_and_test(&c->entries)) {
	/* true */
							read_lock_bh(&clusterip_lock);
							c = __clusterip_config_find(clusterip);
							/* found - it's still in list */
							...
							atomic_inc(&c->entries);
							read_unlock_bh(&clusterip_lock);

	write_lock_bh(&clusterip_lock);
	list_del(&c->list);
	write_unlock_bh(&clusterip_lock);
	...
	dev_put(c->dev);

Oops! We have an entry returned by the clusterip_config_find_get,
which is a) not in list b) has a stale dev pointer.

The problems will happen when the CPU2 will release the entry - it
will remove it from the list for the 2nd time, thus spoiling it, and
will put a stale dev pointer.

The fix is to make atomic_dec_and_test under the clusterip_lock.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 52926c8e3cc1..a12dd329e208 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -82,8 +82,8 @@ clusterip_config_put(struct clusterip_config *c)
 static inline void
 clusterip_config_entry_put(struct clusterip_config *c)
 {
+	write_lock_bh(&clusterip_lock);
 	if (atomic_dec_and_test(&c->entries)) {
-		write_lock_bh(&clusterip_lock);
 		list_del(&c->list);
 		write_unlock_bh(&clusterip_lock);
 
@@ -96,7 +96,9 @@ clusterip_config_entry_put(struct clusterip_config *c)
 #ifdef CONFIG_PROC_FS
 		remove_proc_entry(c->pde->name, c->pde->parent);
 #endif
+		return;
 	}
+	write_unlock_bh(&clusterip_lock);
 }
 
 static struct clusterip_config *
-- 
cgit v1.2.1


From 159d83363b629c91d020734207c1bc788b96af5a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 14 Apr 2008 00:46:01 -0700
Subject: [BRIDGE]: Fix crash in __ip_route_output_key with bridge netfilter

The bridge netfilter code attaches a fake dst_entry with a pointer to a
fake net_device structure to skbs it passes up to IPv4 netfilter. This
leads to crashes when the skb is passed to __ip_route_output_key when
dereferencing the namespace pointer.

Since bridging can currently only operate in the init_net namespace,
the easiest fix for now is to initialize the nd_net pointer of the
fake net_device struct to &init_net.

Should fix bugzilla 10323: http://bugzilla.kernel.org/show_bug.cgi?id=10323

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netfilter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 1c0efd8ad9f3..af7e8be8d8d2 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -110,7 +110,8 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
  * ipt_REJECT needs it.  Future netfilter modules might
  * require us to fill additional fields. */
 static struct net_device __fake_net_device = {
-	.hard_header_len	= ETH_HLEN
+	.hard_header_len	= ETH_HLEN,
+	.nd_net			= &init_net,
 };
 
 static struct rtable __fake_rtable = {
-- 
cgit v1.2.1