From 4878809f711981a602cc562eb47994fc81ea0155 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Fri, 14 Sep 2007 16:41:03 -0700
Subject: [NET]: Fix two issues wrt. SO_BINDTODEVICE.

1) Comments suggest that setting optlen to zero will unbind
   the socket from whatever device it might be attached to.  This
   hasn't been the case since at least 2.2.x because the first thing
   this function does is return -EINVAL if 'optlen' is less than
   sizeof(int).

   This check also means that passing in a two byte string doesn't
   work so well.  It's almost as if this code was testing with "eth?"
   patterned strings and nothing else :-)

   Fix this by breaking the logic of this facility out into a
   seperate function which validates optlen more appropriately.

   The optlen==0 and small string cases now work properly.

2) We should reset the cached route of the socket after we have made
   the device binding changes, not before.

Reported by Ben Greear.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 106 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 58 insertions(+), 48 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index cfed7d42c485..190de61cd648 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -362,6 +362,61 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 }
 EXPORT_SYMBOL(sk_dst_check);
 
+static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+{
+	int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+	char devname[IFNAMSIZ];
+	int index;
+
+	/* Sorry... */
+	ret = -EPERM;
+	if (!capable(CAP_NET_RAW))
+		goto out;
+
+	ret = -EINVAL;
+	if (optlen < 0)
+		goto out;
+
+	/* Bind this socket to a particular device like "eth0",
+	 * as specified in the passed interface name. If the
+	 * name is "" or the option length is zero the socket
+	 * is not bound.
+	 */
+	if (optlen > IFNAMSIZ - 1)
+		optlen = IFNAMSIZ - 1;
+	memset(devname, 0, sizeof(devname));
+
+	ret = -EFAULT;
+	if (copy_from_user(devname, optval, optlen))
+		goto out;
+
+	if (devname[0] == '\0') {
+		index = 0;
+	} else {
+		struct net_device *dev = dev_get_by_name(devname);
+
+		ret = -ENODEV;
+		if (!dev)
+			goto out;
+
+		index = dev->ifindex;
+		dev_put(dev);
+	}
+
+	lock_sock(sk);
+	sk->sk_bound_dev_if = index;
+	sk_dst_reset(sk);
+	release_sock(sk);
+
+	ret = 0;
+
+out:
+#endif
+
+	return ret;
+}
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
@@ -390,6 +445,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	}
 #endif
 
+	if (optname == SO_BINDTODEVICE)
+		return sock_bindtodevice(sk, optval, optlen);
+
 	if (optlen < sizeof(int))
 		return -EINVAL;
 
@@ -578,54 +636,6 @@ set_rcvbuf:
 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 		break;
 
-#ifdef CONFIG_NETDEVICES
-	case SO_BINDTODEVICE:
-	{
-		char devname[IFNAMSIZ];
-
-		/* Sorry... */
-		if (!capable(CAP_NET_RAW)) {
-			ret = -EPERM;
-			break;
-		}
-
-		/* Bind this socket to a particular device like "eth0",
-		 * as specified in the passed interface name. If the
-		 * name is "" or the option length is zero the socket
-		 * is not bound.
-		 */
-
-		if (!valbool) {
-			sk->sk_bound_dev_if = 0;
-		} else {
-			if (optlen > IFNAMSIZ - 1)
-				optlen = IFNAMSIZ - 1;
-			memset(devname, 0, sizeof(devname));
-			if (copy_from_user(devname, optval, optlen)) {
-				ret = -EFAULT;
-				break;
-			}
-
-			/* Remove any cached route for this socket. */
-			sk_dst_reset(sk);
-
-			if (devname[0] == '\0') {
-				sk->sk_bound_dev_if = 0;
-			} else {
-				struct net_device *dev = dev_get_by_name(devname);
-				if (!dev) {
-					ret = -ENODEV;
-					break;
-				}
-				sk->sk_bound_dev_if = dev->ifindex;
-				dev_put(dev);
-			}
-		}
-		break;
-	}
-#endif
-
-
 	case SO_ATTACH_FILTER:
 		ret = -EINVAL;
 		if (optlen == sizeof(struct sock_fprog)) {
-- 
cgit v1.2.1


From e1e992e52faa588667e1378a2573b4b8e3fa6670 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Wed, 12 Sep 2007 16:32:59 +0200
Subject: [NET_SCHED] protect action config/dump from irqs

(with no apologies to C Heston)

On Mon, 2007-10-09 at 21:00 +0800, Herbert Xu wrote:
On Sun, Sep 02, 2007 at 01:11:29PM +0000, Christian Kujau wrote:
> >
> > after upgrading to 2.6.23-rc5 (and applying davem's fix [0]), lockdep
> > was quite noisy when I tried to shape my external (wireless) interface:
> >
> > [ 6400.534545] FahCore_78.exe/3552 just changed the state of lock:
> > [ 6400.534713]  (&dev->ingress_lock){-+..}, at: [<c038d595>]
> > netif_receive_skb+0x2d5/0x3c0
> > [ 6400.534941] but this lock took another, soft-read-irq-unsafe lock in the
> > past:
> > [ 6400.535145]  (police_lock){-.--}
>
> This is a genuine dead-lock.  The police lock can be taken
> for reading with softirqs on.  If a second CPU tries to take
> the police lock for writing, while holding the ingress lock,
> then a softirq on the first CPU can dead-lock when it tries
> to get the ingress lock.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c    | 8 ++++----
 net/sched/act_police.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index feef366cad5d..72cdb0fade20 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -68,7 +68,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
 	int err = 0, index = -1,i = 0, s_i = 0, n_i = 0;
 	struct rtattr *r ;
 
-	read_lock(hinfo->lock);
+	read_lock_bh(hinfo->lock);
 
 	s_i = cb->args[0];
 
@@ -96,7 +96,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
 		}
 	}
 done:
-	read_unlock(hinfo->lock);
+	read_unlock_bh(hinfo->lock);
 	if (n_i)
 		cb->args[0] += n_i;
 	return n_i;
@@ -156,13 +156,13 @@ struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
 {
 	struct tcf_common *p;
 
-	read_lock(hinfo->lock);
+	read_lock_bh(hinfo->lock);
 	for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p;
 	     p = p->tcfc_next) {
 		if (p->tcfc_index == index)
 			break;
 	}
-	read_unlock(hinfo->lock);
+	read_unlock_bh(hinfo->lock);
 
 	return p;
 }
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 6085be578459..17f6f27e28a2 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -56,7 +56,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
 	struct rtattr *r;
 
-	read_lock(&police_lock);
+	read_lock_bh(&police_lock);
 
 	s_i = cb->args[0];
 
@@ -85,7 +85,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c
 		}
 	}
 done:
-	read_unlock(&police_lock);
+	read_unlock_bh(&police_lock);
 	if (n_i)
 		cb->args[0] += n_i;
 	return n_i;
-- 
cgit v1.2.1


From 3ef9d943d26dea764f4fecf3767001c90b778b0c Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Fri, 14 Sep 2007 16:45:40 -0700
Subject: [IPV6]: Fix unbalanced socket reference with MSG_CONFIRM.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e27383d855de..77167afa3455 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -882,11 +882,10 @@ back_from_confirm:
 			ip6_flush_pending_frames(sk);
 		else if (!(msg->msg_flags & MSG_MORE))
 			err = rawv6_push_pending_frames(sk, &fl, rp);
+		release_sock(sk);
 	}
 done:
 	dst_release(dst);
-	if (!inet->hdrincl)
-		release_sock(sk);
 out:
 	fl6_sock_release(flowlabel);
 	return err<0?err:len;
-- 
cgit v1.2.1


From cd562c9859f648d78224e9fc0dafa5a3d5000fdb Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Fri, 14 Sep 2007 17:15:01 -0700
Subject: [IPV6]: Just increment OutDatagrams once per a datagram.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/udp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4210951edb6e..c347f3e30e2e 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -555,6 +555,8 @@ static int udp_v6_push_pending_frames(struct sock *sk)
 out:
 	up->len = 0;
 	up->pending = 0;
+	if (!err)
+		UDP6_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, up->pcflag);
 	return err;
 }
 
@@ -823,10 +825,8 @@ do_append_data:
 	release_sock(sk);
 out:
 	fl6_sock_release(flowlabel);
-	if (!err) {
-		UDP6_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite);
+	if (!err)
 		return len;
-	}
 	/*
 	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
 	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
-- 
cgit v1.2.1


From 2a0c6c980d209827e5d69017fa736f107a744d1d Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Fri, 14 Sep 2007 17:15:19 -0700
Subject: [IPV4]: Just increment OutDatagrams once per a datagram.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 28355350fb62..69d4bd10f9c6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -505,6 +505,8 @@ send:
 out:
 	up->len = 0;
 	up->pending = 0;
+	if (!err)
+		UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, up->pcflag);
 	return err;
 }
 
@@ -693,10 +695,8 @@ out:
 	ip_rt_put(rt);
 	if (free)
 		kfree(ipc.opt);
-	if (!err) {
-		UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite);
+	if (!err)
 		return len;
-	}
 	/*
 	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
 	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
-- 
cgit v1.2.1


From 6ae5f983cf8de769214d2d9e8a783c881eccd4cd Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Sun, 16 Sep 2007 14:48:21 -0700
Subject: [IPV6]: Fix source address selection.

The commit 95c385 broke proper source address selection for cases in which
there is a address which is makred 'deprecated'. The commit mistakenly
changed ifa->flags to ifa_result->flags (probably copy/paste error from a
few lines above) in the 'Rule 3' address selection code.

The patch restores the previous RFC-compliant behavior.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 91ef3be5abad..45b4c82148a0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1021,7 +1021,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
 				hiscore.rule++;
 			}
 			if (ipv6_saddr_preferred(score.addr_type) ||
-			   (((ifa_result->flags &
+			   (((ifa->flags &
 			    (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)) == 0))) {
 				score.attrs |= IPV6_SADDR_SCORE_PREFERRED;
 				if (!(hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)) {
-- 
cgit v1.2.1


From ce5d0b47f13f83dfb9fbb8ac91adad7120747aaf Mon Sep 17 00:00:00 2001
From: Adit Ranadive <adit.262@gmail.com>
Date: Sun, 16 Sep 2007 14:52:15 -0700
Subject: [PKTGEN]: srcmac fix

From: Adit Ranadive <adit.262@gmail.com>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 36fdea71d742..803d0c8826af 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -111,6 +111,9 @@
  *
  * 802.1Q/Q-in-Q support by Francesco Fondelli (FF) <francesco.fondelli@gmail.com>
  *
+ * Fixed src_mac command to set source mac of packet to value specified in
+ * command by Adit Ranadive <adit.262@gmail.com>
+ *
  */
 #include <linux/sys.h>
 #include <linux/types.h>
@@ -1451,8 +1454,11 @@ static ssize_t pktgen_if_write(struct file *file,
 	}
 	if (!strcmp(name, "src_mac")) {
 		char *v = valstr;
+		unsigned char old_smac[ETH_ALEN];
 		unsigned char *m = pkt_dev->src_mac;
 
+		memcpy(old_smac, pkt_dev->src_mac, ETH_ALEN);
+
 		len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
 		if (len < 0) {
 			return len;
@@ -1481,6 +1487,10 @@ static ssize_t pktgen_if_write(struct file *file,
 			}
 		}
 
+		/* Set up Src MAC */
+		if (compare_ether_addr(old_smac, pkt_dev->src_mac))
+			memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN);
+
 		sprintf(pg_result, "OK: srcmac");
 		return count;
 	}
-- 
cgit v1.2.1


From ddeee3ce7fbf0e800f2a26a76d6018b42b337cc2 Mon Sep 17 00:00:00 2001
From: Satyam Sharma <satyam@infradead.org>
Date: Sun, 16 Sep 2007 14:54:05 -0700
Subject: [PKT_SCHED]: sch_cbq.c: Shut up uninitialized variable warning

net/sched/sch_cbq.c: In function 'cbq_enqueue':
net/sched/sch_cbq.c:383: warning: 'ret' may be used uninitialized in this function

has been verified to be a bogus case. So let's shut it up.

Signed-off-by: Satyam Sharma <satyam@infradead.org>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_cbq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index e38c2839b25c..cbef3bbfc20f 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -380,7 +380,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	int len = skb->len;
-	int ret;
+	int uninitialized_var(ret);
 	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 
 #ifdef CONFIG_NET_CLS_ACT
-- 
cgit v1.2.1


From 293035479942400a7fe8e4f72465d4e4e466b91a Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Sun, 16 Sep 2007 16:02:12 -0700
Subject: [SCTP]: Add RCU synchronization around sctp_localaddr_list

sctp_localaddr_list is modified dynamically via NETDEV_UP
and NETDEV_DOWN events, but there is not synchronization
between writer (even handler) and readers.  As a result,
the readers can access an entry that has been freed and
crash the sytem.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Sridhar Samdurala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h    |  1 +
 include/net/sctp/structs.h |  6 ++++++
 net/sctp/bind_addr.c       |  2 ++
 net/sctp/ipv6.c            | 34 ++++++++++++++++++++---------
 net/sctp/protocol.c        | 54 +++++++++++++++++++++++++++++++++-------------
 net/sctp/socket.c          | 38 +++++++++++++++++++++-----------
 6 files changed, 97 insertions(+), 38 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index d529045c1679..c9cc00c85782 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -123,6 +123,7 @@
  * sctp/protocol.c
  */
 extern struct sock *sctp_get_ctl_sock(void);
+extern void sctp_local_addr_free(struct rcu_head *head);
 extern int sctp_copy_local_addr_list(struct sctp_bind_addr *,
 				     sctp_scope_t, gfp_t gfp,
 				     int flags);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index c0d5848c33dc..a89e36197afb 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -207,6 +207,9 @@ extern struct sctp_globals {
 	 * It is a list of sctp_sockaddr_entry.
 	 */
 	struct list_head local_addr_list;
+
+	/* Lock that protects the local_addr_list writers */
+	spinlock_t addr_list_lock;
 	
 	/* Flag to indicate if addip is enabled. */
 	int addip_enable;
@@ -242,6 +245,7 @@ extern struct sctp_globals {
 #define sctp_port_alloc_lock		(sctp_globals.port_alloc_lock)
 #define sctp_port_hashtable		(sctp_globals.port_hashtable)
 #define sctp_local_addr_list		(sctp_globals.local_addr_list)
+#define sctp_local_addr_lock		(sctp_globals.addr_list_lock)
 #define sctp_addip_enable		(sctp_globals.addip_enable)
 #define sctp_prsctp_enable		(sctp_globals.prsctp_enable)
 
@@ -737,8 +741,10 @@ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk);
 /* This is a structure for holding either an IPv6 or an IPv4 address.  */
 struct sctp_sockaddr_entry {
 	struct list_head list;
+	struct rcu_head	rcu;
 	union sctp_addr a;
 	__u8 use_as_src;
+	__u8 valid;
 };
 
 typedef struct sctp_chunk *(sctp_packet_phandler_t)(struct sctp_association *);
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index fdb287a9e2e2..7fc369f9035d 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -163,8 +163,10 @@ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
 		addr->a.v4.sin_port = htons(bp->port);
 
 	addr->use_as_src = use_as_src;
+	addr->valid = 1;
 
 	INIT_LIST_HEAD(&addr->list);
+	INIT_RCU_HEAD(&addr->rcu);
 	list_add_tail(&addr->list, &bp->address_list);
 	SCTP_DBG_OBJCNT_INC(addr);
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index f8aa23dda1c1..e12fa0a91da4 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -77,13 +77,18 @@
 
 #include <asm/uaccess.h>
 
-/* Event handler for inet6 address addition/deletion events.  */
+/* Event handler for inet6 address addition/deletion events.
+ * The sctp_local_addr_list needs to be protocted by a spin lock since
+ * multiple notifiers (say IPv4 and IPv6) may be running at the same
+ * time and thus corrupt the list.
+ * The reader side is protected with RCU.
+ */
 static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
 				void *ptr)
 {
 	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
-	struct sctp_sockaddr_entry *addr;
-	struct list_head *pos, *temp;
+	struct sctp_sockaddr_entry *addr = NULL;
+	struct sctp_sockaddr_entry *temp;
 
 	switch (ev) {
 	case NETDEV_UP:
@@ -94,19 +99,26 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
 			memcpy(&addr->a.v6.sin6_addr, &ifa->addr,
 				 sizeof(struct in6_addr));
 			addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex;
-			list_add_tail(&addr->list, &sctp_local_addr_list);
+			addr->valid = 1;
+			spin_lock_bh(&sctp_local_addr_lock);
+			list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
+			spin_unlock_bh(&sctp_local_addr_lock);
 		}
 		break;
 	case NETDEV_DOWN:
-		list_for_each_safe(pos, temp, &sctp_local_addr_list) {
-			addr = list_entry(pos, struct sctp_sockaddr_entry, list);
-			if (ipv6_addr_equal(&addr->a.v6.sin6_addr, &ifa->addr)) {
-				list_del(pos);
-				kfree(addr);
+		spin_lock_bh(&sctp_local_addr_lock);
+		list_for_each_entry_safe(addr, temp,
+					&sctp_local_addr_list, list) {
+			if (ipv6_addr_equal(&addr->a.v6.sin6_addr,
+					     &ifa->addr)) {
+				addr->valid = 0;
+				list_del_rcu(&addr->list);
 				break;
 			}
 		}
-
+		spin_unlock_bh(&sctp_local_addr_lock);
+		if (addr && !addr->valid)
+			call_rcu(&addr->rcu, sctp_local_addr_free);
 		break;
 	}
 
@@ -367,7 +379,9 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist,
 			addr->a.v6.sin6_port = 0;
 			addr->a.v6.sin6_addr = ifp->addr;
 			addr->a.v6.sin6_scope_id = dev->ifindex;
+			addr->valid = 1;
 			INIT_LIST_HEAD(&addr->list);
+			INIT_RCU_HEAD(&addr->rcu);
 			list_add_tail(&addr->list, addrlist);
 		}
 	}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index e98579b788b8..7ee120e85913 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -153,6 +153,9 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist,
 			addr->a.v4.sin_family = AF_INET;
 			addr->a.v4.sin_port = 0;
 			addr->a.v4.sin_addr.s_addr = ifa->ifa_local;
+			addr->valid = 1;
+			INIT_LIST_HEAD(&addr->list);
+			INIT_RCU_HEAD(&addr->rcu);
 			list_add_tail(&addr->list, addrlist);
 		}
 	}
@@ -192,16 +195,24 @@ static void sctp_free_local_addr_list(void)
 	}
 }
 
+void sctp_local_addr_free(struct rcu_head *head)
+{
+	struct sctp_sockaddr_entry *e = container_of(head,
+				struct sctp_sockaddr_entry, rcu);
+	kfree(e);
+}
+
 /* Copy the local addresses which are valid for 'scope' into 'bp'.  */
 int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
 			      gfp_t gfp, int copy_flags)
 {
 	struct sctp_sockaddr_entry *addr;
 	int error = 0;
-	struct list_head *pos, *temp;
 
-	list_for_each_safe(pos, temp, &sctp_local_addr_list) {
-		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+	rcu_read_lock();
+	list_for_each_entry_rcu(addr, &sctp_local_addr_list, list) {
+		if (!addr->valid)
+			continue;
 		if (sctp_in_scope(&addr->a, scope)) {
 			/* Now that the address is in scope, check to see if
 			 * the address type is really supported by the local
@@ -221,6 +232,7 @@ int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
 	}
 
 end_copy:
+	rcu_read_unlock();
 	return error;
 }
 
@@ -600,13 +612,18 @@ static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
 	seq_printf(seq, "%d.%d.%d.%d ", NIPQUAD(addr->v4.sin_addr));
 }
 
-/* Event handler for inet address addition/deletion events.  */
+/* Event handler for inet address addition/deletion events.
+ * The sctp_local_addr_list needs to be protocted by a spin lock since
+ * multiple notifiers (say IPv4 and IPv6) may be running at the same
+ * time and thus corrupt the list.
+ * The reader side is protected with RCU.
+ */
 static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
 			       void *ptr)
 {
 	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
-	struct sctp_sockaddr_entry *addr;
-	struct list_head *pos, *temp;
+	struct sctp_sockaddr_entry *addr = NULL;
+	struct sctp_sockaddr_entry *temp;
 
 	switch (ev) {
 	case NETDEV_UP:
@@ -615,19 +632,25 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
 			addr->a.v4.sin_family = AF_INET;
 			addr->a.v4.sin_port = 0;
 			addr->a.v4.sin_addr.s_addr = ifa->ifa_local;
-			list_add_tail(&addr->list, &sctp_local_addr_list);
+			addr->valid = 1;
+			spin_lock_bh(&sctp_local_addr_lock);
+			list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
+			spin_unlock_bh(&sctp_local_addr_lock);
 		}
 		break;
 	case NETDEV_DOWN:
-		list_for_each_safe(pos, temp, &sctp_local_addr_list) {
-			addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+		spin_lock_bh(&sctp_local_addr_lock);
+		list_for_each_entry_safe(addr, temp,
+					&sctp_local_addr_list, list) {
 			if (addr->a.v4.sin_addr.s_addr == ifa->ifa_local) {
-				list_del(pos);
-				kfree(addr);
+				addr->valid = 0;
+				list_del_rcu(&addr->list);
 				break;
 			}
 		}
-
+		spin_unlock_bh(&sctp_local_addr_lock);
+		if (addr && !addr->valid)
+			call_rcu(&addr->rcu, sctp_local_addr_free);
 		break;
 	}
 
@@ -1160,6 +1183,7 @@ SCTP_STATIC __init int sctp_init(void)
 
 	/* Initialize the local address list. */
 	INIT_LIST_HEAD(&sctp_local_addr_list);
+	spin_lock_init(&sctp_local_addr_lock);
 	sctp_get_local_addr_list();
 
 	/* Register notifier for inet address additions/deletions. */
@@ -1227,6 +1251,9 @@ SCTP_STATIC __exit void sctp_exit(void)
 	sctp_v6_del_protocol();
 	inet_del_protocol(&sctp_protocol, IPPROTO_SCTP);
 
+	/* Unregister notifier for inet address additions/deletions. */
+	unregister_inetaddr_notifier(&sctp_inetaddr_notifier);
+
 	/* Free the local address list.  */
 	sctp_free_local_addr_list();
 
@@ -1240,9 +1267,6 @@ SCTP_STATIC __exit void sctp_exit(void)
 	inet_unregister_protosw(&sctp_stream_protosw);
 	inet_unregister_protosw(&sctp_seqpacket_protosw);
 
-	/* Unregister notifier for inet address additions/deletions. */
-	unregister_inetaddr_notifier(&sctp_inetaddr_notifier);
-
 	sctp_sysctl_unregister();
 	list_del(&sctp_ipv4_specific.list);
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 33354602ae86..a3acf78d06ba 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4057,9 +4057,9 @@ static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
 					       int __user *optlen)
 {
 	sctp_assoc_t id;
+	struct list_head *pos;
 	struct sctp_bind_addr *bp;
 	struct sctp_association *asoc;
-	struct list_head *pos, *temp;
 	struct sctp_sockaddr_entry *addr;
 	rwlock_t *addr_lock;
 	int cnt = 0;
@@ -4096,15 +4096,19 @@ static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
 		addr = list_entry(bp->address_list.next,
 				  struct sctp_sockaddr_entry, list);
 		if (sctp_is_any(&addr->a)) {
-			list_for_each_safe(pos, temp, &sctp_local_addr_list) {
-				addr = list_entry(pos,
-						  struct sctp_sockaddr_entry,
-						  list);
+			rcu_read_lock();
+			list_for_each_entry_rcu(addr,
+						&sctp_local_addr_list, list) {
+				if (!addr->valid)
+					continue;
+
 				if ((PF_INET == sk->sk_family) &&
 				    (AF_INET6 == addr->a.sa.sa_family))
 					continue;
+
 				cnt++;
 			}
+			rcu_read_unlock();
 		} else {
 			cnt = 1;
 		}
@@ -4127,14 +4131,16 @@ static int sctp_copy_laddrs_old(struct sock *sk, __u16 port,
 					int max_addrs, void *to,
 					int *bytes_copied)
 {
-	struct list_head *pos, *next;
 	struct sctp_sockaddr_entry *addr;
 	union sctp_addr temp;
 	int cnt = 0;
 	int addrlen;
 
-	list_for_each_safe(pos, next, &sctp_local_addr_list) {
-		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+	rcu_read_lock();
+	list_for_each_entry_rcu(addr, &sctp_local_addr_list, list) {
+		if (!addr->valid)
+			continue;
+
 		if ((PF_INET == sk->sk_family) &&
 		    (AF_INET6 == addr->a.sa.sa_family))
 			continue;
@@ -4149,6 +4155,7 @@ static int sctp_copy_laddrs_old(struct sock *sk, __u16 port,
 		cnt ++;
 		if (cnt >= max_addrs) break;
 	}
+	rcu_read_unlock();
 
 	return cnt;
 }
@@ -4156,14 +4163,16 @@ static int sctp_copy_laddrs_old(struct sock *sk, __u16 port,
 static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to,
 			    size_t space_left, int *bytes_copied)
 {
-	struct list_head *pos, *next;
 	struct sctp_sockaddr_entry *addr;
 	union sctp_addr temp;
 	int cnt = 0;
 	int addrlen;
 
-	list_for_each_safe(pos, next, &sctp_local_addr_list) {
-		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+	rcu_read_lock();
+	list_for_each_entry_rcu(addr, &sctp_local_addr_list, list) {
+		if (!addr->valid)
+			continue;
+
 		if ((PF_INET == sk->sk_family) &&
 		    (AF_INET6 == addr->a.sa.sa_family))
 			continue;
@@ -4171,8 +4180,10 @@ static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to,
 		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk),
 								&temp);
 		addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len;
-		if (space_left < addrlen)
-			return -ENOMEM;
+		if (space_left < addrlen) {
+			cnt =  -ENOMEM;
+			break;
+		}
 		memcpy(to, &temp, addrlen);
 
 		to += addrlen;
@@ -4180,6 +4191,7 @@ static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to,
 		space_left -= addrlen;
 		*bytes_copied += addrlen;
 	}
+	rcu_read_unlock();
 
 	return cnt;
 }
-- 
cgit v1.2.1


From 559cf710b07c5e2cfa3fb8d8f4a1320fd84c53f9 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Sun, 16 Sep 2007 16:03:28 -0700
Subject: [SCTP]: Convert bind_addr_list locking to RCU

Since the sctp_sockaddr_entry is now RCU enabled as part of
the patch to synchronize sctp_localaddr_list, it makes sense to
change all handling of these entries to RCU.  This includes the
sctp_bind_addrs structure and it's list of bound addresses.

This list is currently protected by an external rw_lock and that
looks like an overkill.  There are only 2 writers to the list:
bind()/bindx() calls, and BH processing of ASCONF-ACK chunks.
These are already seriealized via the socket lock, so they will
not step on each other.  These are also relatively rare, so we
should be good with RCU.

The readers are varied and they are easily converted to RCU.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Sridhar Samdurala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  7 ++--
 net/sctp/associola.c       | 14 +------
 net/sctp/bind_addr.c       | 68 +++++++++++++++++++++-----------
 net/sctp/endpointola.c     | 27 ++++---------
 net/sctp/ipv6.c            | 12 +++---
 net/sctp/protocol.c        | 25 +++++-------
 net/sctp/sm_make_chunk.c   | 18 +++------
 net/sctp/socket.c          | 98 +++++++++++++---------------------------------
 8 files changed, 106 insertions(+), 163 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a89e36197afb..c2fe2dcc9afc 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1155,7 +1155,9 @@ int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
 			int flags);
 int sctp_add_bind_addr(struct sctp_bind_addr *, union sctp_addr *,
 		       __u8 use_as_src, gfp_t gfp);
-int sctp_del_bind_addr(struct sctp_bind_addr *, union sctp_addr *);
+int sctp_del_bind_addr(struct sctp_bind_addr *, union sctp_addr *,
+			void (*rcu_call)(struct rcu_head *,
+					  void (*func)(struct rcu_head *)));
 int sctp_bind_addr_match(struct sctp_bind_addr *, const union sctp_addr *,
 			 struct sctp_sock *);
 union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr	*bp,
@@ -1226,9 +1228,6 @@ struct sctp_ep_common {
 	 * bind_addr.address_list is our set of local IP addresses.
 	 */
 	struct sctp_bind_addr bind_addr;
-
-	/* Protection during address list comparisons. */
-	rwlock_t   addr_lock;
 };
 
 
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 2ad1caf1ea42..9bad8ba0feda 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -99,7 +99,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 
 	/* Initialize the bind addr area.  */
 	sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port);
-	rwlock_init(&asoc->base.addr_lock);
 
 	asoc->state = SCTP_STATE_CLOSED;
 
@@ -937,8 +936,6 @@ struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
 {
 	struct sctp_transport *transport;
 
-	sctp_read_lock(&asoc->base.addr_lock);
-
 	if ((htons(asoc->base.bind_addr.port) == laddr->v4.sin_port) &&
 	    (htons(asoc->peer.port) == paddr->v4.sin_port)) {
 		transport = sctp_assoc_lookup_paddr(asoc, paddr);
@@ -952,7 +949,6 @@ struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
 	transport = NULL;
 
 out:
-	sctp_read_unlock(&asoc->base.addr_lock);
 	return transport;
 }
 
@@ -1376,19 +1372,13 @@ int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
 int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
 			    const union sctp_addr *laddr)
 {
-	int found;
+	int found = 0;
 
-	sctp_read_lock(&asoc->base.addr_lock);
 	if ((asoc->base.bind_addr.port == ntohs(laddr->v4.sin_port)) &&
 	    sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
-				 sctp_sk(asoc->base.sk))) {
+				 sctp_sk(asoc->base.sk)))
 		found = 1;
-		goto out;
-	}
 
-	found = 0;
-out:
-	sctp_read_unlock(&asoc->base.addr_lock);
 	return found;
 }
 
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 7fc369f9035d..d35cbf5aae33 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -167,7 +167,11 @@ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
 
 	INIT_LIST_HEAD(&addr->list);
 	INIT_RCU_HEAD(&addr->rcu);
-	list_add_tail(&addr->list, &bp->address_list);
+
+	/* We always hold a socket lock when calling this function,
+	 * and that acts as a writer synchronizing lock.
+	 */
+	list_add_tail_rcu(&addr->list, &bp->address_list);
 	SCTP_DBG_OBJCNT_INC(addr);
 
 	return 0;
@@ -176,23 +180,35 @@ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
 /* Delete an address from the bind address list in the SCTP_bind_addr
  * structure.
  */
-int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
+int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr,
+			void (*rcu_call)(struct rcu_head *head,
+					 void (*func)(struct rcu_head *head)))
 {
-	struct list_head *pos, *temp;
-	struct sctp_sockaddr_entry *addr;
+	struct sctp_sockaddr_entry *addr, *temp;
 
-	list_for_each_safe(pos, temp, &bp->address_list) {
-		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+	/* We hold the socket lock when calling this function,
+	 * and that acts as a writer synchronizing lock.
+	 */
+	list_for_each_entry_safe(addr, temp, &bp->address_list, list) {
 		if (sctp_cmp_addr_exact(&addr->a, del_addr)) {
 			/* Found the exact match. */
-			list_del(pos);
-			kfree(addr);
-			SCTP_DBG_OBJCNT_DEC(addr);
-
-			return 0;
+			addr->valid = 0;
+			list_del_rcu(&addr->list);
+			break;
 		}
 	}
 
+	/* Call the rcu callback provided in the args.  This function is
+	 * called by both BH packet processing and user side socket option
+	 * processing, but it works on different lists in those 2 contexts.
+	 * Each context provides it's own callback, whether call_rcu_bh()
+	 * or call_rcu(), to make sure that we wait for an appropriate time.
+	 */
+	if (addr && !addr->valid) {
+		rcu_call(&addr->rcu, sctp_local_addr_free);
+		SCTP_DBG_OBJCNT_DEC(addr);
+	}
+
 	return -EINVAL;
 }
 
@@ -302,15 +318,20 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp,
 			 struct sctp_sock *opt)
 {
 	struct sctp_sockaddr_entry *laddr;
-	struct list_head *pos;
-
-	list_for_each(pos, &bp->address_list) {
-		laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
-		if (opt->pf->cmp_addr(&laddr->a, addr, opt))
-			return 1;
+	int match = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+		if (!laddr->valid)
+			continue;
+		if (opt->pf->cmp_addr(&laddr->a, addr, opt)) {
+			match = 1;
+			break;
+		}
 	}
+	rcu_read_unlock();
 
-	return 0;
+	return match;
 }
 
 /* Find the first address in the bind address list that is not present in
@@ -325,18 +346,19 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr	*bp,
 	union sctp_addr			*addr;
 	void 				*addr_buf;
 	struct sctp_af			*af;
-	struct list_head		*pos;
 	int				i;
 
-	list_for_each(pos, &bp->address_list) {
-		laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
-
+	/* This is only called sctp_send_asconf_del_ip() and we hold
+	 * the socket lock in that code patch, so that address list
+	 * can't change.
+	 */
+	list_for_each_entry(laddr, &bp->address_list, list) {
 		addr_buf = (union sctp_addr *)addrs;
 		for (i = 0; i < addrcnt; i++) {
 			addr = (union sctp_addr *)addr_buf;
 			af = sctp_get_af_specific(addr->v4.sin_family);
 			if (!af)
-				return NULL;
+				break;
 
 			if (opt->pf->cmp_addr(&laddr->a, addr, opt))
 				break;
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 1404a9e2e78f..8f485a0d14bd 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -92,7 +92,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 
 	/* Initialize the bind addr area */
 	sctp_bind_addr_init(&ep->base.bind_addr, 0);
-	rwlock_init(&ep->base.addr_lock);
 
 	/* Remember who we are attached to.  */
 	ep->base.sk = sk;
@@ -225,21 +224,14 @@ void sctp_endpoint_put(struct sctp_endpoint *ep)
 struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep,
 					       const union sctp_addr *laddr)
 {
-	struct sctp_endpoint *retval;
+	struct sctp_endpoint *retval = NULL;
 
-	sctp_read_lock(&ep->base.addr_lock);
 	if (htons(ep->base.bind_addr.port) == laddr->v4.sin_port) {
 		if (sctp_bind_addr_match(&ep->base.bind_addr, laddr,
-					 sctp_sk(ep->base.sk))) {
+					 sctp_sk(ep->base.sk)))
 			retval = ep;
-			goto out;
-		}
 	}
 
-	retval = NULL;
-
-out:
-	sctp_read_unlock(&ep->base.addr_lock);
 	return retval;
 }
 
@@ -261,9 +253,7 @@ static struct sctp_association *__sctp_endpoint_lookup_assoc(
 	list_for_each(pos, &ep->asocs) {
 		asoc = list_entry(pos, struct sctp_association, asocs);
 		if (rport == asoc->peer.port) {
-			sctp_read_lock(&asoc->base.addr_lock);
 			*transport = sctp_assoc_lookup_paddr(asoc, paddr);
-			sctp_read_unlock(&asoc->base.addr_lock);
 
 			if (*transport)
 				return asoc;
@@ -295,20 +285,17 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
 int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
 				const union sctp_addr *paddr)
 {
-	struct list_head *pos;
 	struct sctp_sockaddr_entry *addr;
 	struct sctp_bind_addr *bp;
 
-	sctp_read_lock(&ep->base.addr_lock);
 	bp = &ep->base.bind_addr;
-	list_for_each(pos, &bp->address_list) {
-		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
-		if (sctp_has_association(&addr->a, paddr)) {
-			sctp_read_unlock(&ep->base.addr_lock);
+	/* This function is called with the socket lock held,
+	 * so the address_list can not change.
+	 */
+	list_for_each_entry(addr, &bp->address_list, list) {
+		if (sctp_has_association(&addr->a, paddr))
 			return 1;
-		}
 	}
-	sctp_read_unlock(&ep->base.addr_lock);
 
 	return 0;
 }
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e12fa0a91da4..670fd2740b89 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -302,9 +302,7 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
 			      union sctp_addr *saddr)
 {
 	struct sctp_bind_addr *bp;
-	rwlock_t *addr_lock;
 	struct sctp_sockaddr_entry *laddr;
-	struct list_head *pos;
 	sctp_scope_t scope;
 	union sctp_addr *baddr = NULL;
 	__u8 matchlen = 0;
@@ -324,14 +322,14 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
 	scope = sctp_scope(daddr);
 
 	bp = &asoc->base.bind_addr;
-	addr_lock = &asoc->base.addr_lock;
 
 	/* Go through the bind address list and find the best source address
 	 * that matches the scope of the destination address.
 	 */
-	sctp_read_lock(addr_lock);
-	list_for_each(pos, &bp->address_list) {
-		laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
+	rcu_read_lock();
+	list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+		if (!laddr->valid)
+			continue;
 		if ((laddr->use_as_src) &&
 		    (laddr->a.sa.sa_family == AF_INET6) &&
 		    (scope <= sctp_scope(&laddr->a))) {
@@ -353,7 +351,7 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
 		       __FUNCTION__, asoc, NIP6(daddr->v6.sin6_addr));
 	}
 
-	sctp_read_unlock(addr_lock);
+	rcu_read_unlock();
 }
 
 /* Make a copy of all potential local addresses. */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 7ee120e85913..3d036cdfae41 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -224,7 +224,7 @@ int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
 			      (copy_flags & SCTP_ADDR6_ALLOWED) &&
 			      (copy_flags & SCTP_ADDR6_PEERSUPP)))) {
 				error = sctp_add_bind_addr(bp, &addr->a, 1,
-							   GFP_ATOMIC);
+						    GFP_ATOMIC);
 				if (error)
 					goto end_copy;
 			}
@@ -428,9 +428,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
 	struct rtable *rt;
 	struct flowi fl;
 	struct sctp_bind_addr *bp;
-	rwlock_t *addr_lock;
 	struct sctp_sockaddr_entry *laddr;
-	struct list_head *pos;
 	struct dst_entry *dst = NULL;
 	union sctp_addr dst_saddr;
 
@@ -459,23 +457,20 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
 		goto out;
 
 	bp = &asoc->base.bind_addr;
-	addr_lock = &asoc->base.addr_lock;
 
 	if (dst) {
 		/* Walk through the bind address list and look for a bind
 		 * address that matches the source address of the returned dst.
 		 */
-		sctp_read_lock(addr_lock);
-		list_for_each(pos, &bp->address_list) {
-			laddr = list_entry(pos, struct sctp_sockaddr_entry,
-					   list);
-			if (!laddr->use_as_src)
+		rcu_read_lock();
+		list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+			if (!laddr->valid || !laddr->use_as_src)
 				continue;
 			sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port));
 			if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
 				goto out_unlock;
 		}
-		sctp_read_unlock(addr_lock);
+		rcu_read_unlock();
 
 		/* None of the bound addresses match the source address of the
 		 * dst. So release it.
@@ -487,10 +482,10 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
 	/* Walk through the bind address list and try to get a dst that
 	 * matches a bind address as the source address.
 	 */
-	sctp_read_lock(addr_lock);
-	list_for_each(pos, &bp->address_list) {
-		laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
-
+	rcu_read_lock();
+	list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+		if (!laddr->valid)
+			continue;
 		if ((laddr->use_as_src) &&
 		    (AF_INET == laddr->a.sa.sa_family)) {
 			fl.fl4_src = laddr->a.v4.sin_addr.s_addr;
@@ -502,7 +497,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
 	}
 
 out_unlock:
-	sctp_read_unlock(addr_lock);
+	rcu_read_unlock();
 out:
 	if (dst)
 		SCTP_DEBUG_PRINTK("rt_dst:%u.%u.%u.%u, rt_src:%u.%u.%u.%u\n",
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 79856c924525..2e34220d94cd 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1531,7 +1531,7 @@ no_hmac:
 	/* Also, add the destination address. */
 	if (list_empty(&retval->base.bind_addr.address_list)) {
 		sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, 1,
-				   GFP_ATOMIC);
+				GFP_ATOMIC);
 	}
 
 	retval->next_tsn = retval->c.initial_tsn;
@@ -2613,22 +2613,16 @@ static int sctp_asconf_param_success(struct sctp_association *asoc,
 
 	switch (asconf_param->param_hdr.type) {
 	case SCTP_PARAM_ADD_IP:
-		sctp_local_bh_disable();
-		sctp_write_lock(&asoc->base.addr_lock);
-		list_for_each(pos, &bp->address_list) {
-			saddr = list_entry(pos, struct sctp_sockaddr_entry, list);
+		/* This is always done in BH context with a socket lock
+		 * held, so the list can not change.
+		 */
+		list_for_each_entry(saddr, &bp->address_list, list) {
 			if (sctp_cmp_addr_exact(&saddr->a, &addr))
 				saddr->use_as_src = 1;
 		}
-		sctp_write_unlock(&asoc->base.addr_lock);
-		sctp_local_bh_enable();
 		break;
 	case SCTP_PARAM_DEL_IP:
-		sctp_local_bh_disable();
-		sctp_write_lock(&asoc->base.addr_lock);
-		retval = sctp_del_bind_addr(bp, &addr);
-		sctp_write_unlock(&asoc->base.addr_lock);
-		sctp_local_bh_enable();
+		retval = sctp_del_bind_addr(bp, &addr, call_rcu_bh);
 		list_for_each(pos, &asoc->peer.transport_addr_list) {
 			transport = list_entry(pos, struct sctp_transport,
 						 transports);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index a3acf78d06ba..772fbfb4bfda 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -367,14 +367,10 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
 	if (!bp->port)
 		bp->port = inet_sk(sk)->num;
 
-	/* Add the address to the bind address list.  */
-	sctp_local_bh_disable();
-	sctp_write_lock(&ep->base.addr_lock);
-
-	/* Use GFP_ATOMIC since BHs are disabled.  */
+	/* Add the address to the bind address list.
+	 * Use GFP_ATOMIC since BHs will be disabled.
+	 */
 	ret = sctp_add_bind_addr(bp, addr, 1, GFP_ATOMIC);
-	sctp_write_unlock(&ep->base.addr_lock);
-	sctp_local_bh_enable();
 
 	/* Copy back into socket for getsockname() use. */
 	if (!ret) {
@@ -544,15 +540,12 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 		if (i < addrcnt)
 			continue;
 
-		/* Use the first address in bind addr list of association as
-		 * Address Parameter of ASCONF CHUNK.
+		/* Use the first valid address in bind addr list of
+		 * association as Address Parameter of ASCONF CHUNK.
 		 */
-		sctp_read_lock(&asoc->base.addr_lock);
 		bp = &asoc->base.bind_addr;
 		p = bp->address_list.next;
 		laddr = list_entry(p, struct sctp_sockaddr_entry, list);
-		sctp_read_unlock(&asoc->base.addr_lock);
-
 		chunk = sctp_make_asconf_update_ip(asoc, &laddr->a, addrs,
 						   addrcnt, SCTP_PARAM_ADD_IP);
 		if (!chunk) {
@@ -567,8 +560,6 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 		/* Add the new addresses to the bind address list with
 		 * use_as_src set to 0.
 		 */
-		sctp_local_bh_disable();
-		sctp_write_lock(&asoc->base.addr_lock);
 		addr_buf = addrs;
 		for (i = 0; i < addrcnt; i++) {
 			addr = (union sctp_addr *)addr_buf;
@@ -578,8 +569,6 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 						    GFP_ATOMIC);
 			addr_buf += af->sockaddr_len;
 		}
-		sctp_write_unlock(&asoc->base.addr_lock);
-		sctp_local_bh_enable();
 	}
 
 out:
@@ -651,13 +640,7 @@ static int sctp_bindx_rem(struct sock *sk, struct sockaddr *addrs, int addrcnt)
 		 * socket routing and failover schemes. Refer to comments in
 		 * sctp_do_bind(). -daisy
 		 */
-		sctp_local_bh_disable();
-		sctp_write_lock(&ep->base.addr_lock);
-
-		retval = sctp_del_bind_addr(bp, sa_addr);
-
-		sctp_write_unlock(&ep->base.addr_lock);
-		sctp_local_bh_enable();
+		retval = sctp_del_bind_addr(bp, sa_addr, call_rcu);
 
 		addr_buf += af->sockaddr_len;
 err_bindx_rem:
@@ -748,14 +731,16 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
 		 * make sure that we do not delete all the addresses in the
 		 * association.
 		 */
-		sctp_read_lock(&asoc->base.addr_lock);
 		bp = &asoc->base.bind_addr;
 		laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs,
 					       addrcnt, sp);
-		sctp_read_unlock(&asoc->base.addr_lock);
 		if (!laddr)
 			continue;
 
+		/* We do not need RCU protection throughout this loop
+		 * because this is done under a socket lock from the
+		 * setsockopt call.
+		 */
 		chunk = sctp_make_asconf_update_ip(asoc, laddr, addrs, addrcnt,
 						   SCTP_PARAM_DEL_IP);
 		if (!chunk) {
@@ -766,23 +751,16 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
 		/* Reset use_as_src flag for the addresses in the bind address
 		 * list that are to be deleted.
 		 */
-		sctp_local_bh_disable();
-		sctp_write_lock(&asoc->base.addr_lock);
 		addr_buf = addrs;
 		for (i = 0; i < addrcnt; i++) {
 			laddr = (union sctp_addr *)addr_buf;
 			af = sctp_get_af_specific(laddr->v4.sin_family);
-			list_for_each(pos1, &bp->address_list) {
-				saddr = list_entry(pos1,
-						   struct sctp_sockaddr_entry,
-						   list);
+			list_for_each_entry(saddr, &bp->address_list, list) {
 				if (sctp_cmp_addr_exact(&saddr->a, laddr))
 					saddr->use_as_src = 0;
 			}
 			addr_buf += af->sockaddr_len;
 		}
-		sctp_write_unlock(&asoc->base.addr_lock);
-		sctp_local_bh_enable();
 
 		/* Update the route and saddr entries for all the transports
 		 * as some of the addresses in the bind address list are
@@ -4057,11 +4035,9 @@ static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
 					       int __user *optlen)
 {
 	sctp_assoc_t id;
-	struct list_head *pos;
 	struct sctp_bind_addr *bp;
 	struct sctp_association *asoc;
 	struct sctp_sockaddr_entry *addr;
-	rwlock_t *addr_lock;
 	int cnt = 0;
 
 	if (len < sizeof(sctp_assoc_t))
@@ -4078,17 +4054,13 @@ static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
 	 */
 	if (0 == id) {
 		bp = &sctp_sk(sk)->ep->base.bind_addr;
-		addr_lock = &sctp_sk(sk)->ep->base.addr_lock;
 	} else {
 		asoc = sctp_id2assoc(sk, id);
 		if (!asoc)
 			return -EINVAL;
 		bp = &asoc->base.bind_addr;
-		addr_lock = &asoc->base.addr_lock;
 	}
 
-	sctp_read_lock(addr_lock);
-
 	/* If the endpoint is bound to 0.0.0.0 or ::0, count the valid
 	 * addresses from the global local address list.
 	 */
@@ -4115,12 +4087,14 @@ static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
 		goto done;
 	}
 
-	list_for_each(pos, &bp->address_list) {
+	/* Protection on the bound address list is not needed,
+	 * since in the socket option context we hold the socket lock,
+	 * so there is no way that the bound address list can change.
+	 */
+	list_for_each_entry(addr, &bp->address_list, list) {
 		cnt ++;
 	}
-
 done:
-	sctp_read_unlock(addr_lock);
 	return cnt;
 }
 
@@ -4204,7 +4178,6 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
 {
 	struct sctp_bind_addr *bp;
 	struct sctp_association *asoc;
-	struct list_head *pos;
 	int cnt = 0;
 	struct sctp_getaddrs_old getaddrs;
 	struct sctp_sockaddr_entry *addr;
@@ -4212,7 +4185,6 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
 	union sctp_addr temp;
 	struct sctp_sock *sp = sctp_sk(sk);
 	int addrlen;
-	rwlock_t *addr_lock;
 	int err = 0;
 	void *addrs;
 	void *buf;
@@ -4234,13 +4206,11 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
 	 */
 	if (0 == getaddrs.assoc_id) {
 		bp = &sctp_sk(sk)->ep->base.bind_addr;
-		addr_lock = &sctp_sk(sk)->ep->base.addr_lock;
 	} else {
 		asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
 		if (!asoc)
 			return -EINVAL;
 		bp = &asoc->base.bind_addr;
-		addr_lock = &asoc->base.addr_lock;
 	}
 
 	to = getaddrs.addrs;
@@ -4254,8 +4224,6 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
 	if (!addrs)
 		return -ENOMEM;
 
-	sctp_read_lock(addr_lock);
-
 	/* If the endpoint is bound to 0.0.0.0 or ::0, get the valid
 	 * addresses from the global local address list.
 	 */
@@ -4271,8 +4239,11 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
 	}
 
 	buf = addrs;
-	list_for_each(pos, &bp->address_list) {
-		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+	/* Protection on the bound address list is not needed since
+	 * in the socket option context we hold a socket lock and
+	 * thus the bound address list can't change.
+	 */
+	list_for_each_entry(addr, &bp->address_list, list) {
 		memcpy(&temp, &addr->a, sizeof(temp));
 		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp);
 		addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len;
@@ -4284,8 +4255,6 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
 	}
 
 copy_getaddrs:
-	sctp_read_unlock(addr_lock);
-
 	/* copy the entire address list into the user provided space */
 	if (copy_to_user(to, addrs, bytes_copied)) {
 		err = -EFAULT;
@@ -4307,7 +4276,6 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
 {
 	struct sctp_bind_addr *bp;
 	struct sctp_association *asoc;
-	struct list_head *pos;
 	int cnt = 0;
 	struct sctp_getaddrs getaddrs;
 	struct sctp_sockaddr_entry *addr;
@@ -4315,7 +4283,6 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
 	union sctp_addr temp;
 	struct sctp_sock *sp = sctp_sk(sk);
 	int addrlen;
-	rwlock_t *addr_lock;
 	int err = 0;
 	size_t space_left;
 	int bytes_copied = 0;
@@ -4336,13 +4303,11 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
 	 */
 	if (0 == getaddrs.assoc_id) {
 		bp = &sctp_sk(sk)->ep->base.bind_addr;
-		addr_lock = &sctp_sk(sk)->ep->base.addr_lock;
 	} else {
 		asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
 		if (!asoc)
 			return -EINVAL;
 		bp = &asoc->base.bind_addr;
-		addr_lock = &asoc->base.addr_lock;
 	}
 
 	to = optval + offsetof(struct sctp_getaddrs,addrs);
@@ -4352,8 +4317,6 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
 	if (!addrs)
 		return -ENOMEM;
 
-	sctp_read_lock(addr_lock);
-
 	/* If the endpoint is bound to 0.0.0.0 or ::0, get the valid
 	 * addresses from the global local address list.
 	 */
@@ -4365,21 +4328,24 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
 						space_left, &bytes_copied);
 			if (cnt < 0) {
 				err = cnt;
-				goto error_lock;
+				goto out;
 			}
 			goto copy_getaddrs;
 		}
 	}
 
 	buf = addrs;
-	list_for_each(pos, &bp->address_list) {
-		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+	/* Protection on the bound address list is not needed since
+	 * in the socket option context we hold a socket lock and
+	 * thus the bound address list can't change.
+	 */
+	list_for_each_entry(addr, &bp->address_list, list) {
 		memcpy(&temp, &addr->a, sizeof(temp));
 		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp);
 		addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len;
 		if (space_left < addrlen) {
 			err =  -ENOMEM; /*fixme: right error?*/
-			goto error_lock;
+			goto out;
 		}
 		memcpy(buf, &temp, addrlen);
 		buf += addrlen;
@@ -4389,8 +4355,6 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
 	}
 
 copy_getaddrs:
-	sctp_read_unlock(addr_lock);
-
 	if (copy_to_user(to, addrs, bytes_copied)) {
 		err = -EFAULT;
 		goto out;
@@ -4401,12 +4365,6 @@ copy_getaddrs:
 	}
 	if (put_user(bytes_copied, optlen))
 		err = -EFAULT;
-
-	goto out;
-
-error_lock:
-	sctp_read_unlock(addr_lock);
-
 out:
 	kfree(addrs);
 	return err;
-- 
cgit v1.2.1


From 31bac44468257986484703cc09da8a9dcae88a36 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 16 Sep 2007 16:19:20 -0700
Subject: [PPP] pppoe: Fix skb_unshare_check call position

The skb_unshare_check call needs to be made before pskb_may_pull,
not after.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pppoe.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index 68631a5721ac..5ac3eff6a2a6 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -385,12 +385,12 @@ static int pppoe_rcv(struct sk_buff *skb,
 	struct pppoe_hdr *ph;
 	struct pppox_sock *po;
 
-	if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr)))
-		goto drop;
-
 	if (!(skb = skb_share_check(skb, GFP_ATOMIC)))
 		goto out;
 
+	if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr)))
+		goto drop;
+
 	ph = pppoe_hdr(skb);
 
 	po = get_item((unsigned long) ph->sid, eth_hdr(skb)->h_source, dev->ifindex);
-- 
cgit v1.2.1


From db7bf6d97c6956b7eb0f22131cb5c37bd41f33c0 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 16 Sep 2007 16:19:50 -0700
Subject: [PPP] pppoe: Fix data clobbering in __pppoe_xmit and return value

The function __pppoe_xmit modifies the skb data and therefore it needs
to copy and skb data if it's cloned.

In fact, it currently allocates a new skb so that it can return 0 in
case of error without freeing the original skb.  This is totally wrong
because returning zero is meant to indicate congestion whereupon pppoe
is supposed to wake up the upper layer once the congestion subsides.

This makes sense for ppp_async and ppp_sync but is out-of-place for
pppoe.  This patch makes it always return 1 and free the skb.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pppoe.c | 50 +++++++++++++-------------------------------------
 1 file changed, 13 insertions(+), 37 deletions(-)

diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index 5ac3eff6a2a6..8818253102f2 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -850,9 +850,7 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb)
 	struct net_device *dev = po->pppoe_dev;
 	struct pppoe_hdr hdr;
 	struct pppoe_hdr *ph;
-	int headroom = skb_headroom(skb);
 	int data_len = skb->len;
-	struct sk_buff *skb2;
 
 	if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED))
 		goto abort;
@@ -866,53 +864,31 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb)
 	if (!dev)
 		goto abort;
 
-	/* Copy the skb if there is no space for the header. */
-	if (headroom < (sizeof(struct pppoe_hdr) + dev->hard_header_len)) {
-		skb2 = dev_alloc_skb(32+skb->len +
-				     sizeof(struct pppoe_hdr) +
-				     dev->hard_header_len);
-
-		if (skb2 == NULL)
-			goto abort;
-
-		skb_reserve(skb2, dev->hard_header_len + sizeof(struct pppoe_hdr));
-		skb_copy_from_linear_data(skb, skb_put(skb2, skb->len),
-					  skb->len);
-	} else {
-		/* Make a clone so as to not disturb the original skb,
-		 * give dev_queue_xmit something it can free.
-		 */
-		skb2 = skb_clone(skb, GFP_ATOMIC);
-
-		if (skb2 == NULL)
-			goto abort;
-	}
+	/* Copy the data if there is no space for the header or if it's
+	 * read-only.
+	 */
+	if (skb_cow(skb, sizeof(*ph) + dev->hard_header_len))
+		goto abort;
 
-	ph = (struct pppoe_hdr *) skb_push(skb2, sizeof(struct pppoe_hdr));
+	ph = (struct pppoe_hdr *) skb_push(skb, sizeof(struct pppoe_hdr));
 	memcpy(ph, &hdr, sizeof(struct pppoe_hdr));
-	skb2->protocol = __constant_htons(ETH_P_PPP_SES);
+	skb->protocol = __constant_htons(ETH_P_PPP_SES);
 
-	skb_reset_network_header(skb2);
+	skb_reset_network_header(skb);
 
-	skb2->dev = dev;
+	skb->dev = dev;
 
-	dev->hard_header(skb2, dev, ETH_P_PPP_SES,
+	dev->hard_header(skb, dev, ETH_P_PPP_SES,
 			 po->pppoe_pa.remote, NULL, data_len);
 
-	/* We're transmitting skb2, and assuming that dev_queue_xmit
-	 * will free it.  The generic ppp layer however, is expecting
-	 * that we give back 'skb' (not 'skb2') in case of failure,
-	 * but free it in case of success.
-	 */
-
-	if (dev_queue_xmit(skb2) < 0)
+	if (dev_queue_xmit(skb) < 0)
 		goto abort;
 
-	kfree_skb(skb);
 	return 1;
 
 abort:
-	return 0;
+	kfree_skb(skb);
+	return 1;
 }
 
 
-- 
cgit v1.2.1


From 9355ec23397af32799038d0e8edbfa5b6f425c27 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 16 Sep 2007 16:20:21 -0700
Subject: [PPP] pppoe: Fill in header directly in __pppoe_xmit

This patch removes the hdr variable (which is copied into the skb)
and instead sets the header directly in the skb.

It also uses __skb_push instead of skb_push since we've just checked
using skb_cow for enough head room.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pppoe.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index 8818253102f2..bac36546e0bf 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -848,19 +848,12 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb)
 {
 	struct pppox_sock *po = pppox_sk(sk);
 	struct net_device *dev = po->pppoe_dev;
-	struct pppoe_hdr hdr;
 	struct pppoe_hdr *ph;
 	int data_len = skb->len;
 
 	if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED))
 		goto abort;
 
-	hdr.ver	= 1;
-	hdr.type = 1;
-	hdr.code = 0;
-	hdr.sid	= po->num;
-	hdr.length = htons(skb->len);
-
 	if (!dev)
 		goto abort;
 
@@ -870,12 +863,17 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb)
 	if (skb_cow(skb, sizeof(*ph) + dev->hard_header_len))
 		goto abort;
 
-	ph = (struct pppoe_hdr *) skb_push(skb, sizeof(struct pppoe_hdr));
-	memcpy(ph, &hdr, sizeof(struct pppoe_hdr));
-	skb->protocol = __constant_htons(ETH_P_PPP_SES);
-
+	__skb_push(skb, sizeof(*ph));
 	skb_reset_network_header(skb);
 
+	ph = pppoe_hdr(skb);
+	ph->ver	= 1;
+	ph->type = 1;
+	ph->code = 0;
+	ph->sid	= po->num;
+	ph->length = htons(data_len);
+
+	skb->protocol = __constant_htons(ETH_P_PPP_SES);
 	skb->dev = dev;
 
 	dev->hard_header(skb, dev, ETH_P_PPP_SES,
-- 
cgit v1.2.1


From e081e1e3ef4682802ac63b1e5e26158fb9ca9e90 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 16 Sep 2007 16:20:48 -0700
Subject: [BRIDGE]: Kill clone argument to br_flood_*

The clone argument is only used by one caller and that caller can clone
the packet itself.  This patch moves the clone call into the caller and
kills the clone argument.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_device.c  |  4 ++--
 net/bridge/br_forward.c | 21 +++++----------------
 net/bridge/br_input.c   | 48 ++++++++++++++++++++++--------------------------
 net/bridge/br_private.h |  8 ++------
 4 files changed, 31 insertions(+), 50 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 0eded176ce99..99292e8e1d0f 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -41,11 +41,11 @@ int br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb_pull(skb, ETH_HLEN);
 
 	if (dest[0] & 1)
-		br_flood_deliver(br, skb, 0);
+		br_flood_deliver(br, skb);
 	else if ((dst = __br_fdb_get(br, dest)) != NULL)
 		br_deliver(dst->dst, skb);
 	else
-		br_flood_deliver(br, skb, 0);
+		br_flood_deliver(br, skb);
 
 	return 0;
 }
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index ada7f495445c..bdd7c35c3c7b 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -100,24 +100,13 @@ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
 }
 
 /* called under bridge lock */
-static void br_flood(struct net_bridge *br, struct sk_buff *skb, int clone,
+static void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	void (*__packet_hook)(const struct net_bridge_port *p,
 			      struct sk_buff *skb))
 {
 	struct net_bridge_port *p;
 	struct net_bridge_port *prev;
 
-	if (clone) {
-		struct sk_buff *skb2;
-
-		if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
-			br->statistics.tx_dropped++;
-			return;
-		}
-
-		skb = skb2;
-	}
-
 	prev = NULL;
 
 	list_for_each_entry_rcu(p, &br->port_list, list) {
@@ -148,13 +137,13 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, int clone,
 
 
 /* called with rcu_read_lock */
-void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, int clone)
+void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb)
 {
-	br_flood(br, skb, clone, __br_deliver);
+	br_flood(br, skb, __br_deliver);
 }
 
 /* called under bridge lock */
-void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, int clone)
+void br_flood_forward(struct net_bridge *br, struct sk_buff *skb)
 {
-	br_flood(br, skb, clone, __br_forward);
+	br_flood(br, skb, __br_forward);
 }
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 6f468fc3357a..3a8a015c92e0 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -43,7 +43,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
 	struct net_bridge *br;
 	struct net_bridge_fdb_entry *dst;
-	int passedup = 0;
+	struct sk_buff *skb2;
 
 	if (!p || p->state == BR_STATE_DISABLED)
 		goto drop;
@@ -55,39 +55,35 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	if (p->state == BR_STATE_LEARNING)
 		goto drop;
 
-	if (br->dev->flags & IFF_PROMISC) {
-		struct sk_buff *skb2;
+	/* The packet skb2 goes to the local host (NULL to skip). */
+	skb2 = NULL;
 
-		skb2 = skb_clone(skb, GFP_ATOMIC);
-		if (skb2 != NULL) {
-			passedup = 1;
-			br_pass_frame_up(br, skb2);
-		}
-	}
+	if (br->dev->flags & IFF_PROMISC)
+		skb2 = skb;
+
+	dst = NULL;
 
 	if (is_multicast_ether_addr(dest)) {
 		br->statistics.multicast++;
-		br_flood_forward(br, skb, !passedup);
-		if (!passedup)
-			br_pass_frame_up(br, skb);
-		goto out;
+		skb2 = skb;
+	} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
+		skb2 = skb;
+		/* Do not forward the packet since it's local. */
+		skb = NULL;
 	}
 
-	dst = __br_fdb_get(br, dest);
-	if (dst != NULL && dst->is_local) {
-		if (!passedup)
-			br_pass_frame_up(br, skb);
-		else
-			kfree_skb(skb);
-		goto out;
-	}
+	if (skb2 == skb)
+		skb2 = skb_clone(skb, GFP_ATOMIC);
 
-	if (dst != NULL) {
-		br_forward(dst->dst, skb);
-		goto out;
-	}
+	if (skb2)
+		br_pass_frame_up(br, skb2);
 
-	br_flood_forward(br, skb, 0);
+	if (skb) {
+		if (dst)
+			br_forward(dst->dst, skb);
+		else
+			br_flood_forward(br, skb);
+	}
 
 out:
 	return 0;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 21bf3a9a03fd..e6dc6f52990d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -170,12 +170,8 @@ extern int br_dev_queue_push_xmit(struct sk_buff *skb);
 extern void br_forward(const struct net_bridge_port *to,
 		struct sk_buff *skb);
 extern int br_forward_finish(struct sk_buff *skb);
-extern void br_flood_deliver(struct net_bridge *br,
-		      struct sk_buff *skb,
-		      int clone);
-extern void br_flood_forward(struct net_bridge *br,
-		      struct sk_buff *skb,
-		      int clone);
+extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb);
+extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb);
 
 /* br_if.c */
 extern void br_port_carrier_check(struct net_bridge_port *p);
-- 
cgit v1.2.1


From d9cc20484e5e48c6a5deb4387c20fd45bfbdde8c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 16 Sep 2007 16:21:16 -0700
Subject: [NET] skbuff: Add skb_cow_head

This patch adds an optimised version of skb_cow that avoids the copy if
the header can be modified even if the rest of the payload is cloned.

This can be used in encapsulating paths where we only need to modify the
header.  As it is, this can be used in PPPOE and bridging.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pppoe.c       |  2 +-
 include/linux/skbuff.h    | 40 +++++++++++++++++++++++++++++++---------
 net/bridge/br_netfilter.c |  2 +-
 3 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index bac36546e0bf..0d7f570b9a54 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -860,7 +860,7 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb)
 	/* Copy the data if there is no space for the header or if it's
 	 * read-only.
 	 */
-	if (skb_cow(skb, sizeof(*ph) + dev->hard_header_len))
+	if (skb_cow_head(skb, sizeof(*ph) + dev->hard_header_len))
 		goto abort;
 
 	__skb_push(skb, sizeof(*ph));
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 93c27f71122a..a656cecd373c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1352,6 +1352,22 @@ static inline int skb_clone_writable(struct sk_buff *skb, int len)
 	       skb_headroom(skb) + len <= skb->hdr_len;
 }
 
+static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
+			    int cloned)
+{
+	int delta = 0;
+
+	if (headroom < NET_SKB_PAD)
+		headroom = NET_SKB_PAD;
+	if (headroom > skb_headroom(skb))
+		delta = headroom - skb_headroom(skb);
+
+	if (delta || cloned)
+		return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0,
+					GFP_ATOMIC);
+	return 0;
+}
+
 /**
  *	skb_cow - copy header of skb when it is required
  *	@skb: buffer to cow
@@ -1366,16 +1382,22 @@ static inline int skb_clone_writable(struct sk_buff *skb, int len)
  */
 static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
 {
-	int delta = (headroom > NET_SKB_PAD ? headroom : NET_SKB_PAD) -
-			skb_headroom(skb);
-
-	if (delta < 0)
-		delta = 0;
+	return __skb_cow(skb, headroom, skb_cloned(skb));
+}
 
-	if (delta || skb_cloned(skb))
-		return pskb_expand_head(skb, (delta + (NET_SKB_PAD-1)) &
-				~(NET_SKB_PAD-1), 0, GFP_ATOMIC);
-	return 0;
+/**
+ *	skb_cow_head - skb_cow but only making the head writable
+ *	@skb: buffer to cow
+ *	@headroom: needed headroom
+ *
+ *	This function is identical to skb_cow except that we replace the
+ *	skb_cloned check by skb_header_cloned.  It should be used when
+ *	you only need to push on some header and do not need to modify
+ *	the data.
+ */
+static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
+{
+	return __skb_cow(skb, headroom, skb_header_cloned(skb));
 }
 
 /**
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 3ee2022928e3..fc13130035e7 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -183,7 +183,7 @@ int nf_bridge_copy_header(struct sk_buff *skb)
 	int err;
 	int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb);
 
-	err = skb_cow(skb, header_size);
+	err = skb_cow_head(skb, header_size);
 	if (err)
 		return err;
 
-- 
cgit v1.2.1


From 7b797d5b150775d717cb03b5ada28b8bad99afab Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 16 Sep 2007 16:21:42 -0700
Subject: [PPP] generic: Call skb_cow_head before scribbling over skb

It's rude to write over data that other people are still using.  So call
skb_cow_head before PPP proceeds to modify the skb data.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp_generic.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 9293c82ef2af..7e21342becb2 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -899,17 +899,9 @@ ppp_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	/* Put the 2-byte PPP protocol number on the front,
 	   making sure there is room for the address and control fields. */
-	if (skb_headroom(skb) < PPP_HDRLEN) {
-		struct sk_buff *ns;
-
-		ns = alloc_skb(skb->len + dev->hard_header_len, GFP_ATOMIC);
-		if (ns == 0)
-			goto outf;
-		skb_reserve(ns, dev->hard_header_len);
-		skb_copy_bits(skb, 0, skb_put(ns, skb->len), skb->len);
-		kfree_skb(skb);
-		skb = ns;
-	}
+	if (skb_cow_head(skb, PPP_HDRLEN))
+		goto outf;
+
 	pp = skb_push(skb, 2);
 	proto = npindex_to_proto[npi];
 	pp[0] = proto >> 8;
-- 
cgit v1.2.1


From 2a38b775b77f99308a4e571c13d908df78ac5e57 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 16 Sep 2007 16:22:13 -0700
Subject: [PPP] generic: Fix receive path data clobbering & non-linear handling

This patch adds missing pskb_may_pull calls to deal with non-linear
packets that may arrive from pppoe or pppol2tp.

It also copies cloned packets before writing over them.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp_generic.c | 44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 7e21342becb2..4b49d0e8c7eb 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -1525,7 +1525,7 @@ ppp_input_error(struct ppp_channel *chan, int code)
 static void
 ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
 {
-	if (skb->len >= 2) {
+	if (pskb_may_pull(skb, 2)) {
 #ifdef CONFIG_PPP_MULTILINK
 		/* XXX do channel-level decompression here */
 		if (PPP_PROTO(skb) == PPP_MP)
@@ -1577,7 +1577,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 		if (ppp->vj == 0 || (ppp->flags & SC_REJ_COMP_TCP))
 			goto err;
 
-		if (skb_tailroom(skb) < 124) {
+		if (skb_tailroom(skb) < 124 || skb_cloned(skb)) {
 			/* copy to a new sk_buff with more tailroom */
 			ns = dev_alloc_skb(skb->len + 128);
 			if (ns == 0) {
@@ -1648,23 +1648,29 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 		/* check if the packet passes the pass and active filters */
 		/* the filter instructions are constructed assuming
 		   a four-byte PPP header on each packet */
-		*skb_push(skb, 2) = 0;
-		if (ppp->pass_filter
-		    && sk_run_filter(skb, ppp->pass_filter,
-				     ppp->pass_len) == 0) {
-			if (ppp->debug & 1)
-				printk(KERN_DEBUG "PPP: inbound frame not passed\n");
-			kfree_skb(skb);
-			return;
-		}
-		if (!(ppp->active_filter
-		      && sk_run_filter(skb, ppp->active_filter,
-				       ppp->active_len) == 0))
-			ppp->last_recv = jiffies;
-		skb_pull(skb, 2);
-#else
-		ppp->last_recv = jiffies;
+		if (ppp->pass_filter || ppp->active_filter) {
+			if (skb_cloned(skb) &&
+			    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+				goto err;
+
+			*skb_push(skb, 2) = 0;
+			if (ppp->pass_filter
+			    && sk_run_filter(skb, ppp->pass_filter,
+					     ppp->pass_len) == 0) {
+				if (ppp->debug & 1)
+					printk(KERN_DEBUG "PPP: inbound frame "
+					       "not passed\n");
+				kfree_skb(skb);
+				return;
+			}
+			if (!(ppp->active_filter
+			      && sk_run_filter(skb, ppp->active_filter,
+					       ppp->active_len) == 0))
+				ppp->last_recv = jiffies;
+			__skb_pull(skb, 2);
+		} else
 #endif /* CONFIG_PPP_FILTER */
+			ppp->last_recv = jiffies;
 
 		if ((ppp->dev->flags & IFF_UP) == 0
 		    || ppp->npmode[npi] != NPMODE_PASS) {
@@ -1762,7 +1768,7 @@ ppp_receive_mp_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
 	struct channel *ch;
 	int mphdrlen = (ppp->flags & SC_MP_SHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN;
 
-	if (!pskb_may_pull(skb, mphdrlen) || ppp->mrru == 0)
+	if (!pskb_may_pull(skb, mphdrlen + 1) || ppp->mrru == 0)
 		goto err;		/* no good, throw it away */
 
 	/* Decode sequence number and begin/end bits */
-- 
cgit v1.2.1


From d9f30ec0b0d129b9cbf2b041a6a3159aa24592f6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Sep 2007 16:43:04 -0700
Subject: [VLAN]: Fix net_device leak.

In "[VLAN]: Move device registation to seperate function" (commit
e89fe42cd03c8fd3686df82d8390a235717a66de), a pile of code got moved
to register_vlan_dev(), including grabbing a reference to underlying
device.  However, original dev_hold() had been left behind, so we
leak a reference to net_device now...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 1583c5ef963f..2a546919d6fb 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -562,8 +562,6 @@ static int register_vlan_device(struct net_device *real_dev,
 	if (err < 0)
 		goto out_free_newdev;
 
-	/* Account for reference in struct vlan_dev_info */
-	dev_hold(real_dev);
 #ifdef VLAN_DEBUG
 	printk(VLAN_DBG "Allocated new device successfully, returning.\n");
 #endif
-- 
cgit v1.2.1