From c1da4ac752b8b0411791d26c678fcf23d2eed242 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Fri, 13 Jun 2008 18:12:00 -0700 Subject: net/core: add NETDEV_BONDING_FAILOVER event Add NETDEV_BONDING_FAILOVER event to be used in a subsequent patch by bonding to announce fail-over for the active-backup mode through the netdev events notifier chain mechanism. Such an event can be of use for the RDMA CM (communication manager) to let native RDMA ULPs (e.g. NFS-RDMA, iSER) always be aligned with the IP stack, in the sense that they use the same ports/links as the stack does. More uses are possible, e.g. letting monitoring tools based on netlink events be aware of bonding fail-over. Signed-off-by: Or Gerlitz Signed-off-by: Jay Vosburgh Signed-off-by: Jeff Garzik --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 68d8df0992ab..0e45742e7158 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -961,6 +961,12 @@ void netdev_state_change(struct net_device *dev) } } +void netdev_bonding_change(struct net_device *dev) +{ + call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev); +} +EXPORT_SYMBOL(netdev_bonding_change); + /** * dev_load - load a network module * @net: the applicable net namespace -- cgit v1.2.1 From dad9b335c6940de2746a9788eb456d09cf102f81 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Wed, 18 Jun 2008 01:48:28 -0700 Subject: netdevice: Fix promiscuity and allmulti overflow Adding a positive @inc when promiscuity or allmulti is already at its maximum causes an overflow. For example: when allmulti=0xFFFFFFFF, any caller giving dev_set_allmulti() a positive @inc will cause allmulti to be turned off. This is not what we want, though it's a rare case. The fix is that only a negative @inc will turn allmulti or promiscuity off, and when any caller makes the counters touch the roof, we return an error. Changes in v2: Change the void functions dev_set_promiscuity/allmulti to return int, so callers can get the overflow error. The callers' fixes will be done later. Changes in v3: 1. Since we return the error to the caller, we don't need to print at KERN_ERR level; KERN_WARNING is enough. 2. In dev_set_promiscuity(), if __dev_set_promiscuity() fails, we return at once. Signed-off-by: Wang Chen Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0e45742e7158..a495f712d38c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2771,16 +2771,29 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) return 0; } -static void __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; ASSERT_RTNL(); - if ((dev->promiscuity += inc) == 0) - dev->flags &= ~IFF_PROMISC; - else - dev->flags |= IFF_PROMISC; + dev->flags |= IFF_PROMISC; + dev->promiscuity += inc; + if (dev->promiscuity == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch promisc and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_PROMISC; + else { + dev->promiscuity -= inc; + printk(KERN_WARNING "%s: promiscuity touches roof, " + "set promiscuity failed, promiscuity feature " + "of device might be broken.\n", dev->name); + return -EOVERFLOW; + } + } if (dev->flags != old_flags) { printk(KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ?
"entered" : @@ -2798,6 +2811,7 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc) if (dev->change_rx_flags) dev->change_rx_flags(dev, IFF_PROMISC); } + return 0; } /** @@ -2809,14 +2823,19 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc) * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. */ -void dev_set_promiscuity(struct net_device *dev, int inc) +int dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; + int err; - __dev_set_promiscuity(dev, inc); + err = __dev_set_promiscuity(dev, inc); + if (!err) + return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); + return err; } /** @@ -2829,22 +2848,38 @@ void dev_set_promiscuity(struct net_device *dev, int inc) * to all interfaces. Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. */ -void dev_set_allmulti(struct net_device *dev, int inc) +int dev_set_allmulti(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; ASSERT_RTNL(); dev->flags |= IFF_ALLMULTI; - if ((dev->allmulti += inc) == 0) - dev->flags &= ~IFF_ALLMULTI; + dev->allmulti += inc; + if (dev->allmulti == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch allmulti and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_ALLMULTI; + else { + dev->allmulti -= inc; + printk(KERN_WARNING "%s: allmulti touches roof, " + "set allmulti failed, allmulti feature of " + "device might be broken.\n", dev->name); + return -EOVERFLOW; + } + } if (dev->flags ^ old_flags) { if (dev->change_rx_flags) dev->change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); } + return 0; } /* -- cgit v1.2.1 From 0187bdfb05674147774ca79a79942537f3ad54bd Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 19 Jun 2008 16:15:47 -0700 Subject: net: Disable LRO on devices that are forwarding Large Receive Offload (LRO) is only appropriate for packets that are destined for the host, and should be disabled if received packets may be forwarded. It can also confuse the GSO on output. Add dev_disable_lro() function which uses the appropriate ethtool ops to disable LRO if enabled. Add calls to dev_disable_lro() in br_add_if() and functions that enable IPv4 and IPv6 forwarding. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a495f712d38c..f6944ecd5b2e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include @@ -1123,6 +1124,29 @@ int dev_close(struct net_device *dev) } +/** + * dev_disable_lro - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. 
+ */ +void dev_disable_lro(struct net_device *dev) +{ + if (dev->ethtool_ops && dev->ethtool_ops->get_flags && + dev->ethtool_ops->set_flags) { + u32 flags = dev->ethtool_ops->get_flags(dev); + if (flags & ETH_FLAG_LRO) { + flags &= ~ETH_FLAG_LRO; + dev->ethtool_ops->set_flags(dev, flags); + } + } + WARN_ON(dev->features & NETIF_F_LRO); +} +EXPORT_SYMBOL(dev_disable_lro); + + static int dev_boot_phase = 1; /* -- cgit v1.2.1 From 4b5a698ef423eebc37cfacc6d3376d6dffd5bf83 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 6 Jul 2008 15:49:08 -0700 Subject: net: fix dev_set_promiscuity() breakage Commit dad9b335 (netdevice: Fix promiscuity and allmulti overflow) broke dev_set_promiscuity() by returning on success without reprogramming the device. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index bfa9a6a951dd..75933932463d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2859,7 +2859,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc) int err; err = __dev_set_promiscuity(dev, inc); - if (!err) + if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); -- cgit v1.2.1 From bb949fbd1878973c3539d9aecff52f284482a937 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 16:55:56 -0700 Subject: netdev: Create netdev_queue abstraction. A netdev_queue is an entity managed by a qdisc. Currently there is one RX and one TX queue, and a netdev_queue merely contains a backpointer to the net_device. The Qdisc struct is augmented with a netdev_queue pointer as well. Eventually the 'dev' Qdisc member will go away and we will have the resulting hierarchy: net_device --> netdev_queue --> Qdisc Also, qdisc_alloc() and qdisc_create_dflt() now take a netdev_queue pointer argument. Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 75933932463d..9b281c906eb0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4072,6 +4072,12 @@ static struct net_device_stats *internal_stats(struct net_device *dev) return &dev->stats; } +static void netdev_init_queues(struct net_device *dev) +{ + dev->rx_queue.dev = dev; + dev->tx_queue.dev = dev; +} + /** * alloc_netdev_mq - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -4124,6 +4130,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->egress_subqueue_count = queue_count; dev->gso_max_size = GSO_MAX_SIZE; + netdev_init_queues(dev); + dev->get_stats = internal_stats; netpoll_netdev_init(dev); setup(dev); -- cgit v1.2.1 From dc2b48475a0a36f8b3bbb2da60d3a006dc5c2c84 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 17:18:23 -0700 Subject: netdev: Move queue_lock into struct netdev_queue. The lock is now an attribute of the device queue. One thing to notice is that "suspicious" places emerge which will need specific training about multiple queue handling. They are so marked with explicit "netdev->rx_queue" and "netdev->tx_queue" references. Signed-off-by: David S. 
Miller --- net/core/dev.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9b281c906eb0..05011048b86c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1667,6 +1667,7 @@ out_kfree_skb: int dev_queue_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; + struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; @@ -1699,14 +1700,15 @@ int dev_queue_xmit(struct sk_buff *skb) } gso: - spin_lock_prefetch(&dev->queue_lock); + txq = &dev->tx_queue; + spin_lock_prefetch(&txq->lock); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); - /* Updates of qdisc are serialized by queue_lock. + /* Updates of qdisc are serialized by queue->lock. * The struct Qdisc which is pointed to by qdisc is now a * rcu structure - it may be accessed without acquiring * a lock (but the structure may be stale.) The freeing of the @@ -1714,7 +1716,7 @@ gso: * more references to it. * * If the qdisc has an enqueue function, we still need to - * hold the queue_lock before calling it, since queue_lock + * hold the queue->lock before calling it, since queue->lock * also serializes access to the device queue. */ @@ -1724,19 +1726,19 @@ gso: #endif if (q->enqueue) { /* Grab device queue */ - spin_lock(&dev->queue_lock); + spin_lock(&txq->lock); q = dev->qdisc; if (q->enqueue) { /* reset queue_mapping to zero */ skb_set_queue_mapping(skb, 0); rc = q->enqueue(skb, q); qdisc_run(dev); - spin_unlock(&dev->queue_lock); + spin_unlock(&txq->lock); rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; goto out; } - spin_unlock(&dev->queue_lock); + spin_unlock(&txq->lock); } /* The device has no queue. Common case for software devices: @@ -1919,14 +1921,17 @@ static void net_tx_action(struct softirq_action *h) while (head) { struct net_device *dev = head; + struct netdev_queue *txq; head = head->next_sched; + txq = &dev->tx_queue; + smp_mb__before_clear_bit(); clear_bit(__LINK_STATE_SCHED, &dev->state); - if (spin_trylock(&dev->queue_lock)) { + if (spin_trylock(&txq->lock)) { qdisc_run(dev); - spin_unlock(&dev->queue_lock); + spin_unlock(&txq->lock); } else { netif_schedule(dev); } @@ -3787,7 +3792,6 @@ int register_netdevice(struct net_device *dev) BUG_ON(!dev_net(dev)); net = dev_net(dev); - spin_lock_init(&dev->queue_lock); spin_lock_init(&dev->_xmit_lock); netdev_set_lockdep_class(&dev->_xmit_lock, dev->type); dev->xmit_lock_owner = -1; @@ -4072,10 +4076,17 @@ static struct net_device_stats *internal_stats(struct net_device *dev) return &dev->stats; } +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue) +{ + spin_lock_init(&queue->lock); + queue->dev = dev; +} + static void netdev_init_queues(struct net_device *dev) { - dev->rx_queue.dev = dev; - dev->tx_queue.dev = dev; + netdev_init_one_queue(dev, &dev->rx_queue); + netdev_init_one_queue(dev, &dev->tx_queue); } /** -- cgit v1.2.1 From 555353cfa1aee293de445bfa6de43276138ddd82 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 17:33:13 -0700 Subject: netdev: The ingress_lock member is no longer needed. Every qdisc is associated with a queue, and in the case of ingress qdiscs that will now be netdev->rx_queue, so using that queue's lock is the thing to do. Signed-off-by: David S.
Miller --- net/core/dev.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 05011048b86c..2322fb69fd53 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2014,10 +2014,11 @@ static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, */ static int ing_filter(struct sk_buff *skb) { - struct Qdisc *q; struct net_device *dev = skb->dev; - int result = TC_ACT_OK; u32 ttl = G_TC_RTTL(skb->tc_verd); + struct netdev_queue *rxq; + int result = TC_ACT_OK; + struct Qdisc *q; if (MAX_RED_LOOP < ttl++) { printk(KERN_WARNING @@ -2029,10 +2030,12 @@ static int ing_filter(struct sk_buff *skb) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - spin_lock(&dev->ingress_lock); + rxq = &dev->rx_queue; + + spin_lock(&rxq->lock); if ((q = dev->qdisc_ingress) != NULL) result = q->enqueue(skb, q); - spin_unlock(&dev->ingress_lock); + spin_unlock(&rxq->lock); return result; } @@ -3795,7 +3798,6 @@ int register_netdevice(struct net_device *dev) spin_lock_init(&dev->_xmit_lock); netdev_set_lockdep_class(&dev->_xmit_lock, dev->type); dev->xmit_lock_owner = -1; - spin_lock_init(&dev->ingress_lock); dev->iflink = -1; -- cgit v1.2.1 From b0e1e6462df3c5944010b3328a546d8fe5d932cd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 17:42:10 -0700 Subject: netdev: Move rest of qdisc state into struct netdev_queue Now qdisc, qdisc_sleeping, and qdisc_list also live there. Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2322fb69fd53..ce79c28d739d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1720,14 +1720,14 @@ gso: * also serializes access to the device queue. */ - q = rcu_dereference(dev->qdisc); + q = rcu_dereference(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); #endif if (q->enqueue) { /* Grab device queue */ spin_lock(&txq->lock); - q = dev->qdisc; + q = txq->qdisc; if (q->enqueue) { /* reset queue_mapping to zero */ skb_set_queue_mapping(skb, 0); -- cgit v1.2.1 From 816f3258e70db38d6d92c8d871377179fd69160f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 22:49:00 -0700 Subject: netdev: Kill qdisc_ingress, use netdev->rx_queue.qdisc instead. Now that our qdisc management is bi-directional, per-queue, and fully orthogonal, there is no reason to have a special ingress qdisc pointer in struct net_device. Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ce79c28d739d..ab760a954d99 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2033,7 +2033,7 @@ static int ing_filter(struct sk_buff *skb) rxq = &dev->rx_queue; spin_lock(&rxq->lock); - if ((q = dev->qdisc_ingress) != NULL) + if ((q = rxq->qdisc) != NULL) result = q->enqueue(skb, q); spin_unlock(&rxq->lock); @@ -2044,7 +2044,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - if (!skb->dev->qdisc_ingress) + if (!skb->dev->rx_queue.qdisc) goto out; if (*pt_prev) { -- cgit v1.2.1 From ee609cb36220d18c0cf476b066a5ab7e6f6d3a69 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 22:58:37 -0700 Subject: netdev: Move next_sched into struct netdev_queue. 
We schedule queues, not the device, for output queue processing in BH. Signed-off-by: David S. Miller --- net/core/dev.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ab760a954d99..d6b8d3c3e6ec 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1323,13 +1323,14 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) void __netif_schedule(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) { + struct netdev_queue *txq = &dev->tx_queue; unsigned long flags; struct softnet_data *sd; local_irq_save(flags); sd = &__get_cpu_var(softnet_data); - dev->next_sched = sd->output_queue; - sd->output_queue = dev; + txq->next_sched = sd->output_queue; + sd->output_queue = txq; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } @@ -1912,7 +1913,7 @@ static void net_tx_action(struct softirq_action *h) } if (sd->output_queue) { - struct net_device *head; + struct netdev_queue *head; local_irq_disable(); head = sd->output_queue; @@ -1920,12 +1921,10 @@ static void net_tx_action(struct softirq_action *h) local_irq_enable(); while (head) { - struct net_device *dev = head; - struct netdev_queue *txq; + struct netdev_queue *txq = head; + struct net_device *dev = txq->dev; head = head->next_sched; - txq = &dev->tx_queue; - smp_mb__before_clear_bit(); clear_bit(__LINK_STATE_SCHED, &dev->state); @@ -4346,7 +4345,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, void *ocpu) { struct sk_buff **list_skb; - struct net_device **list_net; + struct netdev_queue **list_net; struct sk_buff *skb; unsigned int cpu, oldcpu = (unsigned long)ocpu; struct softnet_data *sd, *oldsd; -- cgit v1.2.1 From 86d804e10a37cd86f16bf72386c37e843a98a74b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 23:11:25 -0700 Subject: netdev: Make netif_schedule() routines work with netdev_queue objects. Only plain netif_schedule() remains taking a net_device, mostly as a compatibility item while we transition the rest of these interfaces. Everything else calls netif_schedule_queue() or __netif_schedule(), both of which take a netdev_queue pointer. Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d6b8d3c3e6ec..0dc888ad4217 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1320,12 +1320,13 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) } -void __netif_schedule(struct net_device *dev) +void __netif_schedule(struct netdev_queue *txq) { + struct net_device *dev = txq->dev; + if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) { - struct netdev_queue *txq = &dev->tx_queue; - unsigned long flags; struct softnet_data *sd; + unsigned long flags; local_irq_save(flags); sd = &__get_cpu_var(softnet_data); @@ -1932,7 +1933,7 @@ static void net_tx_action(struct softirq_action *h) qdisc_run(dev); spin_unlock(&txq->lock); } else { - netif_schedule(dev); + netif_schedule_queue(txq); } } } -- cgit v1.2.1 From eb6aafe3f843cb0e939546c03540a3b4911b6964 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 23:12:38 -0700 Subject: pkt_sched: Make qdisc_run take a netdev_queue. This allows us to use this calling convention all the way down into qdisc_restart(). Signed-off-by: David S.
Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0dc888ad4217..0218b0b9be80 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1734,7 +1734,7 @@ gso: /* reset queue_mapping to zero */ skb_set_queue_mapping(skb, 0); rc = q->enqueue(skb, q); - qdisc_run(dev); + qdisc_run(txq); spin_unlock(&txq->lock); rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; @@ -1930,7 +1930,7 @@ static void net_tx_action(struct softirq_action *h) clear_bit(__LINK_STATE_SCHED, &dev->state); if (spin_trylock(&txq->lock)) { - qdisc_run(dev); + qdisc_run(txq); spin_unlock(&txq->lock); } else { netif_schedule_queue(txq); -- cgit v1.2.1 From c773e847ea8f6812804e40f52399c6921a00eab1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 23:13:53 -0700 Subject: netdev: Move _xmit_lock and xmit_lock_owner into netdev_queue. Accesses are mostly structured such that when there are multiple TX queues the code transformations will be a little bit simpler. Signed-off-by: David S. Miller --- net/core/dev.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0218b0b9be80..a29a359b15d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -258,7 +258,7 @@ DEFINE_PER_CPU(struct softnet_data, softnet_data); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* - * register_netdevice() inits dev->_xmit_lock and sets lockdep class + * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ static const unsigned short netdev_lock_type[] = @@ -1758,19 +1758,19 @@ gso: if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - if (dev->xmit_lock_owner != cpu) { + if (txq->xmit_lock_owner != cpu) { - HARD_TX_LOCK(dev, cpu); + HARD_TX_LOCK(dev, txq, cpu); if (!netif_queue_stopped(dev) && !netif_subqueue_stopped(dev, skb)) { rc = 0; if (!dev_hard_start_xmit(skb, dev)) { - HARD_TX_UNLOCK(dev); + HARD_TX_UNLOCK(dev, txq); goto out; } } - HARD_TX_UNLOCK(dev); + HARD_TX_UNLOCK(dev, txq); if (net_ratelimit()) printk(KERN_CRIT "Virtual device %s asks to " "queue packet!\n", dev->name); @@ -3761,6 +3761,20 @@ static void rollback_registered(struct net_device *dev) dev_put(dev); } +static void __netdev_init_queue_locks_one(struct netdev_queue *dev_queue, + struct net_device *dev) +{ + spin_lock_init(&dev_queue->_xmit_lock); + netdev_set_lockdep_class(&dev_queue->_xmit_lock, dev->type); + dev_queue->xmit_lock_owner = -1; +} + +static void netdev_init_queue_locks(struct net_device *dev) +{ + __netdev_init_queue_locks_one(&dev->tx_queue, dev); + __netdev_init_queue_locks_one(&dev->rx_queue, dev); +} + /** * register_netdevice - register a network device * @dev: device to register @@ -3795,9 +3809,7 @@ int register_netdevice(struct net_device *dev) BUG_ON(!dev_net(dev)); net = dev_net(dev); - spin_lock_init(&dev->_xmit_lock); - netdev_set_lockdep_class(&dev->_xmit_lock, dev->type); - dev->xmit_lock_owner = -1; + netdev_init_queue_locks(dev); dev->iflink = -1; -- cgit v1.2.1 From bc1d0411b804ad190cdadabac48a10067f17b9e6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 14 Jul 2008 22:49:30 -0700 Subject: vlan: deliver packets received with VLAN acceleration to network taps When VLAN header stripping is used, packets currently bypass packet sockets (and other network taps) completely. 
For locally existing VLANs, they appear directly on the VLAN device, for unknown VLANs they are silently dropped. Add a new function netif_nit_deliver() to deliver incoming packets to all network interface taps and use it in __vlan_hwaccel_rx() to make VLAN packets visible on the underlying device. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a29a359b15d1..feaab4898a5b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2068,6 +2068,33 @@ out: } #endif +/* + * netif_nit_deliver - deliver received packets to network taps + * @skb: buffer + * + * This function is used to deliver incoming packets to network + * taps. It should be used when the normal netif_receive_skb path + * is bypassed, for example because of VLAN acceleration. + */ +void netif_nit_deliver(struct sk_buff *skb) +{ + struct packet_type *ptype; + + if (list_empty(&ptype_all)) + return; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if (!ptype->dev || ptype->dev == skb->dev) + deliver_skb(skb, ptype, skb->dev); + } + rcu_read_unlock(); +} + /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process -- cgit v1.2.1 From f1f28aa3510ddb84c966bac65611bb866c77a092 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 00:08:33 -0700 Subject: netdev: Add addr_list_lock to struct net_device. This will be used to protect the per-device unicast and multicast address lists, as well as the callbacks into the drivers which configure such state such as ->set_rx_mode() and ->set_multicast_list(). Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index feaab4898a5b..d933d1bfa6fa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3836,6 +3836,7 @@ int register_netdevice(struct net_device *dev) BUG_ON(!dev_net(dev)); net = dev_net(dev); + spin_lock_init(&dev->addr_list_lock); netdev_init_queue_locks(dev); dev->iflink = -1; -- cgit v1.2.1 From e308a5d806c852f56590ffdd3834d0df0cbed8d7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 00:13:44 -0700 Subject: netdev: Add netdev->addr_list_lock protection. Add netif_addr_{lock,unlock}{,_bh}() helpers. Use them to protect operations that operate on or read the network device unicast and multicast address lists. Also use them in cases where the code simply wants to block calls into the driver's ->set_rx_mode() and ->set_multicast_list() methods. Signed-off-by: David S. 
Miller --- net/core/dev.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d933d1bfa6fa..ef1502d71f25 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2982,7 +2982,9 @@ void __dev_set_rx_mode(struct net_device *dev) void dev_set_rx_mode(struct net_device *dev) { netif_tx_lock_bh(dev); + netif_addr_lock(dev); __dev_set_rx_mode(dev); + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); } @@ -3062,9 +3064,11 @@ int dev_unicast_delete(struct net_device *dev, void *addr, int alen) ASSERT_RTNL(); netif_tx_lock_bh(dev); + netif_addr_lock(dev); err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); if (!err) __dev_set_rx_mode(dev); + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return err; } @@ -3088,9 +3092,11 @@ int dev_unicast_add(struct net_device *dev, void *addr, int alen) ASSERT_RTNL(); netif_tx_lock_bh(dev); + netif_addr_lock(dev); err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); if (!err) __dev_set_rx_mode(dev); + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return err; } @@ -3159,10 +3165,12 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from) int err = 0; netif_tx_lock_bh(to); + netif_addr_lock(to); err = __dev_addr_sync(&to->uc_list, &to->uc_count, &from->uc_list, &from->uc_count); if (!err) __dev_set_rx_mode(to); + netif_addr_unlock(to); netif_tx_unlock_bh(to); return err; } @@ -3180,13 +3188,17 @@ EXPORT_SYMBOL(dev_unicast_sync); void dev_unicast_unsync(struct net_device *to, struct net_device *from) { netif_tx_lock_bh(from); + netif_addr_lock(from); netif_tx_lock_bh(to); + netif_addr_lock(to); __dev_addr_unsync(&to->uc_list, &to->uc_count, &from->uc_list, &from->uc_count); __dev_set_rx_mode(to); + netif_addr_unlock(to); netif_tx_unlock_bh(to); + netif_addr_unlock(from); netif_tx_unlock_bh(from); } EXPORT_SYMBOL(dev_unicast_unsync); @@ -3208,6 +3220,7 @@ static void __dev_addr_discard(struct dev_addr_list **list) static void dev_addr_discard(struct net_device *dev) { netif_tx_lock_bh(dev); + netif_addr_lock(dev); __dev_addr_discard(&dev->uc_list); dev->uc_count = 0; @@ -3215,6 +3228,7 @@ static void dev_addr_discard(struct net_device *dev) __dev_addr_discard(&dev->mc_list); dev->mc_count = 0; + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); } -- cgit v1.2.1 From b9e40857682ecfc5bcd0356a23ff409883ffb982 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 00:15:08 -0700 Subject: netdev: Do not use TX lock to protect address lists. Now that we have a specific lock to protect the network device unicast and multicast lists, remove extraneous grabs of the TX lock in cases where the code only needs address list protection. Signed-off-by: David S. 
Miller --- net/core/dev.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ef1502d71f25..9b49f74a9820 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2981,11 +2981,9 @@ void __dev_set_rx_mode(struct net_device *dev) void dev_set_rx_mode(struct net_device *dev) { - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); __dev_set_rx_mode(dev); - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); } int __dev_addr_delete(struct dev_addr_list **list, int *count, @@ -3063,13 +3061,11 @@ int dev_unicast_delete(struct net_device *dev, void *addr, int alen) ASSERT_RTNL(); - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_delete); @@ -3091,13 +3087,11 @@ int dev_unicast_add(struct net_device *dev, void *addr, int alen) ASSERT_RTNL(); - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_add); @@ -3164,14 +3158,12 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from) { int err = 0; - netif_tx_lock_bh(to); - netif_addr_lock(to); + netif_addr_lock_bh(to); err = __dev_addr_sync(&to->uc_list, &to->uc_count, &from->uc_list, &from->uc_count); if (!err) __dev_set_rx_mode(to); - netif_addr_unlock(to); - netif_tx_unlock_bh(to); + netif_addr_unlock_bh(to); return err; } EXPORT_SYMBOL(dev_unicast_sync); @@ -3187,9 +3179,7 @@ EXPORT_SYMBOL(dev_unicast_sync); */ void dev_unicast_unsync(struct net_device *to, struct net_device *from) { - netif_tx_lock_bh(from); - netif_addr_lock(from); - netif_tx_lock_bh(to); + netif_addr_lock_bh(from); netif_addr_lock(to); __dev_addr_unsync(&to->uc_list, &to->uc_count, @@ -3197,9 +3187,7 @@ void dev_unicast_unsync(struct net_device *to, struct net_device *from) __dev_set_rx_mode(to); netif_addr_unlock(to); - netif_tx_unlock_bh(to); - netif_addr_unlock(from); - netif_tx_unlock_bh(from); + netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_unicast_unsync); @@ -3219,8 +3207,7 @@ static void __dev_addr_discard(struct dev_addr_list **list) static void dev_addr_discard(struct net_device *dev) { - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); __dev_addr_discard(&dev->uc_list); dev->uc_count = 0; @@ -3228,8 +3215,7 @@ static void dev_addr_discard(struct net_device *dev) __dev_addr_discard(&dev->mc_list); dev->mc_count = 0; - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); } unsigned dev_get_flags(const struct net_device *dev) -- cgit v1.2.1 From e8a0464cc950972824e2e128028ae3db666ec1ed Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 17 Jul 2008 00:34:19 -0700 Subject: netdev: Allocate multiple queues for TX. alloc_netdev_mq() now allocates an array of netdev_queue structures for TX, based upon the queue_count argument. Furthermore, all accesses to the TX queues are now vectored through the netdev_get_tx_queue() and netdev_for_each_tx_queue() interfaces. 
This makes it easy to grep the tree for all things that want to get to a TX queue of a net device. Problem spots which are not really multiqueue aware yet, and only work with one queue, can easily be spotted by grepping for all netdev_get_tx_queue() calls that pass in a zero index. Signed-off-by: David S. Miller --- net/core/dev.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9b49f74a9820..69378f250695 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1666,6 +1666,12 @@ out_kfree_skb: * --BLG */ +static struct netdev_queue *dev_pick_tx(struct net_device *dev, + struct sk_buff *skb) +{ + return netdev_get_tx_queue(dev, 0); +} + int dev_queue_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -1702,7 +1708,7 @@ int dev_queue_xmit(struct sk_buff *skb) } gso: - txq = &dev->tx_queue; + txq = dev_pick_tx(dev, skb); spin_lock_prefetch(&txq->lock); /* Disable soft irqs for various locks below. Also @@ -3788,8 +3794,9 @@ static void rollback_registered(struct net_device *dev) dev_put(dev); } -static void __netdev_init_queue_locks_one(struct netdev_queue *dev_queue, - struct net_device *dev) +static void __netdev_init_queue_locks_one(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_unused) { spin_lock_init(&dev_queue->_xmit_lock); netdev_set_lockdep_class(&dev_queue->_xmit_lock, dev->type); @@ -3798,8 +3805,8 @@ static void __netdev_init_queue_locks_one(struct netdev_queue *dev_queue, static void netdev_init_queue_locks(struct net_device *dev) { - __netdev_init_queue_locks_one(&dev->tx_queue, dev); - __netdev_init_queue_locks_one(&dev->rx_queue, dev); + netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); + __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); } /** @@ -4119,7 +4126,8 @@ static struct net_device_stats *internal_stats(struct net_device *dev) } static void netdev_init_one_queue(struct net_device *dev, - struct netdev_queue *queue) + struct netdev_queue *queue, + void *_unused) { spin_lock_init(&queue->lock); queue->dev = dev; @@ -4127,8 +4135,8 @@ static void netdev_init_one_queue(struct net_device *dev, static void netdev_init_queues(struct net_device *dev) { - netdev_init_one_queue(dev, &dev->rx_queue); - netdev_init_one_queue(dev, &dev->tx_queue); + netdev_init_one_queue(dev, &dev->rx_queue, NULL); + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); } /** @@ -4145,9 +4153,10 @@ static void netdev_init_queues(struct net_device *dev) struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, void (*setup)(struct net_device *), unsigned int queue_count) { - void *p; + struct netdev_queue *tx; struct net_device *dev; int alloc_size; + void *p; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -4167,11 +4176,22 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, return NULL; } + tx = kzalloc(sizeof(struct netdev_queue) * queue_count, GFP_KERNEL); + if (!tx) { + printk(KERN_ERR "alloc_netdev: Unable to allocate " + "tx qdiscs.\n"); + kfree(p); + return NULL; + } + dev = (struct net_device *) (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); dev->padded = (char *)dev - (char *)p; dev_net_set(dev, &init_net); + dev->_tx = tx; + dev->num_tx_queues = queue_count; + if (sizeof_priv) { dev->priv = ((char *)dev + ((sizeof(struct net_device) + @@ -4205,6 +4225,8 @@ void free_netdev(struct net_device *dev) { release_net(dev_net(dev)); + kfree(dev->_tx); + /* Compatibility 
with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); -- cgit v1.2.1 From fd2ea0a79faad824258af5dcec1927aa24d81c16 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 17 Jul 2008 01:56:23 -0700 Subject: net: Use queue aware tests throughout. This effectively "flips the switch" by making the core networking and multiqueue-aware drivers use the new TX multiqueue structures. Non-multiqueue drivers need no changes. The interfaces they use such as netif_stop_queue() degenerate into an operation on TX queue zero. So everything "just works" for them. Code that really wants to do "X" to all TX queues now invokes a routine that does so, such as netif_tx_wake_all_queues(), netif_tx_stop_all_queues(), etc. pktgen and netpoll required a little bit more surgery than the others. In particular the pktgen changes, whilst functional, could be largely improved. The initial check in pktgen_xmit() will sometimes check the wrong queue, which is mostly harmless. The thing to do is probably to invoke fill_packet() earlier. The bulk of the netpoll changes is to make the code operate solely on the TX queue indicated by the SKB queue mapping. Setting of the SKB queue mapping is entirely confined inside of net/core/dev.c:dev_pick_tx(). If we end up needing any kind of special semantics (drops, for example) it will be implemented here. Finally, we now have a "real_num_tx_queues" which is where the driver indicates how many TX queues are actually active. With IGB changes from Jeff Kirsher. Signed-off-by: David S. Miller --- net/core/dev.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 69378f250695..f027a1ac4fbb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1598,7 +1598,8 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) { if (likely(!skb->next)) { if (!list_empty(&ptype_all)) @@ -1627,9 +1628,7 @@ gso: skb->next = nskb; return rc; } - if (unlikely((netif_queue_stopped(dev) || - netif_subqueue_stopped(dev, skb)) && - skb->next)) + if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); @@ -1669,7 +1668,10 @@ out_kfree_skb: static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { - return netdev_get_tx_queue(dev, 0); + u16 queue_index = 0; + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); } int dev_queue_xmit(struct sk_buff *skb) @@ -1737,8 +1739,6 @@ gso: spin_lock(&txq->lock); q = txq->qdisc; if (q->enqueue) { - /* reset queue_mapping to zero */ - skb_set_queue_mapping(skb, 0); rc = q->enqueue(skb, q); qdisc_run(txq); spin_unlock(&txq->lock); @@ -1768,10 +1768,9 @@ gso: HARD_TX_LOCK(dev, txq, cpu); - if (!netif_queue_stopped(dev) && - !netif_subqueue_stopped(dev, skb)) { + if (!netif_tx_queue_stopped(txq)) { rc = 0; - if (!dev_hard_start_xmit(skb, dev)) { + if (!dev_hard_start_xmit(skb, dev, txq)) { HARD_TX_UNLOCK(dev, txq); goto out; } @@ -4160,8 +4159,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, BUG_ON(strlen(name) >= sizeof(dev->name)); - alloc_size = sizeof(struct net_device) + - sizeof(struct net_device_subqueue) * (queue_count - 1); + alloc_size = sizeof(struct net_device); if (sizeof_priv) { /*
ensure 32-byte alignment of private area */ alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; @@ -4191,16 +4189,14 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->_tx = tx; dev->num_tx_queues = queue_count; + dev->real_num_tx_queues = queue_count; if (sizeof_priv) { dev->priv = ((char *)dev + - ((sizeof(struct net_device) + - (sizeof(struct net_device_subqueue) * - (queue_count - 1)) + NETDEV_ALIGN_CONST) + ((sizeof(struct net_device) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST)); } - dev->egress_subqueue_count = queue_count; dev->gso_max_size = GSO_MAX_SIZE; netdev_init_queues(dev); -- cgit v1.2.1 From eae792b722fef08dcf3aee88266ee7def9710757 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 03:03:33 -0700 Subject: netdev: Add netdev->select_queue() method. Devices or device layers can set this to control the queue selection performed by dev_pick_tx(). This function runs under RCU protection, which allows overriding functions to have some way of synchronizing with things like dynamic ->real_num_tx_queues adjustments. This makes the spinlock prefetch in dev_queue_xmit() a little bit less effective, but that's the price right now for correctness. Signed-off-by: David S. Miller --- net/core/dev.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f027a1ac4fbb..7ca9564d2f44 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1670,6 +1670,9 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, { u16 queue_index = 0; + if (dev->select_queue) + queue_index = dev->select_queue(dev, skb); + skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } @@ -1710,14 +1713,14 @@ int dev_queue_xmit(struct sk_buff *skb) } gso: - txq = dev_pick_tx(dev, skb); - spin_lock_prefetch(&txq->lock); - /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); + txq = dev_pick_tx(dev, skb); + spin_lock_prefetch(&txq->lock); + /* Updates of qdisc are serialized by queue->lock. * The struct Qdisc which is pointed to by qdisc is now a * rcu structure - it may be accessed without acquiring -- cgit v1.2.1 From 8f0f2223cc08a5ae9a77f40edfe02e8a9f1abd77 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 03:47:03 -0700 Subject: net: Implement simple sw TX hashing. It just xor hashes over IPv4/IPv6 addresses and ports of transport. The only assumption it makes is that skb_network_header() is set correctly. With bug fixes from Eric Dumazet. Signed-off-by: David S. 
Miller --- net/core/dev.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7ca9564d2f44..467bfb325123 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -121,6 +121,9 @@ #include #include #include +#include +#include +#include #include "net-sysfs.h" @@ -1665,6 +1668,53 @@ out_kfree_skb: * --BLG */ +static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) +{ + u32 *addr, *ports, hash, ihl; + u8 ip_proto; + int alen; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + ip_proto = ip_hdr(skb)->protocol; + addr = &ip_hdr(skb)->saddr; + ihl = ip_hdr(skb)->ihl; + alen = 2; + break; + case __constant_htons(ETH_P_IPV6): + ip_proto = ipv6_hdr(skb)->nexthdr; + addr = &ipv6_hdr(skb)->saddr.s6_addr32[0]; + ihl = (40 >> 2); + alen = 8; + break; + default: + return 0; + } + + ports = (u32 *) (skb_network_header(skb) + (ihl * 4)); + + hash = 0; + while (alen--) + hash ^= *addr++; + + switch (ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_DCCP: + case IPPROTO_ESP: + case IPPROTO_AH: + case IPPROTO_SCTP: + case IPPROTO_UDPLITE: + hash ^= *ports; + break; + + default: + break; + } + + return hash % dev->real_num_tx_queues; +} + static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { @@ -1672,6 +1722,8 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, if (dev->select_queue) queue_index = dev->select_queue(dev, skb); + else if (dev->real_num_tx_queues > 1) + queue_index = simple_tx_hash(dev, skb); skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); -- cgit v1.2.1 From 37437bb2e1ae8af470dfcd5b4ff454110894ccaf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 16 Jul 2008 02:15:04 -0700 Subject: pkt_sched: Schedule qdiscs instead of netdev_queue. When we have shared qdiscs, packets come out of the qdiscs for multiple transmit queues. Therefore it doesn't make any sense to schedule the transmit queue when logically we cannot know ahead of time the TX queue of the SKB that the qdisc->dequeue() will give us. Just for sanity I added a BUG check to make sure we never get into a state where the noop_qdisc is scheduled. Signed-off-by: David S. Miller --- net/core/dev.c | 68 ++++++++++++++++++++++++---------------------------------- 1 file changed, 28 insertions(+), 40 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 467bfb325123..0b909b74f698 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1323,18 +1323,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) } -void __netif_schedule(struct netdev_queue *txq) +void __netif_schedule(struct Qdisc *q) { - struct net_device *dev = txq->dev; + BUG_ON(q == &noop_qdisc); - if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) { + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = &__get_cpu_var(softnet_data); - txq->next_sched = sd->output_queue; - sd->output_queue = txq; + q->next_sched = sd->output_queue; + sd->output_queue = q; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } @@ -1771,37 +1771,23 @@ gso: rcu_read_lock_bh(); txq = dev_pick_tx(dev, skb); - spin_lock_prefetch(&txq->lock); - - /* Updates of qdisc are serialized by queue->lock. 
- * The struct Qdisc which is pointed to by qdisc is now a - * rcu structure - it may be accessed without acquiring - * a lock (but the structure may be stale.) The freeing of the - * qdisc will be deferred until it's known that there are no - * more references to it. - * - * If the qdisc has an enqueue function, we still need to - * hold the queue->lock before calling it, since queue->lock - * also serializes access to the device queue. - */ - q = rcu_dereference(txq->qdisc); + #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); #endif if (q->enqueue) { - /* Grab device queue */ - spin_lock(&txq->lock); - q = txq->qdisc; - if (q->enqueue) { - rc = q->enqueue(skb, q); - qdisc_run(txq); - spin_unlock(&txq->lock); - - rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; - goto out; - } - spin_unlock(&txq->lock); + spinlock_t *root_lock = qdisc_root_lock(q); + + spin_lock(root_lock); + + rc = q->enqueue(skb, q); + qdisc_run(q); + + spin_unlock(root_lock); + + rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; + goto out; } /* The device has no queue. Common case for software devices: @@ -1974,7 +1960,7 @@ static void net_tx_action(struct softirq_action *h) } if (sd->output_queue) { - struct netdev_queue *head; + struct Qdisc *head; local_irq_disable(); head = sd->output_queue; @@ -1982,18 +1968,20 @@ static void net_tx_action(struct softirq_action *h) local_irq_enable(); while (head) { - struct netdev_queue *txq = head; - struct net_device *dev = txq->dev; + struct Qdisc *q = head; + spinlock_t *root_lock; + head = head->next_sched; smp_mb__before_clear_bit(); - clear_bit(__LINK_STATE_SCHED, &dev->state); + clear_bit(__QDISC_STATE_SCHED, &q->state); - if (spin_trylock(&txq->lock)) { - qdisc_run(txq); - spin_unlock(&txq->lock); + root_lock = qdisc_root_lock(q); + if (spin_trylock(root_lock)) { + qdisc_run(q); + spin_unlock(root_lock); } else { - netif_schedule_queue(txq); + __netif_schedule(q); } } } @@ -4459,7 +4447,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, void *ocpu) { struct sk_buff **list_skb; - struct netdev_queue **list_net; + struct Qdisc **list_net; struct sk_buff *skb; unsigned int cpu, oldcpu = (unsigned long)ocpu; struct softnet_data *sd, *oldsd; -- cgit v1.2.1 From ead81cc5fc6d996db6afb20f211241612610a07a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 17 Jul 2008 00:50:32 -0700 Subject: netdevice: Move qdisc_list back into net_device proper. And give it its own lock. Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0b909b74f698..6741e344ac59 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3886,6 +3886,8 @@ int register_netdevice(struct net_device *dev) net = dev_net(dev); spin_lock_init(&dev->addr_list_lock); + spin_lock_init(&dev->qdisc_list_lock); + INIT_LIST_HEAD(&dev->qdisc_list); netdev_init_queue_locks(dev); dev->iflink = -1; -- cgit v1.2.1 From 83874000929ed63aef30b44083a9f713135ff040 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 17 Jul 2008 00:53:03 -0700 Subject: pkt_sched: Kill netdev_queue lock. We can simply use the qdisc->q.lock for all of the qdisc tree synchronization. Signed-off-by: David S.
Miller --- net/core/dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6741e344ac59..32a13772c1cb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2080,10 +2080,12 @@ static int ing_filter(struct sk_buff *skb) rxq = &dev->rx_queue; - spin_lock(&rxq->lock); - if ((q = rxq->qdisc) != NULL) + q = rxq->qdisc; + if (q) { + spin_lock(qdisc_lock(q)); result = q->enqueue(skb, q); - spin_unlock(&rxq->lock); + spin_unlock(qdisc_lock(q)); + } return result; } @@ -4173,7 +4175,6 @@ static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) { - spin_lock_init(&queue->lock); queue->dev = dev; } -- cgit v1.2.1 From 3072367300aa8c779e3a14ee8e89de079e90f3ad Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 18 Jul 2008 22:50:15 -0700 Subject: pkt_sched: Manage qdisc list inside of root qdisc. Idea is from Patrick McHardy. Instead of managing the list of qdiscs on the device level, manage it in the root qdisc of a netdev_queue. This solves all kinds of visibility issues during qdisc destruction. The way to iterate over all qdiscs of a netdev_queue is to visit the netdev_queue->qdisc, and then traverse its list. The only special case is to ignore builtin qdiscs at the root when dumping or doing a qdisc_lookup(). That was not needed previously because builtin qdiscs were not added to the device's qdisc_list. Signed-off-by: David S. Miller --- net/core/dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e54acde839da..065b9817e209 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3888,8 +3888,6 @@ int register_netdevice(struct net_device *dev) net = dev_net(dev); spin_lock_init(&dev->addr_list_lock); - spin_lock_init(&dev->qdisc_list_lock); - INIT_LIST_HEAD(&dev->qdisc_list); netdev_init_queue_locks(dev); dev->iflink = -1; -- cgit v1.2.1 From 5f86173bdf15981ca49d0434f638b68f70a35644 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sun, 20 Jul 2008 00:08:04 -0700 Subject: net_sched: Add qdisc_enqueue wrapper Signed-off-by: Jussi Kivilinna Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 065b9817e209..2eed17bcb2dd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1781,7 +1781,7 @@ gso: spin_lock(root_lock); - rc = q->enqueue(skb, q); + rc = qdisc_enqueue_root(skb, q); qdisc_run(q); spin_unlock(root_lock); @@ -2083,7 +2083,7 @@ static int ing_filter(struct sk_buff *skb) q = rxq->qdisc; if (q) { spin_lock(qdisc_lock(q)); - result = q->enqueue(skb, q); + result = qdisc_enqueue_root(skb, q); spin_unlock(qdisc_lock(q)); } -- cgit v1.2.1
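
The promiscuity/allmulti overflow fix near the top of this series changes dev_set_promiscuity() and dev_set_allmulti() from void to int so that callers can detect -EOVERFLOW. A minimal caller-side sketch follows; the function name and the rollback policy are illustrative assumptions, not code from this series, but the return-value convention (0 on success, negative errno on overflow) and the RTNL requirement come from the patches above:

/*
 * Minimal sketch of a caller honoring the new int return values.
 * example_enable_rx_everything() is hypothetical; only the
 * dev_set_promiscuity()/dev_set_allmulti() semantics are from the
 * series (0 on success, -EOVERFLOW if a counter would wrap, and a
 * negative inc drops the reference again). Both helpers
 * ASSERT_RTNL(), so RTNL must already be held here.
 */
static int example_enable_rx_everything(struct net_device *dev)
{
	int err;

	err = dev_set_promiscuity(dev, 1);
	if (err < 0)
		return err;	/* counter at the roof, nothing was changed */

	err = dev_set_allmulti(dev, 1);
	if (err < 0) {
		dev_set_promiscuity(dev, -1);	/* roll back the first step */
		return err;
	}
	return 0;
}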
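
Likewise, dev_pick_tx() above first consults an optional dev->select_queue hook and only falls back to simple_tx_hash() when the device has more than one real TX queue. Below is a sketch of such a driver-supplied hook; the priority-based policy is invented for illustration, while the u16 (*)(struct net_device *, struct sk_buff *) shape follows from the call site in dev_pick_tx(), which does not clamp the returned index, so it must stay below dev->real_num_tx_queues:

/*
 * Hypothetical queue-selection hook. Only the signature and the
 * "index < real_num_tx_queues" requirement follow from the
 * dev_pick_tx() code above; the priority split is made up.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	/* Steer high-priority traffic to queue 0 ... */
	if (skb->priority >= 6)
		return 0;
	/* ... and everything else to the highest-numbered queue. */
	return (u16)(dev->real_num_tx_queues - 1);
}

A driver would install this before register_netdevice(), e.g. dev->select_queue = example_select_queue; in its setup routine.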