From 6f6f467eaaa0ac4fb77714d0172d65c781dabb8c Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 30 Aug 2013 06:12:40 -0700
Subject: Drop remaining references to H8/300 architecture

With the architecture gone, any references to it are no longer needed.

Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 include/uapi/linux/audit.h  | 1 -
 include/uapi/linux/elf-em.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 75cef3fd97ad..db0b825b4810 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -329,7 +329,6 @@ enum {
 #define AUDIT_ARCH_ARMEB	(EM_ARM)
 #define AUDIT_ARCH_CRIS		(EM_CRIS|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_FRV		(EM_FRV)
-#define AUDIT_ARCH_H8300	(EM_H8_300)
 #define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_IA64		(EM_IA_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_M32R		(EM_M32R)
diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h
index 59c17a2d38ad..01529bd96438 100644
--- a/include/uapi/linux/elf-em.h
+++ b/include/uapi/linux/elf-em.h
@@ -31,7 +31,6 @@
 #define EM_CRIS		76	/* Axis Communications 32-bit embedded processor */
 #define EM_V850		87	/* NEC v850 */
 #define EM_M32R		88	/* Renesas M32R */
-#define EM_H8_300	46	/* Renesas H8/300,300H,H8S */
 #define EM_MN10300	89	/* Panasonic/MEI MN10300, AM33 */
 #define EM_BLACKFIN     106     /* ADI Blackfin Processor */
 #define EM_TI_C6000	140	/* TI C6X DSPs */
-- 
cgit v1.2.3


From df62cdf348c91baac61b4cb19d19ea1ef87b271e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 19 Sep 2013 09:10:20 -0700
Subject: net_sched: htb: support of 64bit rates

HTB already can deal with 64bit rates, we only have to add two new
attributes so that tc can use them to break the current 32bit ABI
barrier.

TCA_HTB_RATE64 : class rate  (in bytes per second)
TCA_HTB_CEIL64 : class ceil  (in bytes per second)

This allows us to setup HTB on 40Gbps links, as 32bit limit is
actually ~34Gbps

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  2 ++
 net/sched/sch_htb.c            | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 9b829134d422..f2624b549e61 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -357,6 +357,8 @@ enum {
 	TCA_HTB_CTAB,
 	TCA_HTB_RTAB,
 	TCA_HTB_DIRECT_QLEN,
+	TCA_HTB_RATE64,
+	TCA_HTB_CEIL64,
 	__TCA_HTB_MAX,
 };
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 6b126f6c9957..0e1e38b40025 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -997,6 +997,8 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
+	[TCA_HTB_RATE64] = { .type = NLA_U64 },
+	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
 };
 
 static void htb_work_func(struct work_struct *work)
@@ -1114,6 +1116,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 	opt.level = cl->level;
 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
+	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
+	    nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps))
+		goto nla_put_failure;
+	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
+	    nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps))
+		goto nla_put_failure;
 
 	nla_nest_end(skb, nest);
 	spin_unlock_bh(root_lock);
@@ -1332,6 +1340,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
 	struct nlattr *tb[TCA_HTB_MAX + 1];
 	struct tc_htb_opt *hopt;
+	u64 rate64, ceil64;
 
 	/* extract all subattrs from opt attr */
 	if (!opt)
@@ -1491,8 +1500,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 			cl->prio = TC_HTB_NUMPRIO - 1;
 	}
 
-	psched_ratecfg_precompute(&cl->rate, &hopt->rate, 0);
-	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, 0);
+	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+
+	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+
+	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
+	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
 
 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
-- 
cgit v1.2.3


From 2485602f1af209aaef3f394ac8336a67cb8742aa Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 18 Aug 2013 22:41:37 +0200
Subject: can: add explicit copyrights to can headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These files are copied to the source code of user space applications (in
this case can-utils) and so it makes sense to mention explicitly their
copyright. I added the terms of C code that was introduced in the same
commit as these headers.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
Acked-by: Urs Thuermann <urs.thuermann@volkswagen.de>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can/bcm.h   | 32 ++++++++++++++++++++++++++++++++
 include/uapi/linux/can/error.h | 32 ++++++++++++++++++++++++++++++++
 include/uapi/linux/can/gw.h    | 32 ++++++++++++++++++++++++++++++++
 include/uapi/linux/can/raw.h   | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 128 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can/bcm.h b/include/uapi/linux/can/bcm.h
index 3ebe387fea4d..382251a1d214 100644
--- a/include/uapi/linux/can/bcm.h
+++ b/include/uapi/linux/can/bcm.h
@@ -7,6 +7,38 @@
  * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
  * All rights reserved.
  *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
  */
 
 #ifndef CAN_BCM_H
diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h
index 7b7148bded71..b63204545320 100644
--- a/include/uapi/linux/can/error.h
+++ b/include/uapi/linux/can/error.h
@@ -7,6 +7,38 @@
  * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
  * All rights reserved.
  *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
  */
 
 #ifndef CAN_ERROR_H
diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h
index 4e27c82b564a..844c8964bdfe 100644
--- a/include/uapi/linux/can/gw.h
+++ b/include/uapi/linux/can/gw.h
@@ -7,6 +7,38 @@
  * Copyright (c) 2011 Volkswagen Group Electronic Research
  * All rights reserved.
  *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
  */
 
 #ifndef CAN_GW_H
diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h
index a814062b0719..c7d8c334e0ce 100644
--- a/include/uapi/linux/can/raw.h
+++ b/include/uapi/linux/can/raw.h
@@ -8,6 +8,38 @@
  * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
  * All rights reserved.
  *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
  */
 
 #ifndef CAN_RAW_H
-- 
cgit v1.2.3


From 1c2da13c21a14e9db99c701412ac9069d5b91cf5 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 7 Sep 2013 21:34:38 +0200
Subject: can: add explicit copyrights to can's netlink header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This file is copied to the source code of user space applications (in
this case can-utils) and so it makes sense to mention explicitly their
copyright.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Wolfgang Grandegger <wg@grandegger.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can/netlink.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h
index 14966ddb7df1..df944ed206a8 100644
--- a/include/uapi/linux/can/netlink.h
+++ b/include/uapi/linux/can/netlink.h
@@ -5,6 +5,14 @@
  *
  * Copyright (c) 2009 Wolfgang Grandegger <wg@grandegger.com>
  *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the version 2 of the GNU General Public License
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
  */
 
 #ifndef CAN_NETLINK_H
-- 
cgit v1.2.3


From ad4d35f865408a494f0a4c02b1c7ebd3f80f5dbf Mon Sep 17 00:00:00 2001
From: Yijing Wang <wangyijing@huawei.com>
Date: Thu, 5 Sep 2013 15:55:26 +0800
Subject: [SCSI] csiostor: Use pcie_capability_clear_and_set_word() to simplify
 code

pci_is_pcie() and pcie_capability_clear_and_set_word() make it trivial
to set the PCIe Completion Timeout, so just fold the
csio_set_pcie_completion_timeout() function into its caller.

[bhelgaas: changelog, fold csio_set_pcie_completion_timeout() into caller]
Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: Naresh Kumar Inna <naresh@chelsio.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jesper Juhl <jj@chaosbits.net>
---
 drivers/scsi/csiostor/csio_hw.c | 22 ++++------------------
 include/uapi/linux/pci_regs.h   |  3 ++-
 2 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/scsi/csiostor/csio_hw.c b/drivers/scsi/csiostor/csio_hw.c
index 0eb35b9b3784..0eaec4748957 100644
--- a/drivers/scsi/csiostor/csio_hw.c
+++ b/drivers/scsi/csiostor/csio_hw.c
@@ -852,22 +852,6 @@ csio_hw_get_flash_params(struct csio_hw *hw)
 	return 0;
 }
 
-static void
-csio_set_pcie_completion_timeout(struct csio_hw *hw, u8 range)
-{
-	uint16_t val;
-	int pcie_cap;
-
-	if (!csio_pci_capability(hw->pdev, PCI_CAP_ID_EXP, &pcie_cap)) {
-		pci_read_config_word(hw->pdev,
-				     pcie_cap + PCI_EXP_DEVCTL2, &val);
-		val &= 0xfff0;
-		val |= range ;
-		pci_write_config_word(hw->pdev,
-				      pcie_cap + PCI_EXP_DEVCTL2, val);
-	}
-}
-
 /*****************************************************************************/
 /* HW State machine assists                                                  */
 /*****************************************************************************/
@@ -2069,8 +2053,10 @@ csio_hw_configure(struct csio_hw *hw)
 		goto out;
 	}
 
-	/* Set pci completion timeout value to 4 seconds. */
-	csio_set_pcie_completion_timeout(hw, 0xd);
+	/* Set PCIe completion timeout to 4 seconds */
+	if (pci_is_pcie(hw->pdev))
+		pcie_capability_clear_and_set_word(hw->pdev, PCI_EXP_DEVCTL2,
+				PCI_EXP_DEVCTL2_COMP_TIMEOUT, 0xd);
 
 	hw->chip_ops->chip_set_mem_win(hw, MEMWIN_CSIOSTOR);
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index baa7852468ef..1a38377a0032 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -558,7 +558,8 @@
 #define  PCI_EXP_DEVCAP2_OBFF_MSG	0x00040000 /* New message signaling */
 #define  PCI_EXP_DEVCAP2_OBFF_WAKE	0x00080000 /* Re-use WAKE# for OBFF */
 #define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
-#define  PCI_EXP_DEVCTL2_ARI		0x20	/* Alternative Routing-ID */
+#define  PCI_EXP_DEVCTL2_COMP_TIMEOUT	0x000f	/* Completion Timeout Value */
+#define  PCI_EXP_DEVCTL2_ARI		0x0020	/* Alternative Routing-ID */
 #define  PCI_EXP_DEVCTL2_IDO_REQ_EN	0x0100	/* Allow IDO for requests */
 #define  PCI_EXP_DEVCTL2_IDO_CMP_EN	0x0200	/* Allow IDO for completions */
 #define  PCI_EXP_DEVCTL2_LTR_EN		0x0400	/* Enable LTR mechanism */
-- 
cgit v1.2.3


From f36f8c75ae2e7d4da34f4c908cebdb4aa42c977e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 24 Sep 2013 10:35:19 +0100
Subject: KEYS: Add per-user_namespace registers for persistent per-UID
 kerberos caches

Add support for per-user_namespace registers of persistent per-UID kerberos
caches held within the kernel.

This allows the kerberos cache to be retained beyond the life of all a user's
processes so that the user's cron jobs can work.

The kerberos cache is envisioned as a keyring/key tree looking something like:

	struct user_namespace
	  \___ .krb_cache keyring		- The register
		\___ _krb.0 keyring		- Root's Kerberos cache
		\___ _krb.5000 keyring		- User 5000's Kerberos cache
		\___ _krb.5001 keyring		- User 5001's Kerberos cache
			\___ tkt785 big_key	- A ccache blob
			\___ tkt12345 big_key	- Another ccache blob

Or possibly:

	struct user_namespace
	  \___ .krb_cache keyring		- The register
		\___ _krb.0 keyring		- Root's Kerberos cache
		\___ _krb.5000 keyring		- User 5000's Kerberos cache
		\___ _krb.5001 keyring		- User 5001's Kerberos cache
			\___ tkt785 keyring	- A ccache
				\___ krbtgt/REDHAT.COM@REDHAT.COM big_key
				\___ http/REDHAT.COM@REDHAT.COM user
				\___ afs/REDHAT.COM@REDHAT.COM user
				\___ nfs/REDHAT.COM@REDHAT.COM user
				\___ krbtgt/KERNEL.ORG@KERNEL.ORG big_key
				\___ http/KERNEL.ORG@KERNEL.ORG big_key

What goes into a particular Kerberos cache is entirely up to userspace.  Kernel
support is limited to giving you the Kerberos cache keyring that you want.

The user asks for their Kerberos cache by:

	krb_cache = keyctl_get_krbcache(uid, dest_keyring);

The uid is -1 or the user's own UID for the user's own cache or the uid of some
other user's cache (requires CAP_SETUID).  This permits rpc.gssd or whatever to
mess with the cache.

The cache returned is a keyring named "_krb.<uid>" that the possessor can read,
search, clear, invalidate, unlink from and add links to.  Active LSMs get a
chance to rule on whether the caller is permitted to make a link.

Each uid's cache keyring is created when it first accessed and is given a
timeout that is extended each time this function is called so that the keyring
goes away after a while.  The timeout is configurable by sysctl but defaults to
three days.

Each user_namespace struct gets a lazily-created keyring that serves as the
register.  The cache keyrings are added to it.  This means that standard key
search and garbage collection facilities are available.

The user_namespace struct's register goes away when it does and anything left
in it is then automatically gc'd.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Simo Sorce <simo@redhat.com>
cc: Serge E. Hallyn <serge.hallyn@ubuntu.com>
cc: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/user_namespace.h |   6 ++
 include/uapi/linux/keyctl.h    |   1 +
 kernel/user.c                  |   4 +
 kernel/user_namespace.c        |   6 ++
 security/keys/Kconfig          |  17 +++++
 security/keys/Makefile         |   1 +
 security/keys/compat.c         |   3 +
 security/keys/internal.h       |   9 +++
 security/keys/keyctl.c         |   3 +
 security/keys/persistent.c     | 169 +++++++++++++++++++++++++++++++++++++++++
 security/keys/sysctl.c         |  11 +++
 11 files changed, 230 insertions(+)
 create mode 100644 security/keys/persistent.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 4db29859464f..4836ba3c1cd8 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -27,6 +27,12 @@ struct user_namespace {
 	kuid_t			owner;
 	kgid_t			group;
 	unsigned int		proc_inum;
+
+	/* Register of per-UID persistent keyrings for this namespace */
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+	struct key		*persistent_keyring_register;
+	struct rw_semaphore	persistent_keyring_register_sem;
+#endif
 };
 
 extern struct user_namespace init_user_ns;
diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h
index c9b7f4faf97a..840cb990abe2 100644
--- a/include/uapi/linux/keyctl.h
+++ b/include/uapi/linux/keyctl.h
@@ -56,5 +56,6 @@
 #define KEYCTL_REJECT			19	/* reject a partially constructed key */
 #define KEYCTL_INSTANTIATE_IOV		20	/* instantiate a partially constructed key */
 #define KEYCTL_INVALIDATE		21	/* invalidate a key */
+#define KEYCTL_GET_PERSISTENT		22	/* get a user's persistent keyring */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/kernel/user.c b/kernel/user.c
index 5bbb91988e69..a3a0dbfda329 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,6 +51,10 @@ struct user_namespace init_user_ns = {
 	.owner = GLOBAL_ROOT_UID,
 	.group = GLOBAL_ROOT_GID,
 	.proc_inum = PROC_USER_INIT_INO,
+#ifdef CONFIG_KEYS_KERBEROS_CACHE
+	.krb_cache_register_sem =
+	__RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem),
+#endif
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 13fb1134ba58..240fb62cf394 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -101,6 +101,9 @@ int create_user_ns(struct cred *new)
 
 	set_cred_user_ns(new, ns);
 
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+	init_rwsem(&ns->persistent_keyring_register_sem);
+#endif
 	return 0;
 }
 
@@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns)
 
 	do {
 		parent = ns->parent;
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+		key_put(ns->persistent_keyring_register);
+#endif
 		proc_free_inum(ns->proc_inum);
 		kmem_cache_free(user_ns_cachep, ns);
 		ns = parent;
diff --git a/security/keys/Kconfig b/security/keys/Kconfig
index b56362275ec8..53d8748c9564 100644
--- a/security/keys/Kconfig
+++ b/security/keys/Kconfig
@@ -20,6 +20,23 @@ config KEYS
 
 	  If you are unsure as to whether this is required, answer N.
 
+config PERSISTENT_KEYRINGS
+	bool "Enable register of persistent per-UID keyrings"
+	depends on KEYS
+	help
+	  This option provides a register of persistent per-UID keyrings,
+	  primarily aimed at Kerberos key storage.  The keyrings are persistent
+	  in the sense that they stay around after all processes of that UID
+	  have exited, not that they survive the machine being rebooted.
+
+	  A particular keyring may be accessed by either the user whose keyring
+	  it is or by a process with administrative privileges.  The active
+	  LSMs gets to rule on which admin-level processes get to access the
+	  cache.
+
+	  Keyrings are created and added into the register upon demand and get
+	  removed if they expire (a default timeout is set upon creation).
+
 config BIG_KEYS
 	tristate "Large payload keys"
 	depends on KEYS
diff --git a/security/keys/Makefile b/security/keys/Makefile
index c487c77a00be..dfb3a7bededf 100644
--- a/security/keys/Makefile
+++ b/security/keys/Makefile
@@ -18,6 +18,7 @@ obj-y := \
 obj-$(CONFIG_KEYS_COMPAT) += compat.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_SYSCTL) += sysctl.o
+obj-$(CONFIG_PERSISTENT_KEYRINGS) += persistent.o
 
 #
 # Key types
diff --git a/security/keys/compat.c b/security/keys/compat.c
index d65fa7fa29ba..bbd32c729dbb 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -138,6 +138,9 @@ asmlinkage long compat_sys_keyctl(u32 option,
 	case KEYCTL_INVALIDATE:
 		return keyctl_invalidate_key(arg2);
 
+	case KEYCTL_GET_PERSISTENT:
+		return keyctl_get_persistent(arg2, arg3);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 581c6f688352..80b2aac4f50c 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -255,6 +255,15 @@ extern long keyctl_invalidate_key(key_serial_t);
 extern long keyctl_instantiate_key_common(key_serial_t,
 					  const struct iovec *,
 					  unsigned, size_t, key_serial_t);
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+extern long keyctl_get_persistent(uid_t, key_serial_t);
+extern unsigned persistent_keyring_expiry;
+#else
+static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring)
+{
+	return -EOPNOTSUPP;
+}
+#endif
 
 /*
  * Debugging key validation
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 33cfd27b4de2..cee72ce64222 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1667,6 +1667,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case KEYCTL_INVALIDATE:
 		return keyctl_invalidate_key((key_serial_t) arg2);
 
+	case KEYCTL_GET_PERSISTENT:
+		return keyctl_get_persistent((uid_t)arg2, (key_serial_t)arg3);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/persistent.c b/security/keys/persistent.c
new file mode 100644
index 000000000000..82f4957a7acf
--- /dev/null
+++ b/security/keys/persistent.c
@@ -0,0 +1,169 @@
+/* General persistent per-UID keyrings register
+ *
+ * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/user_namespace.h>
+#include "internal.h"
+
+unsigned persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */
+
+/*
+ * Create the persistent keyring register for the current user namespace.
+ *
+ * Called with the namespace's sem locked for writing.
+ */
+static int key_create_persistent_register(struct user_namespace *ns)
+{
+	struct key *reg = keyring_alloc(".persistent_register",
+					KUIDT_INIT(0), KGIDT_INIT(0),
+					current_cred(),
+					((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+					 KEY_USR_VIEW | KEY_USR_READ),
+					KEY_ALLOC_NOT_IN_QUOTA, NULL);
+	if (IS_ERR(reg))
+		return PTR_ERR(reg);
+
+	ns->persistent_keyring_register = reg;
+	return 0;
+}
+
+/*
+ * Create the persistent keyring for the specified user.
+ *
+ * Called with the namespace's sem locked for writing.
+ */
+static key_ref_t key_create_persistent(struct user_namespace *ns, kuid_t uid,
+				       struct keyring_index_key *index_key)
+{
+	struct key *persistent;
+	key_ref_t reg_ref, persistent_ref;
+
+	if (!ns->persistent_keyring_register) {
+		long err = key_create_persistent_register(ns);
+		if (err < 0)
+			return ERR_PTR(err);
+	} else {
+		reg_ref = make_key_ref(ns->persistent_keyring_register, true);
+		persistent_ref = find_key_to_update(reg_ref, index_key);
+		if (persistent_ref)
+			return persistent_ref;
+	}
+
+	persistent = keyring_alloc(index_key->description,
+				   uid, INVALID_GID, current_cred(),
+				   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				    KEY_USR_VIEW | KEY_USR_READ),
+				   KEY_ALLOC_NOT_IN_QUOTA,
+				   ns->persistent_keyring_register);
+	if (IS_ERR(persistent))
+		return ERR_CAST(persistent);
+
+	return make_key_ref(persistent, true);
+}
+
+/*
+ * Get the persistent keyring for a specific UID and link it to the nominated
+ * keyring.
+ */
+static long key_get_persistent(struct user_namespace *ns, kuid_t uid,
+			       key_ref_t dest_ref)
+{
+	struct keyring_index_key index_key;
+	struct key *persistent;
+	key_ref_t reg_ref, persistent_ref;
+	char buf[32];
+	long ret;
+
+	/* Look in the register if it exists */
+	index_key.type = &key_type_keyring;
+	index_key.description = buf;
+	index_key.desc_len = sprintf(buf, "_persistent.%u", from_kuid(ns, uid));
+
+	if (ns->persistent_keyring_register) {
+		reg_ref = make_key_ref(ns->persistent_keyring_register, true);
+		down_read(&ns->persistent_keyring_register_sem);
+		persistent_ref = find_key_to_update(reg_ref, &index_key);
+		up_read(&ns->persistent_keyring_register_sem);
+
+		if (persistent_ref)
+			goto found;
+	}
+
+	/* It wasn't in the register, so we'll need to create it.  We might
+	 * also need to create the register.
+	 */
+	down_write(&ns->persistent_keyring_register_sem);
+	persistent_ref = key_create_persistent(ns, uid, &index_key);
+	up_write(&ns->persistent_keyring_register_sem);
+	if (!IS_ERR(persistent_ref))
+		goto found;
+
+	return PTR_ERR(persistent_ref);
+
+found:
+	ret = key_task_permission(persistent_ref, current_cred(), KEY_LINK);
+	if (ret == 0) {
+		persistent = key_ref_to_ptr(persistent_ref);
+		ret = key_link(key_ref_to_ptr(dest_ref), persistent);
+		if (ret == 0) {
+			key_set_timeout(persistent, persistent_keyring_expiry);
+			ret = persistent->serial;		
+		}
+	}
+
+	key_ref_put(persistent_ref);
+	return ret;
+}
+
+/*
+ * Get the persistent keyring for a specific UID and link it to the nominated
+ * keyring.
+ */
+long keyctl_get_persistent(uid_t _uid, key_serial_t destid)
+{
+	struct user_namespace *ns = current_user_ns();
+	key_ref_t dest_ref;
+	kuid_t uid;
+	long ret;
+
+	/* -1 indicates the current user */
+	if (_uid == (uid_t)-1) {
+		uid = current_uid();
+	} else {
+		uid = make_kuid(ns, _uid);
+		if (!uid_valid(uid))
+			return -EINVAL;
+
+		/* You can only see your own persistent cache if you're not
+		 * sufficiently privileged.
+		 */
+		if (uid_eq(uid, current_uid()) &&
+		    uid_eq(uid, current_suid()) &&
+		    uid_eq(uid, current_euid()) &&
+		    uid_eq(uid, current_fsuid()) &&
+		    !ns_capable(ns, CAP_SETUID))
+			return -EPERM;
+	}
+
+	/* There must be a destination keyring */
+	dest_ref = lookup_user_key(destid, KEY_LOOKUP_CREATE, KEY_WRITE);
+	if (IS_ERR(dest_ref))
+		return PTR_ERR(dest_ref);
+	if (key_ref_to_ptr(dest_ref)->type != &key_type_keyring) {
+		ret = -ENOTDIR;
+		goto out_put_dest;
+	}
+
+	ret = key_get_persistent(ns, uid, dest_ref);
+
+out_put_dest:
+	key_ref_put(dest_ref);
+	return ret;
+}
diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c
index ee32d181764a..8c0af08760c8 100644
--- a/security/keys/sysctl.c
+++ b/security/keys/sysctl.c
@@ -61,5 +61,16 @@ ctl_table key_sysctls[] = {
 		.extra1 = (void *) &zero,
 		.extra2 = (void *) &max,
 	},
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+	{
+		.procname = "persistent_keyring_expiry",
+		.data = &persistent_keyring_expiry,
+		.maxlen = sizeof(unsigned),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = (void *) &zero,
+		.extra2 = (void *) &max,
+	},
+#endif
 	{ }
 };
-- 
cgit v1.2.3


From 72b70b6ec4fa7da86a3ac0aacee699b18d94fc3b Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Wed, 28 Aug 2013 00:39:48 +0200
Subject: NFC: Define secure element IO API and commands

In order to send and receive ISO7816 APDUs to and from NFC embedded
secure elements, we define a specific netlink command.
On a typical SE use case, host applications will send very few APDUs
(Less than 10) per transaction. This is why we decided to go for a
simple netlink API. Defining another NFC socket protocol for such low
traffic would have been overengineered.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/nfc.h    | 5 +++++
 include/uapi/linux/nfc.h | 4 ++++
 2 files changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 5329804ebb70..82fc4e43fc6e 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -53,6 +53,8 @@ struct nfc_dev;
 typedef void (*data_exchange_cb_t)(void *context, struct sk_buff *skb,
 								int err);
 
+typedef void (*se_io_cb_t)(void *context, u8 *apdu, size_t apdu_len, int err);
+
 struct nfc_target;
 
 struct nfc_ops {
@@ -79,6 +81,9 @@ struct nfc_ops {
 	int (*discover_se)(struct nfc_dev *dev);
 	int (*enable_se)(struct nfc_dev *dev, u32 se_idx);
 	int (*disable_se)(struct nfc_dev *dev, u32 se_idx);
+	int (*se_io) (struct nfc_dev *dev, u32 se_idx,
+		      u8 *apdu, size_t apdu_length,
+		      se_io_cb_t cb, void *cb_context);
 };
 
 #define NFC_TARGET_IDX_ANY -1
diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 29bed72a4ac4..6ad6cc03ccd3 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -85,6 +85,7 @@
  *	a specific SE notifies us about the end of a transaction. The parameter
  *	for this event is the application ID (AID).
  * @NFC_CMD_GET_SE: Dump all discovered secure elements from an NFC controller.
+ * @NFC_CMD_SE_IO: Send/Receive APDUs to/from the selected secure element.
  */
 enum nfc_commands {
 	NFC_CMD_UNSPEC,
@@ -114,6 +115,7 @@ enum nfc_commands {
 	NFC_EVENT_SE_CONNECTIVITY,
 	NFC_EVENT_SE_TRANSACTION,
 	NFC_CMD_GET_SE,
+	NFC_CMD_SE_IO,
 /* private: internal use only */
 	__NFC_CMD_AFTER_LAST
 };
@@ -147,6 +149,7 @@ enum nfc_commands {
  * @NFC_ATTR_SE_INDEX: Secure element index
  * @NFC_ATTR_SE_TYPE: Secure element type (UICC or EMBEDDED)
  * @NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS: Firmware download operation status
+ * @NFC_ATTR_APDU: Secure element APDU
  */
 enum nfc_attrs {
 	NFC_ATTR_UNSPEC,
@@ -174,6 +177,7 @@ enum nfc_attrs {
 	NFC_ATTR_SE_TYPE,
 	NFC_ATTR_SE_AID,
 	NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS,
+	NFC_ATTR_SE_APDU,
 /* private: internal use only */
 	__NFC_ATTR_AFTER_LAST
 };
-- 
cgit v1.2.3


From 3a6a9201897c6482573ad07ee880574147761006 Mon Sep 17 00:00:00 2001
From: Sudeep Dutt <sudeep.dutt@intel.com>
Date: Thu, 5 Sep 2013 16:41:55 -0700
Subject: Intel MIC Host Driver, card OS state management.

This patch enables the following features:
a) Boots and shuts down the card via sysfs entries.
b) Allocates and maps a device page for communication with the
   card driver and updates the device page address via scratchpad
   registers.
c) Provides sysfs entries for shutdown status, kernel command line,
   ramdisk and log buffer information.

Co-author: Dasaratharaman Chandramouli <dasaratharaman.chandramouli@intel.com>
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Caz Yokoyama <Caz.Yokoyama@intel.com>
Signed-off-by: Dasaratharaman Chandramouli <dasaratharaman.chandramouli@intel.com>
Signed-off-by: Harshavardhan R Kharche <harshavardhan.r.kharche@intel.com>
Signed-off-by: Nikhil Rao <nikhil.rao@intel.com>
Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com>
Acked-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Reviewed-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-mic.txt | 113 ++++++++
 drivers/misc/mic/common/mic_device.h          |   7 +
 drivers/misc/mic/host/Makefile                |   2 +
 drivers/misc/mic/host/mic_boot.c              | 184 +++++++++++++
 drivers/misc/mic/host/mic_debugfs.c           | 355 +++++++++++++++++++++++++
 drivers/misc/mic/host/mic_device.h            |  60 +++++
 drivers/misc/mic/host/mic_main.c              | 129 ++++++++-
 drivers/misc/mic/host/mic_sysfs.c             | 369 ++++++++++++++++++++++++++
 drivers/misc/mic/host/mic_x100.c              | 251 ++++++++++++++++++
 drivers/misc/mic/host/mic_x100.h              |  12 +
 include/uapi/linux/Kbuild                     |   1 +
 include/uapi/linux/mic_common.h               |  74 ++++++
 12 files changed, 1553 insertions(+), 4 deletions(-)
 create mode 100644 drivers/misc/mic/host/mic_boot.c
 create mode 100644 drivers/misc/mic/host/mic_debugfs.c
 create mode 100644 include/uapi/linux/mic_common.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-mic.txt b/Documentation/ABI/testing/sysfs-class-mic.txt
index 09eb3c6d0237..82cdad3b614a 100644
--- a/Documentation/ABI/testing/sysfs-class-mic.txt
+++ b/Documentation/ABI/testing/sysfs-class-mic.txt
@@ -32,3 +32,116 @@ Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		Provides information about the silicon stepping for an Intel
 		MIC device. For example - "A0" or "B0"
+
+What:		/sys/class/mic/mic(x)/state
+Date:		August 2013
+KernelVersion:	3.11
+Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
+Description:
+		When read, this entry provides the current state of an Intel
+		MIC device in the context of the card OS. Possible values that
+		will be read are:
+		"offline" - The MIC device is ready to boot the card OS.
+		"online" - The MIC device has initiated booting a card OS.
+		"shutting_down" - The card OS is shutting down.
+		"reset_failed" - The MIC device has failed to reset.
+
+		When written, this sysfs entry triggers different state change
+		operations depending upon the current state of the card OS.
+		Acceptable values are:
+		"boot" - Boot the card OS image specified by the combination
+			 of firmware, ramdisk, cmdline and bootmode
+			sysfs entries.
+		"reset" - Initiates device reset.
+		"shutdown" - Initiates card OS shutdown.
+
+What:		/sys/class/mic/mic(x)/shutdown_status
+Date:		August 2013
+KernelVersion:	3.11
+Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
+Description:
+		An Intel MIC device runs a Linux OS during its operation. This
+		OS can shutdown because of various reasons. When read, this
+		entry provides the status on why the card OS was shutdown.
+		Possible values are:
+		"nop" -  shutdown status is not applicable, when the card OS is
+			"online"
+		"crashed" - Shutdown because of a HW or SW crash.
+		"halted" - Shutdown because of a halt command.
+		"poweroff" - Shutdown because of a poweroff command.
+		"restart" - Shutdown because of a restart command.
+
+What:		/sys/class/mic/mic(x)/cmdline
+Date:		August 2013
+KernelVersion:	3.11
+Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
+Description:
+		An Intel MIC device runs a Linux OS during its operation. Before
+		booting this card OS, it is possible to pass kernel command line
+		options to configure various features in it, similar to
+		self-bootable machines. When read, this entry provides
+		information about the current kernel command line options set to
+		boot the card OS. This entry can be written to change the
+		existing kernel command line options. Typically, the user would
+		want to read the current command line options, append new ones
+		or modify existing ones and then write the whole kernel command
+		line back to this entry.
+
+What:		/sys/class/mic/mic(x)/firmware
+Date:		August 2013
+KernelVersion:	3.11
+Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
+Description:
+		When read, this sysfs entry provides the path name under
+		/lib/firmware/ where the firmware image to be booted on the
+		card can be found. The entry can be written to change the
+		firmware image location under /lib/firmware/.
+
+What:		/sys/class/mic/mic(x)/ramdisk
+Date:		August 2013
+KernelVersion:	3.11
+Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
+Description:
+		When read, this sysfs entry provides the path name under
+		/lib/firmware/ where the ramdisk image to be used during card
+		OS boot can be found. The entry can be written to change
+		the ramdisk image location under /lib/firmware/.
+
+What:		/sys/class/mic/mic(x)/bootmode
+Date:		August 2013
+KernelVersion:	3.11
+Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
+Description:
+		When read, this sysfs entry provides the current bootmode for
+		the card. This sysfs entry can be written with the following
+		valid strings:
+		a) linux - Boot a Linux image.
+		b) elf - Boot an elf image for flash updates.
+
+What:		/sys/class/mic/mic(x)/log_buf_addr
+Date:		August 2013
+KernelVersion:	3.11
+Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
+Description:
+		An Intel MIC device runs a Linux OS during its operation. For
+		debugging purpose and early kernel boot messages, the user can
+		access the card OS log buffer via debugfs. When read, this entry
+		provides the kernel virtual address of the buffer where the card
+		OS log buffer can be read. This entry is written by the host
+		configuration daemon to set the log buffer address. The correct
+		log buffer address to be written can be found in the System.map
+		file of the card OS.
+
+What:		/sys/class/mic/mic(x)/log_buf_len
+Date:		August 2013
+KernelVersion:	3.11
+Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
+Description:
+		An Intel MIC device runs a Linux OS during its operation. For
+		debugging purpose and early kernel boot messages, the user can
+		access the card OS log buffer via debugfs. When read, this entry
+		provides the kernel virtual address where the card OS log buffer
+		length can be read. This entry is written by host configuration
+		daemon to set the log buffer length address. The correct log
+		buffer length address to be written can be found in the
+		System.map file of the card OS.
diff --git a/drivers/misc/mic/common/mic_device.h b/drivers/misc/mic/common/mic_device.h
index f02262e1c9d3..6440e9d58d3a 100644
--- a/drivers/misc/mic/common/mic_device.h
+++ b/drivers/misc/mic/common/mic_device.h
@@ -34,4 +34,11 @@ struct mic_mw {
 	resource_size_t len;
 };
 
+/*
+ * Scratch pad register offsets used by the host to communicate
+ * device page DMA address to the card.
+ */
+#define MIC_DPLO_SPAD 14
+#define MIC_DPHI_SPAD 15
+
 #endif
diff --git a/drivers/misc/mic/host/Makefile b/drivers/misc/mic/host/Makefile
index 6ff5550f10ac..a375dd32c3e2 100644
--- a/drivers/misc/mic/host/Makefile
+++ b/drivers/misc/mic/host/Makefile
@@ -8,3 +8,5 @@ mic_host-objs += mic_x100.o
 mic_host-objs += mic_sysfs.o
 mic_host-objs += mic_smpt.o
 mic_host-objs += mic_intr.o
+mic_host-objs += mic_boot.o
+mic_host-objs += mic_debugfs.o
diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c
new file mode 100644
index 000000000000..936fc58084f3
--- /dev/null
+++ b/drivers/misc/mic/host/mic_boot.c
@@ -0,0 +1,184 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Host driver.
+ *
+ */
+#include <linux/delay.h>
+#include <linux/firmware.h>
+#include <linux/interrupt.h>
+
+#include <linux/mic_common.h>
+#include "../common/mic_device.h"
+#include "mic_device.h"
+#include "mic_smpt.h"
+
+/**
+ * mic_reset - Reset the MIC device.
+ * @mdev: pointer to mic_device instance
+ */
+static void mic_reset(struct mic_device *mdev)
+{
+	int i;
+
+#define MIC_RESET_TO (45)
+
+	mdev->ops->reset_fw_ready(mdev);
+	mdev->ops->reset(mdev);
+
+	for (i = 0; i < MIC_RESET_TO; i++) {
+		if (mdev->ops->is_fw_ready(mdev))
+			return;
+		/*
+		 * Resets typically take 10s of seconds to complete.
+		 * Since an MMIO read is required to check if the
+		 * firmware is ready or not, a 1 second delay works nicely.
+		 */
+		msleep(1000);
+	}
+	mic_set_state(mdev, MIC_RESET_FAILED);
+}
+
+/* Initialize the MIC bootparams */
+void mic_bootparam_init(struct mic_device *mdev)
+{
+	struct mic_bootparam *bootparam = mdev->dp;
+
+	bootparam->magic = MIC_MAGIC;
+	bootparam->c2h_shutdown_db = mdev->shutdown_db;
+	bootparam->h2c_shutdown_db = -1;
+	bootparam->h2c_config_db = -1;
+	bootparam->shutdown_status = 0;
+	bootparam->shutdown_card = 0;
+}
+
+/**
+ * mic_start - Start the MIC.
+ * @mdev: pointer to mic_device instance
+ * @buf: buffer containing boot string including firmware/ramdisk path.
+ *
+ * This function prepares an MIC for boot and initiates boot.
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int mic_start(struct mic_device *mdev, const char *buf)
+{
+	int rc;
+	mutex_lock(&mdev->mic_mutex);
+retry:
+	if (MIC_OFFLINE != mdev->state) {
+		rc = -EINVAL;
+		goto unlock_ret;
+	}
+	if (!mdev->ops->is_fw_ready(mdev)) {
+		mic_reset(mdev);
+		/*
+		 * The state will either be MIC_OFFLINE if the reset succeeded
+		 * or MIC_RESET_FAILED if the firmware reset failed.
+		 */
+		goto retry;
+	}
+	rc = mdev->ops->load_mic_fw(mdev, buf);
+	if (rc)
+		goto unlock_ret;
+	mic_smpt_restore(mdev);
+	mic_intr_restore(mdev);
+	mdev->intr_ops->enable_interrupts(mdev);
+	mdev->ops->write_spad(mdev, MIC_DPLO_SPAD, mdev->dp_dma_addr);
+	mdev->ops->write_spad(mdev, MIC_DPHI_SPAD, mdev->dp_dma_addr >> 32);
+	mdev->ops->send_firmware_intr(mdev);
+	mic_set_state(mdev, MIC_ONLINE);
+unlock_ret:
+	mutex_unlock(&mdev->mic_mutex);
+	return rc;
+}
+
+/**
+ * mic_stop - Prepare the MIC for reset and trigger reset.
+ * @mdev: pointer to mic_device instance
+ * @force: force a MIC to reset even if it is already offline.
+ *
+ * RETURNS: None.
+ */
+void mic_stop(struct mic_device *mdev, bool force)
+{
+	mutex_lock(&mdev->mic_mutex);
+	if (MIC_OFFLINE != mdev->state || force) {
+		mic_bootparam_init(mdev);
+		mic_reset(mdev);
+		if (MIC_RESET_FAILED == mdev->state)
+			goto unlock;
+		mic_set_shutdown_status(mdev, MIC_NOP);
+		mic_set_state(mdev, MIC_OFFLINE);
+	}
+unlock:
+	mutex_unlock(&mdev->mic_mutex);
+}
+
+/**
+ * mic_shutdown - Initiate MIC shutdown.
+ * @mdev: pointer to mic_device instance
+ *
+ * RETURNS: None.
+ */
+void mic_shutdown(struct mic_device *mdev)
+{
+	struct mic_bootparam *bootparam = mdev->dp;
+	s8 db = bootparam->h2c_shutdown_db;
+
+	mutex_lock(&mdev->mic_mutex);
+	if (MIC_ONLINE == mdev->state && db != -1) {
+		bootparam->shutdown_card = 1;
+		mdev->ops->send_intr(mdev, db);
+		mic_set_state(mdev, MIC_SHUTTING_DOWN);
+	}
+	mutex_unlock(&mdev->mic_mutex);
+}
+
+/**
+ * mic_shutdown_work - Handle shutdown interrupt from MIC.
+ * @work: The work structure.
+ *
+ * This work is scheduled whenever the host has received a shutdown
+ * interrupt from the MIC.
+ */
+void mic_shutdown_work(struct work_struct *work)
+{
+	struct mic_device *mdev = container_of(work, struct mic_device,
+			shutdown_work);
+	struct mic_bootparam *bootparam = mdev->dp;
+
+	mutex_lock(&mdev->mic_mutex);
+	mic_set_shutdown_status(mdev, bootparam->shutdown_status);
+	bootparam->shutdown_status = 0;
+	if (MIC_SHUTTING_DOWN != mdev->state)
+		mic_set_state(mdev, MIC_SHUTTING_DOWN);
+	mutex_unlock(&mdev->mic_mutex);
+}
+
+/**
+ * mic_reset_trigger_work - Trigger MIC reset.
+ * @work: The work structure.
+ *
+ * This work is scheduled whenever the host wants to reset the MIC.
+ */
+void mic_reset_trigger_work(struct work_struct *work)
+{
+	struct mic_device *mdev = container_of(work, struct mic_device,
+			reset_trigger_work);
+
+	mic_stop(mdev, false);
+}
diff --git a/drivers/misc/mic/host/mic_debugfs.c b/drivers/misc/mic/host/mic_debugfs.c
new file mode 100644
index 000000000000..78541d42aaf9
--- /dev/null
+++ b/drivers/misc/mic/host/mic_debugfs.c
@@ -0,0 +1,355 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Host driver.
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/pci.h>
+#include <linux/seq_file.h>
+
+#include <linux/mic_common.h>
+#include "../common/mic_device.h"
+#include "mic_device.h"
+#include "mic_smpt.h"
+
+/* Debugfs parent dir */
+static struct dentry *mic_dbg;
+
+/**
+ * mic_log_buf_show - Display MIC kernel log buffer.
+ *
+ * log_buf addr/len is read from System.map by user space
+ * and populated in sysfs entries.
+ */
+static int mic_log_buf_show(struct seq_file *s, void *unused)
+{
+	void __iomem *log_buf_va;
+	int __iomem *log_buf_len_va;
+	struct mic_device *mdev = s->private;
+	void *kva;
+	int size;
+	unsigned long aper_offset;
+
+	if (!mdev || !mdev->log_buf_addr || !mdev->log_buf_len)
+		goto done;
+	/*
+	 * Card kernel will never be relocated and any kernel text/data mapping
+	 * can be translated to phys address by subtracting __START_KERNEL_map.
+	 */
+	aper_offset = (unsigned long)mdev->log_buf_len - __START_KERNEL_map;
+	log_buf_len_va = mdev->aper.va + aper_offset;
+	aper_offset = (unsigned long)mdev->log_buf_addr - __START_KERNEL_map;
+	log_buf_va = mdev->aper.va + aper_offset;
+	size = ioread32(log_buf_len_va);
+
+	kva = kmalloc(size, GFP_KERNEL);
+	if (!kva)
+		goto done;
+	mutex_lock(&mdev->mic_mutex);
+	memcpy_fromio(kva, log_buf_va, size);
+	switch (mdev->state) {
+	case MIC_ONLINE:
+		/* Fall through */
+	case MIC_SHUTTING_DOWN:
+		seq_write(s, kva, size);
+		break;
+	default:
+		break;
+	}
+	mutex_unlock(&mdev->mic_mutex);
+	kfree(kva);
+done:
+	return 0;
+}
+
+static int mic_log_buf_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mic_log_buf_show, inode->i_private);
+}
+
+static int mic_log_buf_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations log_buf_ops = {
+	.owner   = THIS_MODULE,
+	.open    = mic_log_buf_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = mic_log_buf_release
+};
+
+static int mic_smpt_show(struct seq_file *s, void *pos)
+{
+	int i;
+	struct mic_device *mdev = s->private;
+	unsigned long flags;
+
+	seq_printf(s, "MIC %-2d |%-10s| %-14s %-10s\n",
+		mdev->id, "SMPT entry", "SW DMA addr", "RefCount");
+	seq_puts(s, "====================================================\n");
+
+	if (mdev->smpt) {
+		struct mic_smpt_info *smpt_info = mdev->smpt;
+		spin_lock_irqsave(&smpt_info->smpt_lock, flags);
+		for (i = 0; i < smpt_info->info.num_reg; i++) {
+			seq_printf(s, "%9s|%-10d| %-#14llx %-10lld\n",
+				" ",  i, smpt_info->entry[i].dma_addr,
+				smpt_info->entry[i].ref_count);
+		}
+		spin_unlock_irqrestore(&smpt_info->smpt_lock, flags);
+	}
+	seq_puts(s, "====================================================\n");
+	return 0;
+}
+
+static int mic_smpt_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mic_smpt_show, inode->i_private);
+}
+
+static int mic_smpt_debug_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations smpt_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = mic_smpt_debug_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = mic_smpt_debug_release
+};
+
+static int mic_soft_reset_show(struct seq_file *s, void *pos)
+{
+	struct mic_device *mdev = s->private;
+
+	mic_stop(mdev, true);
+	return 0;
+}
+
+static int mic_soft_reset_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mic_soft_reset_show, inode->i_private);
+}
+
+static int mic_soft_reset_debug_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations soft_reset_ops = {
+	.owner   = THIS_MODULE,
+	.open    = mic_soft_reset_debug_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = mic_soft_reset_debug_release
+};
+
+static int mic_post_code_show(struct seq_file *s, void *pos)
+{
+	struct mic_device *mdev = s->private;
+	u32 reg = mdev->ops->get_postcode(mdev);
+
+	seq_printf(s, "%c%c", reg & 0xff, (reg >> 8) & 0xff);
+	return 0;
+}
+
+static int mic_post_code_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mic_post_code_show, inode->i_private);
+}
+
+static int mic_post_code_debug_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations post_code_ops = {
+	.owner   = THIS_MODULE,
+	.open    = mic_post_code_debug_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = mic_post_code_debug_release
+};
+
+static int mic_dp_show(struct seq_file *s, void *pos)
+{
+	struct mic_device *mdev = s->private;
+	struct mic_bootparam *bootparam = mdev->dp;
+
+	seq_printf(s, "Bootparam: magic 0x%x\n",
+		bootparam->magic);
+	seq_printf(s, "Bootparam: h2c_shutdown_db %d\n",
+		bootparam->h2c_shutdown_db);
+	seq_printf(s, "Bootparam: h2c_config_db %d\n",
+		bootparam->h2c_config_db);
+	seq_printf(s, "Bootparam: c2h_shutdown_db %d\n",
+		bootparam->c2h_shutdown_db);
+	seq_printf(s, "Bootparam: shutdown_status %d\n",
+		bootparam->shutdown_status);
+	seq_printf(s, "Bootparam: shutdown_card %d\n",
+		bootparam->shutdown_card);
+
+	return 0;
+}
+
+static int mic_dp_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mic_dp_show, inode->i_private);
+}
+
+static int mic_dp_debug_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations dp_ops = {
+	.owner   = THIS_MODULE,
+	.open    = mic_dp_debug_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = mic_dp_debug_release
+};
+
+static int mic_msi_irq_info_show(struct seq_file *s, void *pos)
+{
+	struct mic_device *mdev  = s->private;
+	int reg;
+	int i, j;
+	u16 entry;
+	u16 vector;
+	struct pci_dev *pdev = container_of(mdev->sdev->parent,
+		struct pci_dev, dev);
+
+	if (pci_dev_msi_enabled(pdev)) {
+		for (i = 0; i < mdev->irq_info.num_vectors; i++) {
+			if (pdev->msix_enabled) {
+				entry = mdev->irq_info.msix_entries[i].entry;
+				vector = mdev->irq_info.msix_entries[i].vector;
+			} else {
+				entry = 0;
+				vector = pdev->irq;
+			}
+
+			reg = mdev->intr_ops->read_msi_to_src_map(mdev, entry);
+
+			seq_printf(s, "%s %-10d %s %-10d MXAR[%d]: %08X\n",
+				"IRQ:", vector, "Entry:", entry, i, reg);
+
+			seq_printf(s, "%-10s", "offset:");
+			for (j = (MIC_NUM_OFFSETS - 1); j >= 0; j--)
+				seq_printf(s, "%4d ", j);
+			seq_puts(s, "\n");
+
+
+			seq_printf(s, "%-10s", "count:");
+			for (j = (MIC_NUM_OFFSETS - 1); j >= 0; j--)
+				seq_printf(s, "%4d ",
+				(mdev->irq_info.mic_msi_map[i] & BIT(j)) ?
+					1 : 0);
+			seq_puts(s, "\n\n");
+		}
+	} else {
+		seq_puts(s, "MSI/MSIx interrupts not enabled\n");
+	}
+
+	return 0;
+
+}
+
+static int mic_msi_irq_info_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mic_msi_irq_info_show, inode->i_private);
+}
+
+static int
+mic_msi_irq_info_debug_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations msi_irq_info_ops = {
+	.owner   = THIS_MODULE,
+	.open    = mic_msi_irq_info_debug_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = mic_msi_irq_info_debug_release
+};
+
+/**
+ * mic_create_debug_dir - Initialize MIC debugfs entries.
+ */
+void mic_create_debug_dir(struct mic_device *mdev)
+{
+	if (!mic_dbg)
+		return;
+
+	mdev->dbg_dir = debugfs_create_dir(dev_name(mdev->sdev), mic_dbg);
+	if (!mdev->dbg_dir)
+		return;
+
+	debugfs_create_file("log_buf", 0444, mdev->dbg_dir,
+		mdev, &log_buf_ops);
+
+	debugfs_create_file("smpt", 0444, mdev->dbg_dir,
+		mdev, &smpt_file_ops);
+
+	debugfs_create_file("soft_reset", 0444, mdev->dbg_dir,
+		mdev, &soft_reset_ops);
+
+	debugfs_create_file("post_code", 0444, mdev->dbg_dir,
+		mdev, &post_code_ops);
+
+	debugfs_create_file("dp", 0444, mdev->dbg_dir,
+		mdev, &dp_ops);
+
+	debugfs_create_file("msi_irq_info", 0444, mdev->dbg_dir,
+		mdev, &msi_irq_info_ops);
+}
+
+/**
+ * mic_delete_debug_dir - Uninitialize MIC debugfs entries.
+ */
+void mic_delete_debug_dir(struct mic_device *mdev)
+{
+	if (!mdev->dbg_dir)
+		return;
+
+	debugfs_remove_recursive(mdev->dbg_dir);
+}
+
+/**
+ * mic_init_debugfs - Initialize global debugfs entry.
+ */
+void __init mic_init_debugfs(void)
+{
+	mic_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL);
+	if (!mic_dbg)
+		pr_err("can't create debugfs dir\n");
+}
+
+/**
+ * mic_exit_debugfs - Uninitialize global debugfs entry
+ */
+void mic_exit_debugfs(void)
+{
+	debugfs_remove(mic_dbg);
+}
diff --git a/drivers/misc/mic/host/mic_device.h b/drivers/misc/mic/host/mic_device.h
index dd34b6532a01..50b8b88d70d3 100644
--- a/drivers/misc/mic/host/mic_device.h
+++ b/drivers/misc/mic/host/mic_device.h
@@ -63,6 +63,23 @@ enum mic_stepping {
  * @smpt: MIC SMPT information.
  * @intr_info: H/W specific interrupt information.
  * @irq_info: The OS specific irq information
+ * @dbg_dir: debugfs directory of this MIC device.
+ * @cmdline: Kernel command line.
+ * @firmware: Firmware file name.
+ * @ramdisk: Ramdisk file name.
+ * @bootmode: Boot mode i.e. "linux" or "elf" for flash updates.
+ * @bootaddr: MIC boot address.
+ * @reset_trigger_work: Work for triggering reset requests.
+ * @shutdown_work: Work for handling shutdown interrupts.
+ * @state: MIC state.
+ * @shutdown_status: MIC status reported by card for shutdown/crashes.
+ * @state_sysfs: Sysfs dirent for notifying ring 3 about MIC state changes.
+ * @log_buf_addr: Log buffer address for MIC.
+ * @log_buf_len: Log buffer length address for MIC.
+ * @dp: virtio device page
+ * @dp_dma_addr: virtio device page DMA address.
+ * @shutdown_db: shutdown doorbell.
+ * @shutdown_cookie: shutdown cookie.
  */
 struct mic_device {
 	struct mic_mw mmio;
@@ -79,6 +96,23 @@ struct mic_device {
 	struct mic_smpt_info *smpt;
 	struct mic_intr_info *intr_info;
 	struct mic_irq_info irq_info;
+	struct dentry *dbg_dir;
+	char *cmdline;
+	char *firmware;
+	char *ramdisk;
+	char *bootmode;
+	u32 bootaddr;
+	struct work_struct reset_trigger_work;
+	struct work_struct shutdown_work;
+	u8 state;
+	u8 shutdown_status;
+	struct sysfs_dirent *state_sysfs;
+	void *log_buf_addr;
+	int *log_buf_len;
+	void *dp;
+	dma_addr_t dp_dma_addr;
+	int shutdown_db;
+	struct mic_irq *shutdown_cookie;
 };
 
 /**
@@ -90,6 +124,13 @@ struct mic_device {
  * @send_intr: Send an interrupt for a particular doorbell on the card.
  * @ack_interrupt: Hardware specific operations to ack the h/w on
  * receipt of an interrupt.
+ * @reset: Reset the remote processor.
+ * @reset_fw_ready: Reset firmware ready field.
+ * @is_fw_ready: Check if firmware is ready for OS download.
+ * @send_firmware_intr: Send an interrupt to the card firmware.
+ * @load_mic_fw: Load firmware segments required to boot the card
+ * into card memory. This includes the kernel, command line, ramdisk etc.
+ * @get_postcode: Get post code status from firmware.
  */
 struct mic_hw_ops {
 	u8 aper_bar;
@@ -98,6 +139,12 @@ struct mic_hw_ops {
 	void (*write_spad)(struct mic_device *mdev, unsigned int idx, u32 val);
 	void (*send_intr)(struct mic_device *mdev, int doorbell);
 	u32 (*ack_interrupt)(struct mic_device *mdev);
+	void (*reset)(struct mic_device *mdev);
+	void (*reset_fw_ready)(struct mic_device *mdev);
+	bool (*is_fw_ready)(struct mic_device *mdev);
+	void (*send_firmware_intr)(struct mic_device *mdev);
+	int (*load_mic_fw)(struct mic_device *mdev, const char *buf);
+	u32 (*get_postcode)(struct mic_device *mdev);
 };
 
 /**
@@ -127,4 +174,17 @@ mic_mmio_write(struct mic_mw *mw, u32 val, u32 offset)
 }
 
 void mic_sysfs_init(struct mic_device *mdev);
+int mic_start(struct mic_device *mdev, const char *buf);
+void mic_stop(struct mic_device *mdev, bool force);
+void mic_shutdown(struct mic_device *mdev);
+void mic_reset_delayed_work(struct work_struct *work);
+void mic_reset_trigger_work(struct work_struct *work);
+void mic_shutdown_work(struct work_struct *work);
+void mic_bootparam_init(struct mic_device *mdev);
+void mic_set_state(struct mic_device *mdev, u8 state);
+void mic_set_shutdown_status(struct mic_device *mdev, u8 status);
+void mic_create_debug_dir(struct mic_device *dev);
+void mic_delete_debug_dir(struct mic_device *dev);
+void __init mic_init_debugfs(void);
+void mic_exit_debugfs(void);
 #endif
diff --git a/drivers/misc/mic/host/mic_main.c b/drivers/misc/mic/host/mic_main.c
index 332a15e4215b..998a20ab7e96 100644
--- a/drivers/misc/mic/host/mic_main.c
+++ b/drivers/misc/mic/host/mic_main.c
@@ -26,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 
+#include <linux/mic_common.h>
 #include "../common/mic_device.h"
 #include "mic_device.h"
 #include "mic_x100.h"
@@ -63,6 +64,60 @@ static struct class *g_mic_class;
 /* Base device node number for MIC devices */
 static dev_t g_mic_devno;
 
+/* Initialize the device page */
+static int mic_dp_init(struct mic_device *mdev)
+{
+	mdev->dp = kzalloc(MIC_DP_SIZE, GFP_KERNEL);
+	if (!mdev->dp) {
+		dev_err(mdev->sdev->parent, "%s %d err %d\n",
+			__func__, __LINE__, -ENOMEM);
+		return -ENOMEM;
+	}
+
+	mdev->dp_dma_addr = mic_map_single(mdev,
+		mdev->dp, MIC_DP_SIZE);
+	if (mic_map_error(mdev->dp_dma_addr)) {
+		kfree(mdev->dp);
+		dev_err(mdev->sdev->parent, "%s %d err %d\n",
+			__func__, __LINE__, -ENOMEM);
+		return -ENOMEM;
+	}
+	mdev->ops->write_spad(mdev, MIC_DPLO_SPAD, mdev->dp_dma_addr);
+	mdev->ops->write_spad(mdev, MIC_DPHI_SPAD, mdev->dp_dma_addr >> 32);
+	return 0;
+}
+
+/* Uninitialize the device page */
+static void mic_dp_uninit(struct mic_device *mdev)
+{
+	mic_unmap_single(mdev, mdev->dp_dma_addr, MIC_DP_SIZE);
+	kfree(mdev->dp);
+}
+
+/**
+ * mic_shutdown_db - Shutdown doorbell interrupt handler.
+ */
+static irqreturn_t mic_shutdown_db(int irq, void *data)
+{
+	struct mic_device *mdev = data;
+	struct mic_bootparam *bootparam = mdev->dp;
+
+	mdev->ops->ack_interrupt(mdev);
+
+	switch (bootparam->shutdown_status) {
+	case MIC_HALTED:
+	case MIC_POWER_OFF:
+	case MIC_RESTART:
+		/* Fall through */
+	case MIC_CRASHED:
+		schedule_work(&mdev->shutdown_work);
+		break;
+	default:
+		break;
+	};
+	return IRQ_HANDLED;
+}
+
 /**
  * mic_ops_init: Initialize HW specific operation tables.
  *
@@ -136,6 +191,26 @@ mic_device_init(struct mic_device *mdev, struct pci_dev *pdev)
 	mic_sysfs_init(mdev);
 	mutex_init(&mdev->mic_mutex);
 	mdev->irq_info.next_avail_src = 0;
+	INIT_WORK(&mdev->reset_trigger_work, mic_reset_trigger_work);
+	INIT_WORK(&mdev->shutdown_work, mic_shutdown_work);
+}
+
+/**
+ * mic_device_uninit - Frees resources allocated during mic_device_init(..)
+ *
+ * @mdev: pointer to mic_device instance
+ *
+ * returns none
+ */
+static void mic_device_uninit(struct mic_device *mdev)
+{
+	/* The cmdline sysfs entry might have allocated cmdline */
+	kfree(mdev->cmdline);
+	kfree(mdev->firmware);
+	kfree(mdev->ramdisk);
+	kfree(mdev->bootmode);
+	flush_work(&mdev->reset_trigger_work);
+	flush_work(&mdev->shutdown_work);
 }
 
 /**
@@ -170,7 +245,7 @@ static int mic_probe(struct pci_dev *pdev,
 	rc = pci_enable_device(pdev);
 	if (rc) {
 		dev_err(&pdev->dev, "failed to enable pci device.\n");
-		goto ida_remove;
+		goto uninit_device;
 	}
 
 	pci_set_master(pdev);
@@ -228,7 +303,40 @@ static int mic_probe(struct pci_dev *pdev,
 			"device_create_with_groups failed rc %d\n", rc);
 		goto smpt_uninit;
 	}
+	mdev->state_sysfs = sysfs_get_dirent(mdev->sdev->kobj.sd,
+		NULL, "state");
+	if (!mdev->state_sysfs) {
+		rc = -ENODEV;
+		dev_err(&pdev->dev, "sysfs_get_dirent failed rc %d\n", rc);
+		goto destroy_device;
+	}
+
+	rc = mic_dp_init(mdev);
+	if (rc) {
+		dev_err(&pdev->dev, "mic_dp_init failed rc %d\n", rc);
+		goto sysfs_put;
+	}
+	mutex_lock(&mdev->mic_mutex);
+
+	mdev->shutdown_db = mic_next_db(mdev);
+	mdev->shutdown_cookie = mic_request_irq(mdev, mic_shutdown_db,
+		"shutdown-interrupt", mdev, mdev->shutdown_db, MIC_INTR_DB);
+	if (IS_ERR(mdev->shutdown_cookie)) {
+		rc = PTR_ERR(mdev->shutdown_cookie);
+		mutex_unlock(&mdev->mic_mutex);
+		goto dp_uninit;
+	}
+	mutex_unlock(&mdev->mic_mutex);
+	mic_bootparam_init(mdev);
+
+	mic_create_debug_dir(mdev);
 	return 0;
+dp_uninit:
+	mic_dp_uninit(mdev);
+sysfs_put:
+	sysfs_put(mdev->state_sysfs);
+destroy_device:
+	device_destroy(g_mic_class, MKDEV(MAJOR(g_mic_devno), mdev->id));
 smpt_uninit:
 	mic_smpt_uninit(mdev);
 free_interrupts:
@@ -241,7 +349,8 @@ release_regions:
 	pci_release_regions(pdev);
 disable_device:
 	pci_disable_device(pdev);
-ida_remove:
+uninit_device:
+	mic_device_uninit(mdev);
 	ida_simple_remove(&g_mic_ida, mdev->id);
 ida_fail:
 	kfree(mdev);
@@ -265,11 +374,20 @@ static void mic_remove(struct pci_dev *pdev)
 	if (!mdev)
 		return;
 
+	mic_stop(mdev, false);
+	mic_delete_debug_dir(mdev);
+	mutex_lock(&mdev->mic_mutex);
+	mic_free_irq(mdev, mdev->shutdown_cookie, mdev);
+	mutex_unlock(&mdev->mic_mutex);
+	flush_work(&mdev->shutdown_work);
+	mic_dp_uninit(mdev);
+	sysfs_put(mdev->state_sysfs);
 	device_destroy(g_mic_class, MKDEV(MAJOR(g_mic_devno), mdev->id));
 	mic_smpt_uninit(mdev);
 	mic_free_interrupts(mdev, pdev);
 	iounmap(mdev->mmio.va);
 	iounmap(mdev->aper.va);
+	mic_device_uninit(mdev);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
 	ida_simple_remove(&g_mic_ida, mdev->id);
@@ -300,14 +418,16 @@ static int __init mic_init(void)
 		goto cleanup_chrdev;
 	}
 
+	mic_init_debugfs();
 	ida_init(&g_mic_ida);
 	ret = pci_register_driver(&mic_driver);
 	if (ret) {
 		pr_err("pci_register_driver failed ret %d\n", ret);
-		goto class_destroy;
+		goto cleanup_debugfs;
 	}
 	return ret;
-class_destroy:
+cleanup_debugfs:
+	mic_exit_debugfs();
 	class_destroy(g_mic_class);
 cleanup_chrdev:
 	unregister_chrdev_region(g_mic_devno, MIC_MAX_NUM_DEVS);
@@ -319,6 +439,7 @@ static void __exit mic_exit(void)
 {
 	pci_unregister_driver(&mic_driver);
 	ida_destroy(&g_mic_ida);
+	mic_exit_debugfs();
 	class_destroy(g_mic_class);
 	unregister_chrdev_region(g_mic_devno, MIC_MAX_NUM_DEVS);
 }
diff --git a/drivers/misc/mic/host/mic_sysfs.c b/drivers/misc/mic/host/mic_sysfs.c
index 972c18255263..aaf849945111 100644
--- a/drivers/misc/mic/host/mic_sysfs.c
+++ b/drivers/misc/mic/host/mic_sysfs.c
@@ -20,9 +20,50 @@
  */
 #include <linux/pci.h>
 
+#include <linux/mic_common.h>
 #include "../common/mic_device.h"
 #include "mic_device.h"
 
+/*
+ * A state-to-string lookup table, for exposing a human readable state
+ * via sysfs. Always keep in sync with enum mic_states
+ */
+static const char * const mic_state_string[] = {
+	[MIC_OFFLINE] = "offline",
+	[MIC_ONLINE] = "online",
+	[MIC_SHUTTING_DOWN] = "shutting_down",
+	[MIC_RESET_FAILED] = "reset_failed",
+};
+
+/*
+ * A shutdown-status-to-string lookup table, for exposing a human
+ * readable state via sysfs. Always keep in sync with enum mic_shutdown_status
+ */
+static const char * const mic_shutdown_status_string[] = {
+	[MIC_NOP] = "nop",
+	[MIC_CRASHED] = "crashed",
+	[MIC_HALTED] = "halted",
+	[MIC_POWER_OFF] = "poweroff",
+	[MIC_RESTART] = "restart",
+};
+
+void mic_set_shutdown_status(struct mic_device *mdev, u8 shutdown_status)
+{
+	dev_dbg(mdev->sdev->parent, "Shutdown Status %s -> %s\n",
+		mic_shutdown_status_string[mdev->shutdown_status],
+		mic_shutdown_status_string[shutdown_status]);
+	mdev->shutdown_status = shutdown_status;
+}
+
+void mic_set_state(struct mic_device *mdev, u8 state)
+{
+	dev_dbg(mdev->sdev->parent, "State %s -> %s\n",
+		mic_state_string[mdev->state],
+		mic_state_string[state]);
+	mdev->state = state;
+	sysfs_notify_dirent(mdev->state_sysfs);
+}
+
 static ssize_t
 mic_show_family(struct device *dev, struct device_attribute *attr, char *buf)
 {
@@ -75,9 +116,337 @@ mic_show_stepping(struct device *dev, struct device_attribute *attr, char *buf)
 }
 static DEVICE_ATTR(stepping, S_IRUGO, mic_show_stepping, NULL);
 
+static ssize_t
+mic_show_state(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+
+	if (!mdev || mdev->state >= MIC_LAST)
+		return -EINVAL;
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n",
+		mic_state_string[mdev->state]);
+}
+
+static ssize_t
+mic_store_state(struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	int rc = 0;
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+	if (!mdev)
+		return -EINVAL;
+	if (sysfs_streq(buf, "boot")) {
+		rc = mic_start(mdev, buf);
+		if (rc) {
+			dev_err(mdev->sdev->parent,
+				"mic_boot failed rc %d\n", rc);
+			count = rc;
+		}
+		goto done;
+	}
+
+	if (sysfs_streq(buf, "reset")) {
+		schedule_work(&mdev->reset_trigger_work);
+		goto done;
+	}
+
+	if (sysfs_streq(buf, "shutdown")) {
+		mic_shutdown(mdev);
+		goto done;
+	}
+
+	count = -EINVAL;
+done:
+	return count;
+}
+static DEVICE_ATTR(state, S_IRUGO|S_IWUSR, mic_show_state, mic_store_state);
+
+static ssize_t mic_show_shutdown_status(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+
+	if (!mdev || mdev->shutdown_status >= MIC_STATUS_LAST)
+		return -EINVAL;
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n",
+		mic_shutdown_status_string[mdev->shutdown_status]);
+}
+static DEVICE_ATTR(shutdown_status, S_IRUGO|S_IWUSR,
+	mic_show_shutdown_status, NULL);
+
+static ssize_t
+mic_show_cmdline(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+	char *cmdline;
+
+	if (!mdev)
+		return -EINVAL;
+
+	cmdline = mdev->cmdline;
+
+	if (cmdline)
+		return scnprintf(buf, PAGE_SIZE, "%s\n", cmdline);
+	return 0;
+}
+
+static ssize_t
+mic_store_cmdline(struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+
+	if (!mdev)
+		return -EINVAL;
+
+	mutex_lock(&mdev->mic_mutex);
+	kfree(mdev->cmdline);
+
+	mdev->cmdline = kmalloc(count + 1, GFP_KERNEL);
+	if (!mdev->cmdline) {
+		count = -ENOMEM;
+		goto unlock;
+	}
+
+	strncpy(mdev->cmdline, buf, count);
+
+	if (mdev->cmdline[count - 1] == '\n')
+		mdev->cmdline[count - 1] = '\0';
+	else
+		mdev->cmdline[count] = '\0';
+unlock:
+	mutex_unlock(&mdev->mic_mutex);
+	return count;
+}
+static DEVICE_ATTR(cmdline, S_IRUGO | S_IWUSR,
+	mic_show_cmdline, mic_store_cmdline);
+
+static ssize_t
+mic_show_firmware(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+	char *firmware;
+
+	if (!mdev)
+		return -EINVAL;
+
+	firmware = mdev->firmware;
+
+	if (firmware)
+		return scnprintf(buf, PAGE_SIZE, "%s\n", firmware);
+	return 0;
+}
+
+static ssize_t
+mic_store_firmware(struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+
+	if (!mdev)
+		return -EINVAL;
+
+	mutex_lock(&mdev->mic_mutex);
+	kfree(mdev->firmware);
+
+	mdev->firmware = kmalloc(count + 1, GFP_KERNEL);
+	if (!mdev->firmware) {
+		count = -ENOMEM;
+		goto unlock;
+	}
+	strncpy(mdev->firmware, buf, count);
+
+	if (mdev->firmware[count - 1] == '\n')
+		mdev->firmware[count - 1] = '\0';
+	else
+		mdev->firmware[count] = '\0';
+unlock:
+	mutex_unlock(&mdev->mic_mutex);
+	return count;
+}
+static DEVICE_ATTR(firmware, S_IRUGO | S_IWUSR,
+	mic_show_firmware, mic_store_firmware);
+
+static ssize_t
+mic_show_ramdisk(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+	char *ramdisk;
+
+	if (!mdev)
+		return -EINVAL;
+
+	ramdisk = mdev->ramdisk;
+
+	if (ramdisk)
+		return scnprintf(buf, PAGE_SIZE, "%s\n", ramdisk);
+	return 0;
+}
+
+static ssize_t
+mic_store_ramdisk(struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+
+	if (!mdev)
+		return -EINVAL;
+
+	mutex_lock(&mdev->mic_mutex);
+	kfree(mdev->ramdisk);
+
+	mdev->ramdisk = kmalloc(count + 1, GFP_KERNEL);
+	if (!mdev->ramdisk) {
+		count = -ENOMEM;
+		goto unlock;
+	}
+
+	strncpy(mdev->ramdisk, buf, count);
+
+	if (mdev->ramdisk[count - 1] == '\n')
+		mdev->ramdisk[count - 1] = '\0';
+	else
+		mdev->ramdisk[count] = '\0';
+unlock:
+	mutex_unlock(&mdev->mic_mutex);
+	return count;
+}
+static DEVICE_ATTR(ramdisk, S_IRUGO | S_IWUSR,
+	mic_show_ramdisk, mic_store_ramdisk);
+
+static ssize_t
+mic_show_bootmode(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+	char *bootmode;
+
+	if (!mdev)
+		return -EINVAL;
+
+	bootmode = mdev->bootmode;
+
+	if (bootmode)
+		return scnprintf(buf, PAGE_SIZE, "%s\n", bootmode);
+	return 0;
+}
+
+static ssize_t
+mic_store_bootmode(struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+
+	if (!mdev)
+		return -EINVAL;
+
+	if (!sysfs_streq(buf, "linux") && !sysfs_streq(buf, "elf"))
+		return -EINVAL;
+
+	mutex_lock(&mdev->mic_mutex);
+	kfree(mdev->bootmode);
+
+	mdev->bootmode = kmalloc(count + 1, GFP_KERNEL);
+	if (!mdev->bootmode) {
+		count = -ENOMEM;
+		goto unlock;
+	}
+
+	strncpy(mdev->bootmode, buf, count);
+
+	if (mdev->bootmode[count - 1] == '\n')
+		mdev->bootmode[count - 1] = '\0';
+	else
+		mdev->bootmode[count] = '\0';
+unlock:
+	mutex_unlock(&mdev->mic_mutex);
+	return count;
+}
+static DEVICE_ATTR(bootmode, S_IRUGO | S_IWUSR,
+	mic_show_bootmode, mic_store_bootmode);
+
+static ssize_t
+mic_show_log_buf_addr(struct device *dev, struct device_attribute *attr,
+	char *buf)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+
+	if (!mdev)
+		return -EINVAL;
+
+	return scnprintf(buf, PAGE_SIZE, "%p\n", mdev->log_buf_addr);
+}
+
+static ssize_t
+mic_store_log_buf_addr(struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+	int ret;
+	unsigned long addr;
+
+	if (!mdev)
+		return -EINVAL;
+
+	ret = kstrtoul(buf, 16, &addr);
+	if (ret)
+		goto exit;
+
+	mdev->log_buf_addr = (void *)addr;
+	ret = count;
+exit:
+	return ret;
+}
+static DEVICE_ATTR(log_buf_addr, S_IRUGO | S_IWUSR,
+	mic_show_log_buf_addr, mic_store_log_buf_addr);
+
+static ssize_t
+mic_show_log_buf_len(struct device *dev, struct device_attribute *attr,
+	char *buf)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+
+	if (!mdev)
+		return -EINVAL;
+
+	return scnprintf(buf, PAGE_SIZE, "%p\n", mdev->log_buf_len);
+}
+
+static ssize_t
+mic_store_log_buf_len(struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	struct mic_device *mdev = dev_get_drvdata(dev->parent);
+	int ret;
+	unsigned long addr;
+
+	if (!mdev)
+		return -EINVAL;
+
+	ret = kstrtoul(buf, 16, &addr);
+	if (ret)
+		goto exit;
+
+	mdev->log_buf_len = (int *)addr;
+	ret = count;
+exit:
+	return ret;
+}
+static DEVICE_ATTR(log_buf_len, S_IRUGO | S_IWUSR,
+	mic_show_log_buf_len, mic_store_log_buf_len);
+
 static struct attribute *mic_default_attrs[] = {
 	&dev_attr_family.attr,
 	&dev_attr_stepping.attr,
+	&dev_attr_state.attr,
+	&dev_attr_shutdown_status.attr,
+	&dev_attr_cmdline.attr,
+	&dev_attr_firmware.attr,
+	&dev_attr_ramdisk.attr,
+	&dev_attr_bootmode.attr,
+	&dev_attr_log_buf_addr.attr,
+	&dev_attr_log_buf_len.attr,
 
 	NULL
 };
diff --git a/drivers/misc/mic/host/mic_x100.c b/drivers/misc/mic/host/mic_x100.c
index b63731691c73..a12ae5c8844d 100644
--- a/drivers/misc/mic/host/mic_x100.c
+++ b/drivers/misc/mic/host/mic_x100.c
@@ -20,6 +20,9 @@
  */
 #include <linux/fs.h>
 #include <linux/pci.h>
+#include <linux/sched.h>
+#include <linux/firmware.h>
+#include <linux/delay.h>
 
 #include "../common/mic_device.h"
 #include "mic_device.h"
@@ -256,6 +259,248 @@ mic_x100_program_msi_to_src_map(struct mic_device *mdev,
 	mic_mmio_write(mw, reg, mxar);
 }
 
+/*
+ * mic_x100_reset_fw_ready - Reset Firmware ready status field.
+ * @mdev: pointer to mic_device instance
+ */
+static void mic_x100_reset_fw_ready(struct mic_device *mdev)
+{
+	mdev->ops->write_spad(mdev, MIC_X100_DOWNLOAD_INFO, 0);
+}
+
+/*
+ * mic_x100_is_fw_ready - Check if firmware is ready.
+ * @mdev: pointer to mic_device instance
+ */
+static bool mic_x100_is_fw_ready(struct mic_device *mdev)
+{
+	u32 scratch2 = mdev->ops->read_spad(mdev, MIC_X100_DOWNLOAD_INFO);
+	return MIC_X100_SPAD2_DOWNLOAD_STATUS(scratch2) ? true : false;
+}
+
+/**
+ * mic_x100_get_apic_id - Get bootstrap APIC ID.
+ * @mdev: pointer to mic_device instance
+ */
+static u32 mic_x100_get_apic_id(struct mic_device *mdev)
+{
+	u32 scratch2 = 0;
+
+	scratch2 = mdev->ops->read_spad(mdev, MIC_X100_DOWNLOAD_INFO);
+	return MIC_X100_SPAD2_APIC_ID(scratch2);
+}
+
+/**
+ * mic_x100_send_firmware_intr - Send an interrupt to the firmware on MIC.
+ * @mdev: pointer to mic_device instance
+ */
+static void mic_x100_send_firmware_intr(struct mic_device *mdev)
+{
+	u32 apicicr_low;
+	u64 apic_icr_offset = MIC_X100_SBOX_APICICR7;
+	int vector = MIC_X100_BSP_INTERRUPT_VECTOR;
+	struct mic_mw *mw = &mdev->mmio;
+
+	/*
+	 * For MIC we need to make sure we "hit"
+	 * the send_icr bit (13).
+	 */
+	apicicr_low = (vector | (1 << 13));
+
+	mic_mmio_write(mw, mic_x100_get_apic_id(mdev),
+		MIC_X100_SBOX_BASE_ADDRESS + apic_icr_offset + 4);
+
+	/* Ensure that the interrupt is ordered w.r.t. previous stores. */
+	wmb();
+	mic_mmio_write(mw, apicicr_low,
+		MIC_X100_SBOX_BASE_ADDRESS + apic_icr_offset);
+}
+
+/**
+ * mic_x100_hw_reset - Reset the MIC device.
+ * @mdev: pointer to mic_device instance
+ */
+static void mic_x100_hw_reset(struct mic_device *mdev)
+{
+	u32 reset_reg;
+	u32 rgcr = MIC_X100_SBOX_BASE_ADDRESS + MIC_X100_SBOX_RGCR;
+	struct mic_mw *mw = &mdev->mmio;
+
+	/* Ensure that the reset is ordered w.r.t. previous loads and stores */
+	mb();
+	/* Trigger reset */
+	reset_reg = mic_mmio_read(mw, rgcr);
+	reset_reg |= 0x1;
+	mic_mmio_write(mw, reset_reg, rgcr);
+	/*
+	 * It seems we really want to delay at least 1 second
+	 * after touching reset to prevent a lot of problems.
+	 */
+	msleep(1000);
+}
+
+/**
+ * mic_x100_load_command_line - Load command line to MIC.
+ * @mdev: pointer to mic_device instance
+ * @fw: the firmware image
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+static int
+mic_x100_load_command_line(struct mic_device *mdev, const struct firmware *fw)
+{
+	u32 len = 0;
+	u32 boot_mem;
+	char *buf;
+	void __iomem *cmd_line_va = mdev->aper.va + mdev->bootaddr + fw->size;
+#define CMDLINE_SIZE 2048
+
+	boot_mem = mdev->aper.len >> 20;
+	buf = kzalloc(CMDLINE_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(mdev->sdev->parent,
+			"%s %d allocation failed\n", __func__, __LINE__);
+		return -ENOMEM;
+	}
+	len += snprintf(buf, CMDLINE_SIZE - len,
+		" mem=%dM", boot_mem);
+	if (mdev->cmdline)
+		snprintf(buf + len, CMDLINE_SIZE - len,
+				" %s", mdev->cmdline);
+	memcpy_toio(cmd_line_va, buf, strlen(buf) + 1);
+	kfree(buf);
+	return 0;
+}
+
+/**
+ * mic_x100_load_ramdisk - Load ramdisk to MIC.
+ * @mdev: pointer to mic_device instance
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+static int
+mic_x100_load_ramdisk(struct mic_device *mdev)
+{
+	const struct firmware *fw;
+	int rc;
+	struct boot_params __iomem *bp = mdev->aper.va + mdev->bootaddr;
+
+	rc = request_firmware(&fw,
+			mdev->ramdisk, mdev->sdev->parent);
+	if (rc < 0) {
+		dev_err(mdev->sdev->parent,
+			"ramdisk request_firmware failed: %d %s\n",
+			rc, mdev->ramdisk);
+		goto error;
+	}
+	/*
+	 * Typically the bootaddr for card OS is 64M
+	 * so copy over the ramdisk @ 128M.
+	 */
+	memcpy_toio(mdev->aper.va + (mdev->bootaddr << 1),
+		fw->data, fw->size);
+	iowrite32(cpu_to_le32(mdev->bootaddr << 1), &bp->hdr.ramdisk_image);
+	iowrite32(cpu_to_le32(fw->size), &bp->hdr.ramdisk_size);
+	release_firmware(fw);
+error:
+	return rc;
+}
+
+/**
+ * mic_x100_get_boot_addr - Get MIC boot address.
+ * @mdev: pointer to mic_device instance
+ *
+ * This function is called during firmware load to determine
+ * the address at which the OS should be downloaded in card
+ * memory i.e. GDDR.
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+static int
+mic_x100_get_boot_addr(struct mic_device *mdev)
+{
+	u32 scratch2, boot_addr;
+	int rc = 0;
+
+	scratch2 = mdev->ops->read_spad(mdev, MIC_X100_DOWNLOAD_INFO);
+	boot_addr = MIC_X100_SPAD2_DOWNLOAD_ADDR(scratch2);
+	dev_dbg(mdev->sdev->parent, "%s %d boot_addr 0x%x\n",
+		__func__, __LINE__, boot_addr);
+	if (boot_addr > (1 << 31)) {
+		dev_err(mdev->sdev->parent,
+			"incorrect bootaddr 0x%x\n",
+			boot_addr);
+		rc = -EINVAL;
+		goto error;
+	}
+	mdev->bootaddr = boot_addr;
+error:
+	return rc;
+}
+
+/**
+ * mic_x100_load_firmware - Load firmware to MIC.
+ * @mdev: pointer to mic_device instance
+ * @buf: buffer containing boot string including firmware/ramdisk path.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+static int
+mic_x100_load_firmware(struct mic_device *mdev, const char *buf)
+{
+	int rc;
+	const struct firmware *fw;
+
+	rc = mic_x100_get_boot_addr(mdev);
+	if (rc)
+		goto error;
+	/* load OS */
+	rc = request_firmware(&fw, mdev->firmware, mdev->sdev->parent);
+	if (rc < 0) {
+		dev_err(mdev->sdev->parent,
+			"ramdisk request_firmware failed: %d %s\n",
+			rc, mdev->firmware);
+		goto error;
+	}
+	if (mdev->bootaddr > mdev->aper.len - fw->size) {
+		rc = -EINVAL;
+		dev_err(mdev->sdev->parent, "%s %d rc %d bootaddr 0x%x\n",
+			__func__, __LINE__, rc, mdev->bootaddr);
+		release_firmware(fw);
+		goto error;
+	}
+	memcpy_toio(mdev->aper.va + mdev->bootaddr, fw->data, fw->size);
+	mdev->ops->write_spad(mdev, MIC_X100_FW_SIZE, fw->size);
+	if (!strcmp(mdev->bootmode, "elf"))
+		goto done;
+	/* load command line */
+	rc = mic_x100_load_command_line(mdev, fw);
+	if (rc) {
+		dev_err(mdev->sdev->parent, "%s %d rc %d\n",
+			__func__, __LINE__, rc);
+		goto error;
+	}
+	release_firmware(fw);
+	/* load ramdisk */
+	if (mdev->ramdisk)
+		rc = mic_x100_load_ramdisk(mdev);
+error:
+	dev_dbg(mdev->sdev->parent, "%s %d rc %d\n",
+			__func__, __LINE__, rc);
+done:
+	return rc;
+}
+
+/**
+ * mic_x100_get_postcode - Get postcode status from firmware.
+ * @mdev: pointer to mic_device instance
+ *
+ * RETURNS: postcode.
+ */
+static u32 mic_x100_get_postcode(struct mic_device *mdev)
+{
+	return mic_mmio_read(&mdev->mmio, MIC_X100_POSTCODE);
+}
+
 /**
  * mic_x100_smpt_set - Update an SMPT entry with a DMA address.
  * @mdev: pointer to mic_device instance
@@ -311,6 +556,12 @@ struct mic_hw_ops mic_x100_ops = {
 	.write_spad = mic_x100_write_spad,
 	.send_intr = mic_x100_send_intr,
 	.ack_interrupt = mic_x100_ack_interrupt,
+	.reset = mic_x100_hw_reset,
+	.reset_fw_ready = mic_x100_reset_fw_ready,
+	.is_fw_ready = mic_x100_is_fw_ready,
+	.send_firmware_intr = mic_x100_send_firmware_intr,
+	.load_mic_fw = mic_x100_load_firmware,
+	.get_postcode = mic_x100_get_postcode,
 };
 
 struct mic_hw_intr_ops mic_x100_intr_ops = {
diff --git a/drivers/misc/mic/host/mic_x100.h b/drivers/misc/mic/host/mic_x100.h
index 642cae9a0041..8b7daa182e54 100644
--- a/drivers/misc/mic/host/mic_x100.h
+++ b/drivers/misc/mic/host/mic_x100.h
@@ -69,6 +69,15 @@
 #define MIC_X100_NUM_SBOX_IRQ 8
 #define MIC_X100_NUM_RDMASR_IRQ 8
 #define MIC_X100_RDMASR_IRQ_BASE 17
+#define MIC_X100_SPAD2_DOWNLOAD_STATUS(x) ((x) & 0x1)
+#define MIC_X100_SPAD2_APIC_ID(x)	(((x) >> 1) & 0x1ff)
+#define MIC_X100_SPAD2_DOWNLOAD_ADDR(x) ((x) & 0xfffff000)
+#define MIC_X100_SBOX_APICICR7 0x0000AA08
+#define MIC_X100_SBOX_RGCR 0x00004010
+#define MIC_X100_SBOX_SDBIC0 0x0000CC90
+#define MIC_X100_DOWNLOAD_INFO 2
+#define MIC_X100_FW_SIZE 5
+#define MIC_X100_POSTCODE 0x242c
 
 static const u16 mic_x100_intr_init[] = {
 		MIC_X100_DOORBELL_IDX_START,
@@ -79,6 +88,9 @@ static const u16 mic_x100_intr_init[] = {
 		MIC_X100_NUM_ERR,
 };
 
+/* Host->Card(bootstrap) Interrupt Vector */
+#define MIC_X100_BSP_INTERRUPT_VECTOR 229
+
 extern struct mic_hw_ops mic_x100_ops;
 extern struct mic_smpt_ops mic_x100_smpt_ops;
 extern struct mic_hw_intr_ops mic_x100_intr_ops;
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 115add2515aa..d37263fbdca7 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -241,6 +241,7 @@ header-y += media.h
 header-y += mei.h
 header-y += mempolicy.h
 header-y += meye.h
+header-y += mic_common.h
 header-y += mii.h
 header-y += minix_fs.h
 header-y += mman.h
diff --git a/include/uapi/linux/mic_common.h b/include/uapi/linux/mic_common.h
new file mode 100644
index 000000000000..a9091e529183
--- /dev/null
+++ b/include/uapi/linux/mic_common.h
@@ -0,0 +1,74 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC driver.
+ *
+ */
+#ifndef __MIC_COMMON_H_
+#define __MIC_COMMON_H_
+
+#include <linux/types.h>
+
+/**
+ * struct mic_bootparam: Virtio device independent information in device page
+ *
+ * @magic: A magic value used by the card to ensure it can see the host
+ * @c2h_shutdown_db: Card to Host shutdown doorbell set by host
+ * @h2c_shutdown_db: Host to Card shutdown doorbell set by card
+ * @h2c_config_db: Host to Card Virtio config doorbell set by card
+ * @shutdown_status: Card shutdown status set by card
+ * @shutdown_card: Set to 1 by the host when a card shutdown is initiated
+ */
+struct mic_bootparam {
+	__u32 magic;
+	__s8 c2h_shutdown_db;
+	__s8 h2c_shutdown_db;
+	__s8 h2c_config_db;
+	__u8 shutdown_status;
+	__u8 shutdown_card;
+} __aligned(8);
+
+/* Device page size */
+#define MIC_DP_SIZE 4096
+
+#define MIC_MAGIC 0xc0ffee00
+
+/**
+ * enum mic_states - MIC states.
+ */
+enum mic_states {
+	MIC_OFFLINE = 0,
+	MIC_ONLINE,
+	MIC_SHUTTING_DOWN,
+	MIC_RESET_FAILED,
+	MIC_LAST
+};
+
+/**
+ * enum mic_status - MIC status reported by card after
+ * a host or card initiated shutdown or a card crash.
+ */
+enum mic_status {
+	MIC_NOP = 0,
+	MIC_CRASHED,
+	MIC_HALTED,
+	MIC_POWER_OFF,
+	MIC_RESTART,
+	MIC_STATUS_LAST
+};
+
+#endif
-- 
cgit v1.2.3


From f69bcbf3b4c4b333dcd7a48eaf868bf0c88edab5 Mon Sep 17 00:00:00 2001
From: Ashutosh Dixit <ashutosh.dixit@intel.com>
Date: Thu, 5 Sep 2013 16:42:18 -0700
Subject: Intel MIC Host Driver Changes for Virtio Devices.

This patch introduces the host "Virtio over PCIe" interface for
Intel MIC. It allows creating user space backends on the host and instantiating
virtio devices for them on the Intel MIC card. It uses the existing VRINGH
infrastructure in the kernel to access virtio rings from the host. A character
device per MIC is exposed with IOCTL, mmap and poll callbacks. This allows the
user space backend to:
(a) add/remove a virtio device via a device page.
(b) map (R/O) virtio rings and device page to user space.
(c) poll for availability of data.
(d) copy a descriptor or entire descriptor chain to/from the card.
(e) modify virtio configuration.
(f) handle virtio device reset.
The buffers are copied over using CPU copies for this initial patch
and host initiated MIC DMA support is planned for future patches.
The avail and desc virtio rings are in host memory and the used ring
is in card memory to maximize writes across PCIe for performance.

Co-author: Sudeep Dutt <sudeep.dutt@intel.com>
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Caz Yokoyama <Caz.Yokoyama@intel.com>
Signed-off-by: Dasaratharaman Chandramouli <dasaratharaman.chandramouli@intel.com>
Signed-off-by: Nikhil Rao <nikhil.rao@intel.com>
Signed-off-by: Harshavardhan R Kharche <harshavardhan.r.kharche@intel.com>
Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com>
Acked-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Reviewed-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mic/Kconfig             |   1 +
 drivers/misc/mic/common/mic_device.h |   7 +
 drivers/misc/mic/host/Makefile       |   2 +
 drivers/misc/mic/host/mic_boot.c     |   3 +-
 drivers/misc/mic/host/mic_debugfs.c  | 140 +++++++
 drivers/misc/mic/host/mic_device.h   |   5 +
 drivers/misc/mic/host/mic_fops.c     | 221 +++++++++++
 drivers/misc/mic/host/mic_fops.h     |  32 ++
 drivers/misc/mic/host/mic_main.c     |  26 ++
 drivers/misc/mic/host/mic_virtio.c   | 703 +++++++++++++++++++++++++++++++++++
 drivers/misc/mic/host/mic_virtio.h   | 138 +++++++
 include/uapi/linux/Kbuild            |   1 +
 include/uapi/linux/mic_common.h      | 166 ++++++++-
 include/uapi/linux/mic_ioctl.h       |  74 ++++
 14 files changed, 1517 insertions(+), 2 deletions(-)
 create mode 100644 drivers/misc/mic/host/mic_fops.c
 create mode 100644 drivers/misc/mic/host/mic_fops.h
 create mode 100644 drivers/misc/mic/host/mic_virtio.c
 create mode 100644 drivers/misc/mic/host/mic_virtio.h
 create mode 100644 include/uapi/linux/mic_ioctl.h

(limited to 'include/uapi/linux')

diff --git a/drivers/misc/mic/Kconfig b/drivers/misc/mic/Kconfig
index 279a2e649059..01f1a4a84c63 100644
--- a/drivers/misc/mic/Kconfig
+++ b/drivers/misc/mic/Kconfig
@@ -3,6 +3,7 @@ comment "Intel MIC Host Driver"
 config INTEL_MIC_HOST
 	tristate "Intel MIC Host Driver"
 	depends on 64BIT && PCI
+	select VHOST_RING
 	default N
 	help
 	  This enables Host Driver support for the Intel Many Integrated
diff --git a/drivers/misc/mic/common/mic_device.h b/drivers/misc/mic/common/mic_device.h
index 6440e9d58d3a..01eb74faae6b 100644
--- a/drivers/misc/mic/common/mic_device.h
+++ b/drivers/misc/mic/common/mic_device.h
@@ -41,4 +41,11 @@ struct mic_mw {
 #define MIC_DPLO_SPAD 14
 #define MIC_DPHI_SPAD 15
 
+/*
+ * These values are supposed to be in the config_change field of the
+ * device page when the host sends a config change interrupt to the card.
+ */
+#define MIC_VIRTIO_PARAM_DEV_REMOVE 0x1
+#define MIC_VIRTIO_PARAM_CONFIG_CHANGED 0x2
+
 #endif
diff --git a/drivers/misc/mic/host/Makefile b/drivers/misc/mic/host/Makefile
index a375dd32c3e2..c2197f999394 100644
--- a/drivers/misc/mic/host/Makefile
+++ b/drivers/misc/mic/host/Makefile
@@ -10,3 +10,5 @@ mic_host-objs += mic_smpt.o
 mic_host-objs += mic_intr.o
 mic_host-objs += mic_boot.o
 mic_host-objs += mic_debugfs.o
+mic_host-objs += mic_fops.o
+mic_host-objs += mic_virtio.o
diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c
index 936fc58084f3..fd9ff6d3784e 100644
--- a/drivers/misc/mic/host/mic_boot.c
+++ b/drivers/misc/mic/host/mic_boot.c
@@ -20,12 +20,12 @@
  */
 #include <linux/delay.h>
 #include <linux/firmware.h>
-#include <linux/interrupt.h>
 
 #include <linux/mic_common.h>
 #include "../common/mic_device.h"
 #include "mic_device.h"
 #include "mic_smpt.h"
+#include "mic_virtio.h"
 
 /**
  * mic_reset - Reset the MIC device.
@@ -117,6 +117,7 @@ void mic_stop(struct mic_device *mdev, bool force)
 {
 	mutex_lock(&mdev->mic_mutex);
 	if (MIC_OFFLINE != mdev->state || force) {
+		mic_virtio_reset_devices(mdev);
 		mic_bootparam_init(mdev);
 		mic_reset(mdev);
 		if (MIC_RESET_FAILED == mdev->state)
diff --git a/drivers/misc/mic/host/mic_debugfs.c b/drivers/misc/mic/host/mic_debugfs.c
index 78541d42aaf9..e22fb7bbbb98 100644
--- a/drivers/misc/mic/host/mic_debugfs.c
+++ b/drivers/misc/mic/host/mic_debugfs.c
@@ -26,6 +26,7 @@
 #include "../common/mic_device.h"
 #include "mic_device.h"
 #include "mic_smpt.h"
+#include "mic_virtio.h"
 
 /* Debugfs parent dir */
 static struct dentry *mic_dbg;
@@ -193,7 +194,13 @@ static const struct file_operations post_code_ops = {
 static int mic_dp_show(struct seq_file *s, void *pos)
 {
 	struct mic_device *mdev = s->private;
+	struct mic_device_desc *d;
+	struct mic_device_ctrl *dc;
+	struct mic_vqconfig *vqconfig;
+	__u32 *features;
+	__u8 *config;
 	struct mic_bootparam *bootparam = mdev->dp;
+	int i, j;
 
 	seq_printf(s, "Bootparam: magic 0x%x\n",
 		bootparam->magic);
@@ -208,6 +215,53 @@ static int mic_dp_show(struct seq_file *s, void *pos)
 	seq_printf(s, "Bootparam: shutdown_card %d\n",
 		bootparam->shutdown_card);
 
+	for (i = sizeof(*bootparam); i < MIC_DP_SIZE;
+	     i += mic_total_desc_size(d)) {
+		d = mdev->dp + i;
+		dc = (void *)d + mic_aligned_desc_size(d);
+
+		/* end of list */
+		if (d->type == 0)
+			break;
+
+		if (d->type == -1)
+			continue;
+
+		seq_printf(s, "Type %d ", d->type);
+		seq_printf(s, "Num VQ %d ", d->num_vq);
+		seq_printf(s, "Feature Len %d\n", d->feature_len);
+		seq_printf(s, "Config Len %d ", d->config_len);
+		seq_printf(s, "Shutdown Status %d\n", d->status);
+
+		for (j = 0; j < d->num_vq; j++) {
+			vqconfig = mic_vq_config(d) + j;
+			seq_printf(s, "vqconfig[%d]: ", j);
+			seq_printf(s, "address 0x%llx ", vqconfig->address);
+			seq_printf(s, "num %d ", vqconfig->num);
+			seq_printf(s, "used address 0x%llx\n",
+				vqconfig->used_address);
+		}
+
+		features = (__u32 *) mic_vq_features(d);
+		seq_printf(s, "Features: Host 0x%x ", features[0]);
+		seq_printf(s, "Guest 0x%x\n", features[1]);
+
+		config = mic_vq_configspace(d);
+		for (j = 0; j < d->config_len; j++)
+			seq_printf(s, "config[%d]=%d\n", j, config[j]);
+
+		seq_puts(s, "Device control:\n");
+		seq_printf(s, "Config Change %d ", dc->config_change);
+		seq_printf(s, "Vdev reset %d\n", dc->vdev_reset);
+		seq_printf(s, "Guest Ack %d ", dc->guest_ack);
+		seq_printf(s, "Host ack %d\n", dc->host_ack);
+		seq_printf(s, "Used address updated %d ",
+			dc->used_address_updated);
+		seq_printf(s, "Vdev 0x%llx\n", dc->vdev);
+		seq_printf(s, "c2h doorbell %d ", dc->c2h_vdev_db);
+		seq_printf(s, "h2c doorbell %d\n", dc->h2c_vdev_db);
+	}
+
 	return 0;
 }
 
@@ -229,6 +283,89 @@ static const struct file_operations dp_ops = {
 	.release = mic_dp_debug_release
 };
 
+static int mic_vdev_info_show(struct seq_file *s, void *unused)
+{
+	struct mic_device *mdev = s->private;
+	struct list_head *pos, *tmp;
+	struct mic_vdev *mvdev;
+	int i, j;
+
+	mutex_lock(&mdev->mic_mutex);
+	list_for_each_safe(pos, tmp, &mdev->vdev_list) {
+		mvdev = list_entry(pos, struct mic_vdev, list);
+		seq_printf(s, "VDEV type %d state %s in %ld out %ld\n",
+			mvdev->virtio_id,
+			mic_vdevup(mvdev) ? "UP" : "DOWN",
+			mvdev->in_bytes,
+			mvdev->out_bytes);
+		for (i = 0; i < MIC_MAX_VRINGS; i++) {
+			struct vring_desc *desc;
+			struct vring_avail *avail;
+			struct vring_used *used;
+			struct mic_vringh *mvr = &mvdev->mvr[i];
+			struct vringh *vrh = &mvr->vrh;
+			int num = vrh->vring.num;
+			if (!num)
+				continue;
+			desc = vrh->vring.desc;
+			seq_printf(s, "vring i %d avail_idx %d",
+				i, mvr->vring.info->avail_idx & (num - 1));
+			seq_printf(s, " vring i %d avail_idx %d\n",
+				i, mvr->vring.info->avail_idx);
+			seq_printf(s, "vrh i %d weak_barriers %d",
+				i, vrh->weak_barriers);
+			seq_printf(s, " last_avail_idx %d last_used_idx %d",
+				vrh->last_avail_idx, vrh->last_used_idx);
+			seq_printf(s, " completed %d\n", vrh->completed);
+			for (j = 0; j < num; j++) {
+				seq_printf(s, "desc[%d] addr 0x%llx len %d",
+					j, desc->addr, desc->len);
+				seq_printf(s, " flags 0x%x next %d\n",
+					desc->flags,
+					desc->next);
+				desc++;
+			}
+			avail = vrh->vring.avail;
+			seq_printf(s, "avail flags 0x%x idx %d\n",
+				avail->flags, avail->idx & (num - 1));
+			seq_printf(s, "avail flags 0x%x idx %d\n",
+				avail->flags, avail->idx);
+			for (j = 0; j < num; j++)
+				seq_printf(s, "avail ring[%d] %d\n",
+					j, avail->ring[j]);
+			used = vrh->vring.used;
+			seq_printf(s, "used flags 0x%x idx %d\n",
+				used->flags, used->idx & (num - 1));
+			seq_printf(s, "used flags 0x%x idx %d\n",
+				used->flags, used->idx);
+			for (j = 0; j < num; j++)
+				seq_printf(s, "used ring[%d] id %d len %d\n",
+					j, used->ring[j].id, used->ring[j].len);
+		}
+	}
+	mutex_unlock(&mdev->mic_mutex);
+
+	return 0;
+}
+
+static int mic_vdev_info_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mic_vdev_info_show, inode->i_private);
+}
+
+static int mic_vdev_info_debug_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations vdev_info_ops = {
+	.owner   = THIS_MODULE,
+	.open    = mic_vdev_info_debug_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = mic_vdev_info_debug_release
+};
+
 static int mic_msi_irq_info_show(struct seq_file *s, void *pos)
 {
 	struct mic_device *mdev  = s->private;
@@ -321,6 +458,9 @@ void mic_create_debug_dir(struct mic_device *mdev)
 	debugfs_create_file("dp", 0444, mdev->dbg_dir,
 		mdev, &dp_ops);
 
+	debugfs_create_file("vdev_info", 0444, mdev->dbg_dir,
+		mdev, &vdev_info_ops);
+
 	debugfs_create_file("msi_irq_info", 0444, mdev->dbg_dir,
 		mdev, &msi_irq_info_ops);
 }
diff --git a/drivers/misc/mic/host/mic_device.h b/drivers/misc/mic/host/mic_device.h
index 50b8b88d70d3..dcba2a59e77f 100644
--- a/drivers/misc/mic/host/mic_device.h
+++ b/drivers/misc/mic/host/mic_device.h
@@ -21,6 +21,7 @@
 #ifndef _MIC_DEVICE_H_
 #define _MIC_DEVICE_H_
 
+#include <linux/cdev.h>
 #include <linux/idr.h>
 
 #include "mic_intr.h"
@@ -80,6 +81,8 @@ enum mic_stepping {
  * @dp_dma_addr: virtio device page DMA address.
  * @shutdown_db: shutdown doorbell.
  * @shutdown_cookie: shutdown cookie.
+ * @cdev: Character device for MIC.
+ * @vdev_list: list of virtio devices.
  */
 struct mic_device {
 	struct mic_mw mmio;
@@ -113,6 +116,8 @@ struct mic_device {
 	dma_addr_t dp_dma_addr;
 	int shutdown_db;
 	struct mic_irq *shutdown_cookie;
+	struct cdev cdev;
+	struct list_head vdev_list;
 };
 
 /**
diff --git a/drivers/misc/mic/host/mic_fops.c b/drivers/misc/mic/host/mic_fops.c
new file mode 100644
index 000000000000..661469ad339d
--- /dev/null
+++ b/drivers/misc/mic/host/mic_fops.c
@@ -0,0 +1,221 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Host driver.
+ *
+ */
+#include <linux/poll.h>
+
+#include <linux/mic_common.h>
+#include "../common/mic_device.h"
+#include "mic_device.h"
+#include "mic_fops.h"
+#include "mic_virtio.h"
+
+int mic_open(struct inode *inode, struct file *f)
+{
+	struct mic_vdev *mvdev;
+	struct mic_device *mdev = container_of(inode->i_cdev,
+		struct mic_device, cdev);
+
+	mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL);
+	if (!mvdev)
+		return -ENOMEM;
+
+	init_waitqueue_head(&mvdev->waitq);
+	INIT_LIST_HEAD(&mvdev->list);
+	mvdev->mdev = mdev;
+	mvdev->virtio_id = -1;
+
+	f->private_data = mvdev;
+	return 0;
+}
+
+int mic_release(struct inode *inode, struct file *f)
+{
+	struct mic_vdev *mvdev = (struct mic_vdev *)f->private_data;
+
+	if (-1 != mvdev->virtio_id)
+		mic_virtio_del_device(mvdev);
+	f->private_data = NULL;
+	kfree(mvdev);
+	return 0;
+}
+
+long mic_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+	struct mic_vdev *mvdev = (struct mic_vdev *)f->private_data;
+	void __user *argp = (void __user *)arg;
+	int ret;
+
+	switch (cmd) {
+	case MIC_VIRTIO_ADD_DEVICE:
+	{
+		ret = mic_virtio_add_device(mvdev, argp);
+		if (ret < 0) {
+			dev_err(mic_dev(mvdev),
+				"%s %d errno ret %d\n",
+				__func__, __LINE__, ret);
+			return ret;
+		}
+		break;
+	}
+	case MIC_VIRTIO_COPY_DESC:
+	{
+		struct mic_copy_desc copy;
+
+		ret = mic_vdev_inited(mvdev);
+		if (ret)
+			return ret;
+
+		if (copy_from_user(&copy, argp, sizeof(copy)))
+			return -EFAULT;
+
+		dev_dbg(mic_dev(mvdev),
+			"%s %d === iovcnt 0x%x vr_idx 0x%x update_used %d\n",
+			__func__, __LINE__, copy.iovcnt, copy.vr_idx,
+			copy.update_used);
+
+		ret = mic_virtio_copy_desc(mvdev, &copy);
+		if (ret < 0) {
+			dev_err(mic_dev(mvdev),
+				"%s %d errno ret %d\n",
+				__func__, __LINE__, ret);
+			return ret;
+		}
+		if (copy_to_user(
+			&((struct mic_copy_desc __user *)argp)->out_len,
+			&copy.out_len, sizeof(copy.out_len))) {
+			dev_err(mic_dev(mvdev), "%s %d errno ret %d\n",
+				__func__, __LINE__, -EFAULT);
+			return -EFAULT;
+		}
+		break;
+	}
+	case MIC_VIRTIO_CONFIG_CHANGE:
+	{
+		ret = mic_vdev_inited(mvdev);
+		if (ret)
+			return ret;
+
+		ret = mic_virtio_config_change(mvdev, argp);
+		if (ret < 0) {
+			dev_err(mic_dev(mvdev),
+				"%s %d errno ret %d\n",
+				__func__, __LINE__, ret);
+			return ret;
+		}
+		break;
+	}
+	default:
+		return -ENOIOCTLCMD;
+	};
+	return 0;
+}
+
+/*
+ * We return POLLIN | POLLOUT from poll when new buffers are enqueued, and
+ * not when previously enqueued buffers may be available. This means that
+ * in the card->host (TX) path, when userspace is unblocked by poll it
+ * must drain all available descriptors or it can stall.
+ */
+unsigned int mic_poll(struct file *f, poll_table *wait)
+{
+	struct mic_vdev *mvdev = (struct mic_vdev *)f->private_data;
+	int mask = 0;
+
+	poll_wait(f, &mvdev->waitq, wait);
+
+	if (mic_vdev_inited(mvdev))
+		mask = POLLERR;
+	else if (mvdev->poll_wake) {
+		mvdev->poll_wake = 0;
+		mask = POLLIN | POLLOUT;
+	}
+
+	return mask;
+}
+
+static inline int
+mic_query_offset(struct mic_vdev *mvdev, unsigned long offset,
+	unsigned long *size, unsigned long *pa)
+{
+	struct mic_device *mdev = mvdev->mdev;
+	unsigned long start = MIC_DP_SIZE;
+	int i;
+
+	/*
+	 * MMAP interface is as follows:
+	 * offset				region
+	 * 0x0					virtio device_page
+	 * 0x1000				first vring
+	 * 0x1000 + size of 1st vring		second vring
+	 * ....
+	 */
+	if (!offset) {
+		*pa = virt_to_phys(mdev->dp);
+		*size = MIC_DP_SIZE;
+		return 0;
+	}
+
+	for (i = 0; i < mvdev->dd->num_vq; i++) {
+		struct mic_vringh *mvr = &mvdev->mvr[i];
+		if (offset == start) {
+			*pa = virt_to_phys(mvr->vring.va);
+			*size = mvr->vring.len;
+			return 0;
+		}
+		start += mvr->vring.len;
+	}
+	return -1;
+}
+
+/*
+ * Maps the device page and virtio rings to user space for readonly access.
+ */
+int
+mic_mmap(struct file *f, struct vm_area_struct *vma)
+{
+	struct mic_vdev *mvdev = (struct mic_vdev *)f->private_data;
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long pa, size = vma->vm_end - vma->vm_start, size_rem = size;
+	int i, err;
+
+	err = mic_vdev_inited(mvdev);
+	if (err)
+		return err;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EACCES;
+
+	while (size_rem) {
+		i = mic_query_offset(mvdev, offset, &size, &pa);
+		if (i < 0)
+			return -EINVAL;
+		err = remap_pfn_range(vma, vma->vm_start + offset,
+			pa >> PAGE_SHIFT, size, vma->vm_page_prot);
+		if (err)
+			return err;
+		dev_dbg(mic_dev(mvdev),
+			"%s %d type %d size 0x%lx off 0x%lx pa 0x%lx vma 0x%lx\n",
+			__func__, __LINE__, mvdev->virtio_id, size, offset,
+			pa, vma->vm_start + offset);
+		size_rem -= size;
+		offset += size;
+	}
+	return 0;
+}
diff --git a/drivers/misc/mic/host/mic_fops.h b/drivers/misc/mic/host/mic_fops.h
new file mode 100644
index 000000000000..dc3893dff667
--- /dev/null
+++ b/drivers/misc/mic/host/mic_fops.h
@@ -0,0 +1,32 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Host driver.
+ *
+ */
+#ifndef _MIC_FOPS_H_
+#define _MIC_FOPS_H_
+
+int mic_open(struct inode *inode, struct file *filp);
+int mic_release(struct inode *inode, struct file *filp);
+ssize_t mic_read(struct file *filp, char __user *buf,
+			size_t count, loff_t *pos);
+long mic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+int mic_mmap(struct file *f, struct vm_area_struct *vma);
+unsigned int mic_poll(struct file *f, poll_table *wait);
+
+#endif
diff --git a/drivers/misc/mic/host/mic_main.c b/drivers/misc/mic/host/mic_main.c
index 998a20ab7e96..a8965d496e84 100644
--- a/drivers/misc/mic/host/mic_main.c
+++ b/drivers/misc/mic/host/mic_main.c
@@ -25,12 +25,15 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/poll.h>
 
 #include <linux/mic_common.h>
 #include "../common/mic_device.h"
 #include "mic_device.h"
 #include "mic_x100.h"
 #include "mic_smpt.h"
+#include "mic_fops.h"
+#include "mic_virtio.h"
 
 static const char mic_driver_name[] = "mic";
 
@@ -64,6 +67,15 @@ static struct class *g_mic_class;
 /* Base device node number for MIC devices */
 static dev_t g_mic_devno;
 
+static const struct file_operations mic_fops = {
+	.open = mic_open,
+	.release = mic_release,
+	.unlocked_ioctl = mic_ioctl,
+	.poll = mic_poll,
+	.mmap = mic_mmap,
+	.owner = THIS_MODULE,
+};
+
 /* Initialize the device page */
 static int mic_dp_init(struct mic_device *mdev)
 {
@@ -193,6 +205,7 @@ mic_device_init(struct mic_device *mdev, struct pci_dev *pdev)
 	mdev->irq_info.next_avail_src = 0;
 	INIT_WORK(&mdev->reset_trigger_work, mic_reset_trigger_work);
 	INIT_WORK(&mdev->shutdown_work, mic_shutdown_work);
+	INIT_LIST_HEAD(&mdev->vdev_list);
 }
 
 /**
@@ -330,7 +343,19 @@ static int mic_probe(struct pci_dev *pdev,
 	mic_bootparam_init(mdev);
 
 	mic_create_debug_dir(mdev);
+	cdev_init(&mdev->cdev, &mic_fops);
+	mdev->cdev.owner = THIS_MODULE;
+	rc = cdev_add(&mdev->cdev, MKDEV(MAJOR(g_mic_devno), mdev->id), 1);
+	if (rc) {
+		dev_err(&pdev->dev, "cdev_add err id %d rc %d\n", mdev->id, rc);
+		goto cleanup_debug_dir;
+	}
 	return 0;
+cleanup_debug_dir:
+	mic_delete_debug_dir(mdev);
+	mutex_lock(&mdev->mic_mutex);
+	mic_free_irq(mdev, mdev->shutdown_cookie, mdev);
+	mutex_unlock(&mdev->mic_mutex);
 dp_uninit:
 	mic_dp_uninit(mdev);
 sysfs_put:
@@ -375,6 +400,7 @@ static void mic_remove(struct pci_dev *pdev)
 		return;
 
 	mic_stop(mdev, false);
+	cdev_del(&mdev->cdev);
 	mic_delete_debug_dir(mdev);
 	mutex_lock(&mdev->mic_mutex);
 	mic_free_irq(mdev, mdev->shutdown_cookie, mdev);
diff --git a/drivers/misc/mic/host/mic_virtio.c b/drivers/misc/mic/host/mic_virtio.c
new file mode 100644
index 000000000000..be2a1f06c4ca
--- /dev/null
+++ b/drivers/misc/mic/host/mic_virtio.c
@@ -0,0 +1,703 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Host driver.
+ *
+ */
+#include <linux/pci.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+#include <linux/mic_common.h>
+#include "../common/mic_device.h"
+#include "mic_device.h"
+#include "mic_smpt.h"
+#include "mic_virtio.h"
+
+/*
+ * Initiates the copies across the PCIe bus from card memory to
+ * a user space buffer.
+ */
+static int mic_virtio_copy_to_user(struct mic_vdev *mvdev,
+		void __user *ubuf, size_t len, u64 addr)
+{
+	int err;
+	void __iomem *dbuf = mvdev->mdev->aper.va + addr;
+	/*
+	 * We are copying from IO below an should ideally use something
+	 * like copy_to_user_fromio(..) if it existed.
+	 */
+	if (copy_to_user(ubuf, dbuf, len)) {
+		err = -EFAULT;
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto err;
+	}
+	mvdev->in_bytes += len;
+	err = 0;
+err:
+	return err;
+}
+
+/*
+ * Initiates copies across the PCIe bus from a user space
+ * buffer to card memory.
+ */
+static int mic_virtio_copy_from_user(struct mic_vdev *mvdev,
+		void __user *ubuf, size_t len, u64 addr)
+{
+	int err;
+	void __iomem *dbuf = mvdev->mdev->aper.va + addr;
+	/*
+	 * We are copying to IO below and should ideally use something
+	 * like copy_from_user_toio(..) if it existed.
+	 */
+	if (copy_from_user(dbuf, ubuf, len)) {
+		err = -EFAULT;
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto err;
+	}
+	mvdev->out_bytes += len;
+	err = 0;
+err:
+	return err;
+}
+
+#define MIC_VRINGH_READ true
+
+/* The function to call to notify the card about added buffers */
+static void mic_notify(struct vringh *vrh)
+{
+	struct mic_vringh *mvrh = container_of(vrh, struct mic_vringh, vrh);
+	struct mic_vdev *mvdev = mvrh->mvdev;
+	s8 db = mvdev->dc->h2c_vdev_db;
+
+	if (db != -1)
+		mvdev->mdev->ops->send_intr(mvdev->mdev, db);
+}
+
+/* Determine the total number of bytes consumed in a VRINGH KIOV */
+static inline u32 mic_vringh_iov_consumed(struct vringh_kiov *iov)
+{
+	int i;
+	u32 total = iov->consumed;
+
+	for (i = 0; i < iov->i; i++)
+		total += iov->iov[i].iov_len;
+	return total;
+}
+
+/*
+ * Traverse the VRINGH KIOV and issue the APIs to trigger the copies.
+ * This API is heavily based on the vringh_iov_xfer(..) implementation
+ * in vringh.c. The reason we cannot reuse vringh_iov_pull_kern(..)
+ * and vringh_iov_push_kern(..) directly is because there is no
+ * way to override the VRINGH xfer(..) routines as of v3.10.
+ */
+static int mic_vringh_copy(struct mic_vdev *mvdev, struct vringh_kiov *iov,
+	void __user *ubuf, size_t len, bool read, size_t *out_len)
+{
+	int ret = 0;
+	size_t partlen, tot_len = 0;
+
+	while (len && iov->i < iov->used) {
+		partlen = min(iov->iov[iov->i].iov_len, len);
+		if (read)
+			ret = mic_virtio_copy_to_user(mvdev,
+				ubuf, partlen,
+				(u64)iov->iov[iov->i].iov_base);
+		else
+			ret = mic_virtio_copy_from_user(mvdev,
+				ubuf, partlen,
+				(u64)iov->iov[iov->i].iov_base);
+		if (ret) {
+			dev_err(mic_dev(mvdev), "%s %d err %d\n",
+				__func__, __LINE__, ret);
+			break;
+		}
+		len -= partlen;
+		ubuf += partlen;
+		tot_len += partlen;
+		iov->consumed += partlen;
+		iov->iov[iov->i].iov_len -= partlen;
+		iov->iov[iov->i].iov_base += partlen;
+		if (!iov->iov[iov->i].iov_len) {
+			/* Fix up old iov element then increment. */
+			iov->iov[iov->i].iov_len = iov->consumed;
+			iov->iov[iov->i].iov_base -= iov->consumed;
+
+			iov->consumed = 0;
+			iov->i++;
+		}
+	}
+	*out_len = tot_len;
+	return ret;
+}
+
+/*
+ * Use the standard VRINGH infrastructure in the kernel to fetch new
+ * descriptors, initiate the copies and update the used ring.
+ */
+static int _mic_virtio_copy(struct mic_vdev *mvdev,
+	struct mic_copy_desc *copy)
+{
+	int ret = 0, iovcnt = copy->iovcnt;
+	struct iovec iov;
+	struct iovec __user *u_iov = copy->iov;
+	void __user *ubuf = NULL;
+	struct mic_vringh *mvr = &mvdev->mvr[copy->vr_idx];
+	struct vringh_kiov *riov = &mvr->riov;
+	struct vringh_kiov *wiov = &mvr->wiov;
+	struct vringh *vrh = &mvr->vrh;
+	u16 *head = &mvr->head;
+	struct mic_vring *vr = &mvr->vring;
+	size_t len = 0, out_len;
+
+	copy->out_len = 0;
+	/* Fetch a new IOVEC if all previous elements have been processed */
+	if (riov->i == riov->used && wiov->i == wiov->used) {
+		ret = vringh_getdesc_kern(vrh, riov, wiov,
+				head, GFP_KERNEL);
+		/* Check if there are available descriptors */
+		if (ret <= 0)
+			return ret;
+	}
+	while (iovcnt) {
+		if (!len) {
+			/* Copy over a new iovec from user space. */
+			ret = copy_from_user(&iov, u_iov, sizeof(*u_iov));
+			if (ret) {
+				ret = -EINVAL;
+				dev_err(mic_dev(mvdev), "%s %d err %d\n",
+					__func__, __LINE__, ret);
+				break;
+			}
+			len = iov.iov_len;
+			ubuf = iov.iov_base;
+		}
+		/* Issue all the read descriptors first */
+		ret = mic_vringh_copy(mvdev, riov, ubuf, len,
+			MIC_VRINGH_READ, &out_len);
+		if (ret) {
+			dev_err(mic_dev(mvdev), "%s %d err %d\n",
+					__func__, __LINE__, ret);
+			break;
+		}
+		len -= out_len;
+		ubuf += out_len;
+		copy->out_len += out_len;
+		/* Issue the write descriptors next */
+		ret = mic_vringh_copy(mvdev, wiov, ubuf, len,
+			!MIC_VRINGH_READ, &out_len);
+		if (ret) {
+			dev_err(mic_dev(mvdev), "%s %d err %d\n",
+					__func__, __LINE__, ret);
+			break;
+		}
+		len -= out_len;
+		ubuf += out_len;
+		copy->out_len += out_len;
+		if (!len) {
+			/* One user space iovec is now completed */
+			iovcnt--;
+			u_iov++;
+		}
+		/* Exit loop if all elements in KIOVs have been processed. */
+		if (riov->i == riov->used && wiov->i == wiov->used)
+			break;
+	}
+	/*
+	 * Update the used ring if a descriptor was available and some data was
+	 * copied in/out and the user asked for a used ring update.
+	 */
+	if (*head != USHRT_MAX && copy->out_len &&
+		copy->update_used) {
+		u32 total = 0;
+
+		/* Determine the total data consumed */
+		total += mic_vringh_iov_consumed(riov);
+		total += mic_vringh_iov_consumed(wiov);
+		vringh_complete_kern(vrh, *head, total);
+		*head = USHRT_MAX;
+		if (vringh_need_notify_kern(vrh) > 0)
+			vringh_notify(vrh);
+		vringh_kiov_cleanup(riov);
+		vringh_kiov_cleanup(wiov);
+		/* Update avail idx for user space */
+		vr->info->avail_idx = vrh->last_avail_idx;
+	}
+	return ret;
+}
+
+static inline int mic_verify_copy_args(struct mic_vdev *mvdev,
+		struct mic_copy_desc *copy)
+{
+	if (copy->vr_idx >= mvdev->dd->num_vq) {
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, -EINVAL);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* Copy a specified number of virtio descriptors in a chain */
+int mic_virtio_copy_desc(struct mic_vdev *mvdev,
+		struct mic_copy_desc *copy)
+{
+	int err;
+	struct mic_vringh *mvr = &mvdev->mvr[copy->vr_idx];
+
+	err = mic_verify_copy_args(mvdev, copy);
+	if (err)
+		return err;
+
+	mutex_lock(&mvr->vr_mutex);
+	if (!mic_vdevup(mvdev)) {
+		err = -ENODEV;
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto err;
+	}
+	err = _mic_virtio_copy(mvdev, copy);
+	if (err) {
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, err);
+	}
+err:
+	mutex_unlock(&mvr->vr_mutex);
+	return err;
+}
+
+static void mic_virtio_init_post(struct mic_vdev *mvdev)
+{
+	struct mic_vqconfig *vqconfig = mic_vq_config(mvdev->dd);
+	int i;
+
+	for (i = 0; i < mvdev->dd->num_vq; i++) {
+		if (!le64_to_cpu(vqconfig[i].used_address)) {
+			dev_warn(mic_dev(mvdev), "used_address zero??\n");
+			continue;
+		}
+		mvdev->mvr[i].vrh.vring.used =
+			mvdev->mdev->aper.va +
+			le64_to_cpu(vqconfig[i].used_address);
+	}
+
+	mvdev->dc->used_address_updated = 0;
+
+	dev_dbg(mic_dev(mvdev), "%s: device type %d LINKUP\n",
+		__func__, mvdev->virtio_id);
+}
+
+static inline void mic_virtio_device_reset(struct mic_vdev *mvdev)
+{
+	int i;
+
+	dev_dbg(mic_dev(mvdev), "%s: status %d device type %d RESET\n",
+		__func__, mvdev->dd->status, mvdev->virtio_id);
+
+	for (i = 0; i < mvdev->dd->num_vq; i++)
+		/*
+		 * Avoid lockdep false positive. The + 1 is for the mic
+		 * mutex which is held in the reset devices code path.
+		 */
+		mutex_lock_nested(&mvdev->mvr[i].vr_mutex, i + 1);
+
+	/* 0 status means "reset" */
+	mvdev->dd->status = 0;
+	mvdev->dc->vdev_reset = 0;
+	mvdev->dc->host_ack = 1;
+
+	for (i = 0; i < mvdev->dd->num_vq; i++) {
+		struct vringh *vrh = &mvdev->mvr[i].vrh;
+		mvdev->mvr[i].vring.info->avail_idx = 0;
+		vrh->completed = 0;
+		vrh->last_avail_idx = 0;
+		vrh->last_used_idx = 0;
+	}
+
+	for (i = 0; i < mvdev->dd->num_vq; i++)
+		mutex_unlock(&mvdev->mvr[i].vr_mutex);
+}
+
+void mic_virtio_reset_devices(struct mic_device *mdev)
+{
+	struct list_head *pos, *tmp;
+	struct mic_vdev *mvdev;
+
+	dev_dbg(mdev->sdev->parent, "%s\n",  __func__);
+
+	list_for_each_safe(pos, tmp, &mdev->vdev_list) {
+		mvdev = list_entry(pos, struct mic_vdev, list);
+		mic_virtio_device_reset(mvdev);
+		mvdev->poll_wake = 1;
+		wake_up(&mvdev->waitq);
+	}
+}
+
+void mic_bh_handler(struct work_struct *work)
+{
+	struct mic_vdev *mvdev = container_of(work, struct mic_vdev,
+			virtio_bh_work);
+
+	if (mvdev->dc->used_address_updated)
+		mic_virtio_init_post(mvdev);
+
+	if (mvdev->dc->vdev_reset)
+		mic_virtio_device_reset(mvdev);
+
+	mvdev->poll_wake = 1;
+	wake_up(&mvdev->waitq);
+}
+
+static irqreturn_t mic_virtio_intr_handler(int irq, void *data)
+{
+
+	struct mic_vdev *mvdev = data;
+	struct mic_device *mdev = mvdev->mdev;
+
+	mdev->ops->ack_interrupt(mdev);
+	schedule_work(&mvdev->virtio_bh_work);
+	return IRQ_HANDLED;
+}
+
+int mic_virtio_config_change(struct mic_vdev *mvdev,
+			void __user *argp)
+{
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wake);
+	int ret = 0, retry = 100, i;
+	struct mic_bootparam *bootparam = mvdev->mdev->dp;
+	s8 db = bootparam->h2c_config_db;
+
+	mutex_lock(&mvdev->mdev->mic_mutex);
+	for (i = 0; i < mvdev->dd->num_vq; i++)
+		mutex_lock_nested(&mvdev->mvr[i].vr_mutex, i + 1);
+
+	if (db == -1 || mvdev->dd->type == -1) {
+		ret = -EIO;
+		goto exit;
+	}
+
+	if (copy_from_user(mic_vq_configspace(mvdev->dd),
+				argp, mvdev->dd->config_len)) {
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, -EFAULT);
+		ret = -EFAULT;
+		goto exit;
+	}
+	mvdev->dc->config_change = MIC_VIRTIO_PARAM_CONFIG_CHANGED;
+	mvdev->mdev->ops->send_intr(mvdev->mdev, db);
+
+	for (i = retry; i--;) {
+		ret = wait_event_timeout(wake,
+			mvdev->dc->guest_ack, msecs_to_jiffies(100));
+		if (ret)
+			break;
+	}
+
+	dev_dbg(mic_dev(mvdev),
+		"%s %d retry: %d\n", __func__, __LINE__, retry);
+	mvdev->dc->config_change = 0;
+	mvdev->dc->guest_ack = 0;
+exit:
+	for (i = 0; i < mvdev->dd->num_vq; i++)
+		mutex_unlock(&mvdev->mvr[i].vr_mutex);
+	mutex_unlock(&mvdev->mdev->mic_mutex);
+	return ret;
+}
+
+static int mic_copy_dp_entry(struct mic_vdev *mvdev,
+					void __user *argp,
+					__u8 *type,
+					struct mic_device_desc **devpage)
+{
+	struct mic_device *mdev = mvdev->mdev;
+	struct mic_device_desc dd, *dd_config, *devp;
+	struct mic_vqconfig *vqconfig;
+	int ret = 0, i;
+	bool slot_found = false;
+
+	if (copy_from_user(&dd, argp, sizeof(dd))) {
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, -EFAULT);
+		return -EFAULT;
+	}
+
+	if (mic_aligned_desc_size(&dd) > MIC_MAX_DESC_BLK_SIZE
+		|| dd.num_vq > MIC_MAX_VRINGS) {
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, -EINVAL);
+		return -EINVAL;
+	}
+
+	dd_config = kmalloc(mic_desc_size(&dd), GFP_KERNEL);
+	if (dd_config == NULL) {
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, -ENOMEM);
+		return -ENOMEM;
+	}
+	if (copy_from_user(dd_config, argp, mic_desc_size(&dd))) {
+		ret = -EFAULT;
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, ret);
+		goto exit;
+	}
+
+	vqconfig = mic_vq_config(dd_config);
+	for (i = 0; i < dd.num_vq; i++) {
+		if (le16_to_cpu(vqconfig[i].num) > MIC_MAX_VRING_ENTRIES) {
+			ret =  -EINVAL;
+			dev_err(mic_dev(mvdev), "%s %d err %d\n",
+				__func__, __LINE__, ret);
+			goto exit;
+		}
+	}
+
+	/* Find the first free device page entry */
+	for (i = mic_aligned_size(struct mic_bootparam);
+		i < MIC_DP_SIZE - mic_total_desc_size(dd_config);
+		i += mic_total_desc_size(devp)) {
+		devp = mdev->dp + i;
+		if (devp->type == 0 || devp->type == -1) {
+			slot_found = true;
+			break;
+		}
+	}
+	if (!slot_found) {
+		ret =  -EINVAL;
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, ret);
+		goto exit;
+	}
+	/*
+	 * Save off the type before doing the memcpy. Type will be set in the
+	 * end after completing all initialization for the new device.
+	 */
+	*type = dd_config->type;
+	dd_config->type = 0;
+	memcpy(devp, dd_config, mic_desc_size(dd_config));
+
+	*devpage = devp;
+exit:
+	kfree(dd_config);
+	return ret;
+}
+
+static void mic_init_device_ctrl(struct mic_vdev *mvdev,
+				struct mic_device_desc *devpage)
+{
+	struct mic_device_ctrl *dc;
+
+	dc = mvdev->dc = (void *)devpage + mic_aligned_desc_size(devpage);
+
+	dc->config_change = 0;
+	dc->guest_ack = 0;
+	dc->vdev_reset = 0;
+	dc->host_ack = 0;
+	dc->used_address_updated = 0;
+	dc->c2h_vdev_db = -1;
+	dc->h2c_vdev_db = -1;
+}
+
+int mic_virtio_add_device(struct mic_vdev *mvdev,
+			void __user *argp)
+{
+	struct mic_device *mdev = mvdev->mdev;
+	struct mic_device_desc *dd;
+	struct mic_vqconfig *vqconfig;
+	int vr_size, i, j, ret;
+	u8 type;
+	s8 db;
+	char irqname[10];
+	struct mic_bootparam *bootparam = mdev->dp;
+	u16 num;
+
+	mutex_lock(&mdev->mic_mutex);
+
+	ret = mic_copy_dp_entry(mvdev, argp, &type, &dd);
+	if (ret) {
+		mutex_unlock(&mdev->mic_mutex);
+		return ret;
+	}
+
+	mic_init_device_ctrl(mvdev, dd);
+
+	mvdev->dd = dd;
+	mvdev->virtio_id = type;
+	vqconfig = mic_vq_config(dd);
+	INIT_WORK(&mvdev->virtio_bh_work, mic_bh_handler);
+
+	for (i = 0; i < dd->num_vq; i++) {
+		struct mic_vringh *mvr = &mvdev->mvr[i];
+		struct mic_vring *vr = &mvdev->mvr[i].vring;
+		num = le16_to_cpu(vqconfig[i].num);
+		mutex_init(&mvr->vr_mutex);
+		vr_size = PAGE_ALIGN(vring_size(num, MIC_VIRTIO_RING_ALIGN) +
+			sizeof(struct _mic_vring_info));
+		vr->va = (void *)
+			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+			get_order(vr_size));
+		if (!vr->va) {
+			ret = -ENOMEM;
+			dev_err(mic_dev(mvdev), "%s %d err %d\n",
+				__func__, __LINE__, ret);
+			goto err;
+		}
+		vr->len = vr_size;
+		vr->info = vr->va + vring_size(num, MIC_VIRTIO_RING_ALIGN);
+		vr->info->magic = MIC_MAGIC + mvdev->virtio_id + i;
+		vqconfig[i].address = mic_map_single(mdev,
+			vr->va, vr_size);
+		if (mic_map_error(vqconfig[i].address)) {
+			free_pages((unsigned long)vr->va,
+				get_order(vr_size));
+			ret = -ENOMEM;
+			dev_err(mic_dev(mvdev), "%s %d err %d\n",
+				__func__, __LINE__, ret);
+			goto err;
+		}
+		vqconfig[i].address = cpu_to_le64(vqconfig[i].address);
+
+		vring_init(&vr->vr, num,
+			vr->va, MIC_VIRTIO_RING_ALIGN);
+		ret = vringh_init_kern(&mvr->vrh,
+			*(u32 *)mic_vq_features(mvdev->dd), num, false,
+			vr->vr.desc, vr->vr.avail, vr->vr.used);
+		if (ret) {
+			dev_err(mic_dev(mvdev), "%s %d err %d\n",
+				__func__, __LINE__, ret);
+			goto err;
+		}
+		vringh_kiov_init(&mvr->riov, NULL, 0);
+		vringh_kiov_init(&mvr->wiov, NULL, 0);
+		mvr->head = USHRT_MAX;
+		mvr->mvdev = mvdev;
+		mvr->vrh.notify = mic_notify;
+		dev_dbg(mdev->sdev->parent,
+			"%s %d index %d va %p info %p vr_size 0x%x\n",
+			__func__, __LINE__, i, vr->va, vr->info, vr_size);
+	}
+
+	snprintf(irqname, sizeof(irqname),
+		"mic%dvirtio%d", mdev->id, mvdev->virtio_id);
+	mvdev->virtio_db = mic_next_db(mdev);
+	mvdev->virtio_cookie = mic_request_irq(mdev, mic_virtio_intr_handler,
+			irqname, mvdev, mvdev->virtio_db, MIC_INTR_DB);
+	if (IS_ERR(mvdev->virtio_cookie)) {
+		ret = PTR_ERR(mvdev->virtio_cookie);
+		dev_dbg(mdev->sdev->parent, "request irq failed\n");
+		goto err;
+	}
+
+	mvdev->dc->c2h_vdev_db = mvdev->virtio_db;
+
+	list_add_tail(&mvdev->list, &mdev->vdev_list);
+	/*
+	 * Order the type update with previous stores. This write barrier
+	 * is paired with the corresponding read barrier before the uncached
+	 * system memory read of the type, on the card while scanning the
+	 * device page.
+	 */
+	smp_wmb();
+	dd->type = type;
+
+	dev_dbg(mdev->sdev->parent, "Added virtio device id %d\n", dd->type);
+
+	db = bootparam->h2c_config_db;
+	if (db != -1)
+		mdev->ops->send_intr(mdev, db);
+	mutex_unlock(&mdev->mic_mutex);
+	return 0;
+err:
+	vqconfig = mic_vq_config(dd);
+	for (j = 0; j < i; j++) {
+		struct mic_vringh *mvr = &mvdev->mvr[j];
+		mic_unmap_single(mdev, le64_to_cpu(vqconfig[j].address),
+				mvr->vring.len);
+		free_pages((unsigned long)mvr->vring.va,
+			get_order(mvr->vring.len));
+	}
+	mutex_unlock(&mdev->mic_mutex);
+	return ret;
+}
+
+void mic_virtio_del_device(struct mic_vdev *mvdev)
+{
+	struct list_head *pos, *tmp;
+	struct mic_vdev *tmp_mvdev;
+	struct mic_device *mdev = mvdev->mdev;
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wake);
+	int i, ret, retry = 100;
+	struct mic_vqconfig *vqconfig;
+	struct mic_bootparam *bootparam = mdev->dp;
+	s8 db;
+
+	mutex_lock(&mdev->mic_mutex);
+	db = bootparam->h2c_config_db;
+	if (db == -1)
+		goto skip_hot_remove;
+	dev_dbg(mdev->sdev->parent,
+		"Requesting hot remove id %d\n", mvdev->virtio_id);
+	mvdev->dc->config_change = MIC_VIRTIO_PARAM_DEV_REMOVE;
+	mdev->ops->send_intr(mdev, db);
+	for (i = retry; i--;) {
+		ret = wait_event_timeout(wake,
+			mvdev->dc->guest_ack, msecs_to_jiffies(100));
+		if (ret)
+			break;
+	}
+	dev_dbg(mdev->sdev->parent,
+		"Device id %d config_change %d guest_ack %d\n",
+		mvdev->virtio_id, mvdev->dc->config_change,
+		mvdev->dc->guest_ack);
+	mvdev->dc->config_change = 0;
+	mvdev->dc->guest_ack = 0;
+skip_hot_remove:
+	mic_free_irq(mdev, mvdev->virtio_cookie, mvdev);
+	flush_work(&mvdev->virtio_bh_work);
+	vqconfig = mic_vq_config(mvdev->dd);
+	for (i = 0; i < mvdev->dd->num_vq; i++) {
+		struct mic_vringh *mvr = &mvdev->mvr[i];
+		vringh_kiov_cleanup(&mvr->riov);
+		vringh_kiov_cleanup(&mvr->wiov);
+		mic_unmap_single(mdev, le64_to_cpu(vqconfig[i].address),
+				mvr->vring.len);
+		free_pages((unsigned long)mvr->vring.va,
+			get_order(mvr->vring.len));
+	}
+
+	list_for_each_safe(pos, tmp, &mdev->vdev_list) {
+		tmp_mvdev = list_entry(pos, struct mic_vdev, list);
+		if (tmp_mvdev == mvdev) {
+			list_del(pos);
+			dev_dbg(mdev->sdev->parent,
+				"Removing virtio device id %d\n",
+				mvdev->virtio_id);
+			break;
+		}
+	}
+	/*
+	 * Order the type update with previous stores. This write barrier
+	 * is paired with the corresponding read barrier before the uncached
+	 * system memory read of the type, on the card while scanning the
+	 * device page.
+	 */
+	smp_wmb();
+	mvdev->dd->type = -1;
+	mutex_unlock(&mdev->mic_mutex);
+}
diff --git a/drivers/misc/mic/host/mic_virtio.h b/drivers/misc/mic/host/mic_virtio.h
new file mode 100644
index 000000000000..184f3c84805b
--- /dev/null
+++ b/drivers/misc/mic/host/mic_virtio.h
@@ -0,0 +1,138 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Host driver.
+ *
+ */
+#ifndef MIC_VIRTIO_H
+#define MIC_VIRTIO_H
+
+#include <linux/virtio_config.h>
+#include <linux/mic_ioctl.h>
+
+/*
+ * Note on endianness.
+ * 1. Host can be both BE or LE
+ * 2. Guest/card is LE. Host uses le_to_cpu to access desc/avail
+ *    rings and ioreadXX/iowriteXX to access used ring.
+ * 3. Device page exposed by host to guest contains LE values. Guest
+ *    accesses these using ioreadXX/iowriteXX etc. This way in general we
+ *    obey the virtio spec according to which guest works with native
+ *    endianness and host is aware of guest endianness and does all
+ *    required endianness conversion.
+ * 4. Data provided from user space to guest (in ADD_DEVICE and
+ *    CONFIG_CHANGE ioctl's) is not interpreted by the driver and should be
+ *    in guest endianness.
+ */
+
+/**
+ * struct mic_vringh - Virtio ring host information.
+ *
+ * @vring: The MIC vring used for setting up user space mappings.
+ * @vrh: The host VRINGH used for accessing the card vrings.
+ * @riov: The VRINGH read kernel IOV.
+ * @wiov: The VRINGH write kernel IOV.
+ * @head: The VRINGH head index address passed to vringh_getdesc_kern(..).
+ * @vr_mutex: Mutex for synchronizing access to the VRING.
+ * @mvdev: Back pointer to MIC virtio device for vringh_notify(..).
+ */
+struct mic_vringh {
+	struct mic_vring vring;
+	struct vringh vrh;
+	struct vringh_kiov riov;
+	struct vringh_kiov wiov;
+	u16 head;
+	struct mutex vr_mutex;
+	struct mic_vdev *mvdev;
+};
+
+/**
+ * struct mic_vdev - Host information for a card Virtio device.
+ *
+ * @virtio_id - Virtio device id.
+ * @waitq - Waitqueue to allow ring3 apps to poll.
+ * @mdev - Back pointer to host MIC device.
+ * @poll_wake - Used for waking up threads blocked in poll.
+ * @out_bytes - Debug stats for number of bytes copied from host to card.
+ * @in_bytes - Debug stats for number of bytes copied from card to host.
+ * @mvr - Store per VRING data structures.
+ * @virtio_bh_work - Work struct used to schedule virtio bottom half handling.
+ * @dd - Virtio device descriptor.
+ * @dc - Virtio device control fields.
+ * @list - List of Virtio devices.
+ * @virtio_db - The doorbell used by the card to interrupt the host.
+ * @virtio_cookie - The cookie returned while requesting interrupts.
+ */
+struct mic_vdev {
+	int virtio_id;
+	wait_queue_head_t waitq;
+	struct mic_device *mdev;
+	int poll_wake;
+	unsigned long out_bytes;
+	unsigned long in_bytes;
+	struct mic_vringh mvr[MIC_MAX_VRINGS];
+	struct work_struct virtio_bh_work;
+	struct mic_device_desc *dd;
+	struct mic_device_ctrl *dc;
+	struct list_head list;
+	int virtio_db;
+	struct mic_irq *virtio_cookie;
+};
+
+void mic_virtio_uninit(struct mic_device *mdev);
+int mic_virtio_add_device(struct mic_vdev *mvdev,
+			void __user *argp);
+void mic_virtio_del_device(struct mic_vdev *mvdev);
+int mic_virtio_config_change(struct mic_vdev *mvdev,
+			void __user *argp);
+int mic_virtio_copy_desc(struct mic_vdev *mvdev,
+	struct mic_copy_desc *request);
+void mic_virtio_reset_devices(struct mic_device *mdev);
+void mic_bh_handler(struct work_struct *work);
+
+/* Helper API to obtain the MIC PCIe device */
+static inline struct device *mic_dev(struct mic_vdev *mvdev)
+{
+	return mvdev->mdev->sdev->parent;
+}
+
+/* Helper API to check if a virtio device is initialized */
+static inline int mic_vdev_inited(struct mic_vdev *mvdev)
+{
+	/* Device has not been created yet */
+	if (!mvdev->dd || !mvdev->dd->type) {
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, -EINVAL);
+		return -EINVAL;
+	}
+
+	/* Device has been removed/deleted */
+	if (mvdev->dd->type == -1) {
+		dev_err(mic_dev(mvdev), "%s %d err %d\n",
+			__func__, __LINE__, -ENODEV);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/* Helper API to check if a virtio device is running */
+static inline bool mic_vdevup(struct mic_vdev *mvdev)
+{
+	return !!mvdev->dd->status;
+}
+#endif
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index d37263fbdca7..33d2b8fe166d 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -242,6 +242,7 @@ header-y += mei.h
 header-y += mempolicy.h
 header-y += meye.h
 header-y += mic_common.h
+header-y += mic_ioctl.h
 header-y += mii.h
 header-y += minix_fs.h
 header-y += mman.h
diff --git a/include/uapi/linux/mic_common.h b/include/uapi/linux/mic_common.h
index a9091e529183..364ac751bd04 100644
--- a/include/uapi/linux/mic_common.h
+++ b/include/uapi/linux/mic_common.h
@@ -21,7 +21,60 @@
 #ifndef __MIC_COMMON_H_
 #define __MIC_COMMON_H_
 
-#include <linux/types.h>
+#include <linux/virtio_ring.h>
+
+#ifndef __KERNEL__
+#define ALIGN(a, x)	(((a) + (x) - 1) & ~((x) - 1))
+#define __aligned(x)	__attribute__ ((aligned(x)))
+#endif
+
+#define mic_aligned_size(x) ALIGN(sizeof(x), 8)
+
+/**
+ * struct mic_device_desc: Virtio device information shared between the
+ * virtio driver and userspace backend
+ *
+ * @type: Device type: console/network/disk etc.  Type 0/-1 terminates.
+ * @num_vq: Number of virtqueues.
+ * @feature_len: Number of bytes of feature bits.  Multiply by 2: one for
+   host features and one for guest acknowledgements.
+ * @config_len: Number of bytes of the config array after virtqueues.
+ * @status: A status byte, written by the Guest.
+ * @config: Start of the following variable length config.
+ */
+struct mic_device_desc {
+	__s8 type;
+	__u8 num_vq;
+	__u8 feature_len;
+	__u8 config_len;
+	__u8 status;
+	__u64 config[0];
+} __aligned(8);
+
+/**
+ * struct mic_device_ctrl: Per virtio device information in the device page
+ * used internally by the host and card side drivers.
+ *
+ * @vdev: Used for storing MIC vdev information by the guest.
+ * @config_change: Set to 1 by host when a config change is requested.
+ * @vdev_reset: Set to 1 by guest to indicate virtio device has been reset.
+ * @guest_ack: Set to 1 by guest to ack a command.
+ * @host_ack: Set to 1 by host to ack a command.
+ * @used_address_updated: Set to 1 by guest when the used address should be
+ * updated.
+ * @c2h_vdev_db: The doorbell number to be used by guest. Set by host.
+ * @h2c_vdev_db: The doorbell number to be used by host. Set by guest.
+ */
+struct mic_device_ctrl {
+	__u64 vdev;
+	__u8 config_change;
+	__u8 vdev_reset;
+	__u8 guest_ack;
+	__u8 host_ack;
+	__u8 used_address_updated;
+	__s8 c2h_vdev_db;
+	__s8 h2c_vdev_db;
+} __aligned(8);
 
 /**
  * struct mic_bootparam: Virtio device independent information in device page
@@ -42,6 +95,117 @@ struct mic_bootparam {
 	__u8 shutdown_card;
 } __aligned(8);
 
+/**
+ * struct mic_device_page: High level representation of the device page
+ *
+ * @bootparam: The bootparam structure is used for sharing information and
+ * status updates between MIC host and card drivers.
+ * @desc: Array of MIC virtio device descriptors.
+ */
+struct mic_device_page {
+	struct mic_bootparam bootparam;
+	struct mic_device_desc desc[0];
+};
+/**
+ * struct mic_vqconfig: This is how we expect the device configuration field
+ * for a virtqueue to be laid out in config space.
+ *
+ * @address: Guest/MIC physical address of the virtio ring
+ * (avail and desc rings)
+ * @used_address: Guest/MIC physical address of the used ring
+ * @num: The number of entries in the virtio_ring
+ */
+struct mic_vqconfig {
+	__u64 address;
+	__u64 used_address;
+	__u16 num;
+} __aligned(8);
+
+/*
+ * The alignment to use between consumer and producer parts of vring.
+ * This is pagesize for historical reasons.
+ */
+#define MIC_VIRTIO_RING_ALIGN		4096
+
+#define MIC_MAX_VRINGS			4
+#define MIC_VRING_ENTRIES		128
+
+/*
+ * Max vring entries (power of 2) to ensure desc and avail rings
+ * fit in a single page
+ */
+#define MIC_MAX_VRING_ENTRIES		128
+
+/**
+ * Max size of the desc block in bytes: includes:
+ *	- struct mic_device_desc
+ *	- struct mic_vqconfig (num_vq of these)
+ *	- host and guest features
+ *	- virtio device config space
+ */
+#define MIC_MAX_DESC_BLK_SIZE		256
+
+/**
+ * struct _mic_vring_info - Host vring info exposed to userspace backend
+ * for the avail index and magic for the card.
+ *
+ * @avail_idx: host avail idx
+ * @magic: A magic debug cookie.
+ */
+struct _mic_vring_info {
+	__u16 avail_idx;
+	int magic;
+};
+
+/**
+ * struct mic_vring - Vring information.
+ *
+ * @vr: The virtio ring.
+ * @info: Host vring information exposed to the userspace backend for the
+ * avail index and magic for the card.
+ * @va: The va for the buffer allocated for vr and info.
+ * @len: The length of the buffer required for allocating vr and info.
+ */
+struct mic_vring {
+	struct vring vr;
+	struct _mic_vring_info *info;
+	void *va;
+	int len;
+};
+
+#define mic_aligned_desc_size(d) ALIGN(mic_desc_size(d), 8)
+
+#ifndef INTEL_MIC_CARD
+static inline unsigned mic_desc_size(const struct mic_device_desc *desc)
+{
+	return mic_aligned_size(*desc)
+		+ desc->num_vq * mic_aligned_size(struct mic_vqconfig)
+		+ desc->feature_len * 2
+		+ desc->config_len;
+}
+
+static inline struct mic_vqconfig *
+mic_vq_config(const struct mic_device_desc *desc)
+{
+	return (struct mic_vqconfig *)(desc + 1);
+}
+
+static inline __u8 *mic_vq_features(const struct mic_device_desc *desc)
+{
+	return (__u8 *)(mic_vq_config(desc) + desc->num_vq);
+}
+
+static inline __u8 *mic_vq_configspace(const struct mic_device_desc *desc)
+{
+	return mic_vq_features(desc) + desc->feature_len * 2;
+}
+static inline unsigned mic_total_desc_size(struct mic_device_desc *desc)
+{
+	return mic_aligned_desc_size(desc) +
+		mic_aligned_size(struct mic_device_ctrl);
+}
+#endif
+
 /* Device page size */
 #define MIC_DP_SIZE 4096
 
diff --git a/include/uapi/linux/mic_ioctl.h b/include/uapi/linux/mic_ioctl.h
new file mode 100644
index 000000000000..0e6cbf3e5292
--- /dev/null
+++ b/include/uapi/linux/mic_ioctl.h
@@ -0,0 +1,74 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Host driver.
+ *
+ */
+#ifndef _MIC_IOCTL_H_
+#define _MIC_IOCTL_H_
+
+/*
+ * mic_copy - MIC virtio descriptor copy.
+ *
+ * @iov: An array of IOVEC structures containing user space buffers.
+ * @iovcnt: Number of IOVEC structures in iov.
+ * @vr_idx: The vring index.
+ * @update_used: A non zero value results in used index being updated.
+ * @out_len: The aggregate of the total length written to or read from
+ *	the virtio device.
+ */
+struct mic_copy_desc {
+#ifdef __KERNEL__
+	struct iovec __user *iov;
+#else
+	struct iovec *iov;
+#endif
+	int iovcnt;
+	__u8 vr_idx;
+	__u8 update_used;
+	__u32 out_len;
+};
+
+/*
+ * Add a new virtio device
+ * The (struct mic_device_desc *) pointer points to a device page entry
+ *	for the virtio device consisting of:
+ *	- struct mic_device_desc
+ *	- struct mic_vqconfig (num_vq of these)
+ *	- host and guest features
+ *	- virtio device config space
+ * The total size referenced by the pointer should equal the size returned
+ * by desc_size() in mic_common.h
+ */
+#define MIC_VIRTIO_ADD_DEVICE _IOWR('s', 1, struct mic_device_desc *)
+
+/*
+ * Copy the number of entries in the iovec and update the used index
+ * if requested by the user.
+ */
+#define MIC_VIRTIO_COPY_DESC	_IOWR('s', 2, struct mic_copy_desc *)
+
+/*
+ * Notify virtio device of a config change
+ * The (__u8 *) pointer points to config space values for the device
+ * as they should be written into the device page. The total size
+ * referenced by the pointer should equal the config_len field of struct
+ * mic_device_desc.
+ */
+#define MIC_VIRTIO_CONFIG_CHANGE _IOWR('s', 5, __u8 *)
+
+#endif
-- 
cgit v1.2.3


From 09a2c73ddfc7f173237fc7209a65b34dd5bcb5ed Mon Sep 17 00:00:00 2001
From: Yijing Wang <wangyijing@huawei.com>
Date: Thu, 12 Sep 2013 16:20:00 +0800
Subject: PCI: Remove unused PCI_MSIX_FLAGS_BIRMASK definition

PCI_MSIX_FLAGS_BIRMASK has been replaced by PCI_MSIX_TABLE_BIR for better
readability.  Now no one uses it, remove it.  No functional change.

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/uapi/linux/pci_regs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index baa7852468ef..f0509148111d 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -319,7 +319,6 @@
 #define PCI_MSIX_PBA		8	/* Pending Bit Array offset */
 #define  PCI_MSIX_PBA_BIR	0x00000007 /* BAR index */
 #define  PCI_MSIX_PBA_OFFSET	0xfffffff8 /* Offset into specified BAR */
-#define  PCI_MSIX_FLAGS_BIRMASK	(7 << 0)   /* deprecated */
 #define PCI_CAP_MSIX_SIZEOF	12	/* size of MSIX registers */
 
 /* MSI-X entry's format */
-- 
cgit v1.2.3


From b019ba959fc4600a646bb9d732149b1b0056f88a Mon Sep 17 00:00:00 2001
From: Sudeep Dutt <sudeep.dutt@intel.com>
Date: Fri, 27 Sep 2013 09:50:06 -0700
Subject: misc: mic: fix a warning in the IOCTL header file.

The following warning from mic_ioctl.h is fixed via this patch:
found __[us]{8,16,32,64} type without #include <linux/types.h>

Reported-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Dasaratharaman Chandramouli <dasaratharaman.chandramouli@intel.com>
Signed-off-by: Nikhil Rao <nikhil.rao@intel.com>
Signed-off-by: Harshavardhan R Kharche <harshavardhan.r.kharche@intel.com>
Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/mic_ioctl.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mic_ioctl.h b/include/uapi/linux/mic_ioctl.h
index 0e6cbf3e5292..7fabba5059cf 100644
--- a/include/uapi/linux/mic_ioctl.h
+++ b/include/uapi/linux/mic_ioctl.h
@@ -21,6 +21,8 @@
 #ifndef _MIC_IOCTL_H_
 #define _MIC_IOCTL_H_
 
+#include <linux/types.h>
+
 /*
  * mic_copy - MIC virtio descriptor copy.
  *
-- 
cgit v1.2.3


From 5e04c0c38c90f1f11a0e87800e4c22d4aba1d733 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 30 Sep 2013 07:57:18 +0200
Subject: netfilter: ipset: Introduce new operation to get both setname and
 family

ip[6]tables set match and SET target need to know the family of the set
in order to reject adding rules which refer to a set with a non-mathcing
family. Currently such rules are silently accepted and then ignored
instead of generating a clear error message to the user, which is not
helpful.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/uapi/linux/netfilter/ipset/ip_set.h |  8 ++++++++
 net/netfilter/ipset/ip_set_core.c           | 17 +++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index 8024cdf13b70..2b61ac44dcc1 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -250,6 +250,14 @@ struct ip_set_req_get_set {
 #define IP_SET_OP_GET_BYINDEX	0x00000007	/* Get set name by index */
 /* Uses ip_set_req_get_set */
 
+#define IP_SET_OP_GET_FNAME	0x00000008	/* Get set index and family */
+struct ip_set_req_get_set_family {
+	unsigned int op;
+	unsigned int version;
+	unsigned int family;
+	union ip_set_name_index set;
+};
+
 #define IP_SET_OP_VERSION	0x00000100	/* Ask kernel version */
 struct ip_set_req_version {
 	unsigned int op;
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index f2e30fb31e78..428c30a8586f 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1788,6 +1788,23 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 		nfnl_unlock(NFNL_SUBSYS_IPSET);
 		goto copy;
 	}
+	case IP_SET_OP_GET_FNAME: {
+		struct ip_set_req_get_set_family *req_get = data;
+		ip_set_id_t id;
+
+		if (*len != sizeof(struct ip_set_req_get_set_family)) {
+			ret = -EINVAL;
+			goto done;
+		}
+		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+		nfnl_lock(NFNL_SUBSYS_IPSET);
+		find_set_and_id(req_get->set.name, &id);
+		req_get->set.index = id;
+		if (id != IPSET_INVALID_ID)
+			req_get->family = nfnl_set(id)->family;
+		nfnl_unlock(NFNL_SUBSYS_IPSET);
+		goto copy;
+	}
 	case IP_SET_OP_GET_BYINDEX: {
 		struct ip_set_req_get_set *req_get = data;
 		struct ip_set *set;
-- 
cgit v1.2.3


From 68b63f08d22f23161c43cd2417104aa213ff877f Mon Sep 17 00:00:00 2001
From: Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
Date: Sun, 22 Sep 2013 20:56:30 +0200
Subject: netfilter: ipset: Support comments for ipset entries in the core.

This adds the core support for having comments on ipset entries.

The comments are stored as standard null-terminated strings in
dynamically allocated memory after being passed to the kernel. As a
result of this, code has been added to the generic destroy function to
iterate all extensions and call that extension's destroy task if the set
has that extension activated, and if such a task is defined.

Signed-off-by: Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h         | 51 +++++++++++++++++++----
 include/linux/netfilter/ipset/ip_set_comment.h | 57 ++++++++++++++++++++++++++
 include/uapi/linux/netfilter/ipset/ip_set.h    |  8 +++-
 net/netfilter/ipset/ip_set_core.c              | 14 +++++++
 4 files changed, 121 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/netfilter/ipset/ip_set_comment.h

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 6372ee224fe8..407f84df6a47 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -53,6 +53,8 @@ enum ip_set_extension {
 	IPSET_EXT_TIMEOUT = (1 << IPSET_EXT_BIT_TIMEOUT),
 	IPSET_EXT_BIT_COUNTER = 1,
 	IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER),
+	IPSET_EXT_BIT_COMMENT = 2,
+	IPSET_EXT_COMMENT = (1 << IPSET_EXT_BIT_COMMENT),
 	/* Mark set with an extension which needs to call destroy */
 	IPSET_EXT_BIT_DESTROY = 7,
 	IPSET_EXT_DESTROY = (1 << IPSET_EXT_BIT_DESTROY),
@@ -60,11 +62,13 @@ enum ip_set_extension {
 
 #define SET_WITH_TIMEOUT(s)	((s)->extensions & IPSET_EXT_TIMEOUT)
 #define SET_WITH_COUNTER(s)	((s)->extensions & IPSET_EXT_COUNTER)
+#define SET_WITH_COMMENT(s)	((s)->extensions & IPSET_EXT_COMMENT)
 
 /* Extension id, in size order */
 enum ip_set_ext_id {
 	IPSET_EXT_ID_COUNTER = 0,
 	IPSET_EXT_ID_TIMEOUT,
+	IPSET_EXT_ID_COMMENT,
 	IPSET_EXT_ID_MAX,
 };
 
@@ -85,6 +89,7 @@ struct ip_set_ext {
 	u64 packets;
 	u64 bytes;
 	u32 timeout;
+	char *comment;
 };
 
 struct ip_set_counter {
@@ -92,20 +97,19 @@ struct ip_set_counter {
 	atomic64_t packets;
 };
 
-struct ip_set;
+struct ip_set_comment {
+	char *str;
+};
 
-static inline void
-ip_set_ext_destroy(struct ip_set *set, void *data)
-{
-	/* Check that the extension is enabled for the set and
-	 * call it's destroy function for its extension part in data.
-	 */
-}
+struct ip_set;
 
 #define ext_timeout(e, s)	\
 (unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT])
 #define ext_counter(e, s)	\
 (struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER])
+#define ext_comment(e, s)	\
+(struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT])
+
 
 typedef int (*ipset_adtfn)(struct ip_set *set, void *value,
 			   const struct ip_set_ext *ext,
@@ -222,6 +226,36 @@ struct ip_set {
 	void *data;
 };
 
+static inline void
+ip_set_ext_destroy(struct ip_set *set, void *data)
+{
+	/* Check that the extension is enabled for the set and
+	 * call it's destroy function for its extension part in data.
+	 */
+	if (SET_WITH_COMMENT(set))
+		ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy(
+			ext_comment(data, set));
+}
+
+static inline int
+ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
+{
+	u32 cadt_flags = 0;
+
+	if (SET_WITH_TIMEOUT(set))
+		if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+					   htonl(set->timeout))))
+			return -EMSGSIZE;
+	if (SET_WITH_COUNTER(set))
+		cadt_flags |= IPSET_FLAG_WITH_COUNTERS;
+	if (SET_WITH_COMMENT(set))
+		cadt_flags |= IPSET_FLAG_WITH_COMMENT;
+
+	if (!cadt_flags)
+		return 0;
+	return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags));
+}
+
 static inline void
 ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
 {
@@ -425,6 +459,7 @@ bitmap_bytes(u32 a, u32 b)
 }
 
 #include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_comment.h>
 
 #define IP_SET_INIT_KEXT(skb, opt, set)			\
 	{ .bytes = (skb)->len, .packets = 1,		\
diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h
new file mode 100644
index 000000000000..21217ea008d7
--- /dev/null
+++ b/include/linux/netfilter/ipset/ip_set_comment.h
@@ -0,0 +1,57 @@
+#ifndef _IP_SET_COMMENT_H
+#define _IP_SET_COMMENT_H
+
+/* Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifdef __KERNEL__
+
+static inline char*
+ip_set_comment_uget(struct nlattr *tb)
+{
+	return nla_data(tb);
+}
+
+static inline void
+ip_set_init_comment(struct ip_set_comment *comment,
+		    const struct ip_set_ext *ext)
+{
+	size_t len = ext->comment ? strlen(ext->comment) : 0;
+
+	if (unlikely(comment->str)) {
+		kfree(comment->str);
+		comment->str = NULL;
+	}
+	if (!len)
+		return;
+	if (unlikely(len > IPSET_MAX_COMMENT_SIZE))
+		len = IPSET_MAX_COMMENT_SIZE;
+	comment->str = kzalloc(len + 1, GFP_ATOMIC);
+	if (unlikely(!comment->str))
+		return;
+	strlcpy(comment->str, ext->comment, len + 1);
+}
+
+static inline int
+ip_set_put_comment(struct sk_buff *skb, struct ip_set_comment *comment)
+{
+	if (!comment->str)
+		return 0;
+	return nla_put_string(skb, IPSET_ATTR_COMMENT, comment->str);
+}
+
+static inline void
+ip_set_comment_free(struct ip_set_comment *comment)
+{
+	if (unlikely(!comment->str))
+		return;
+	kfree(comment->str);
+	comment->str = NULL;
+}
+
+#endif
+#endif
diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index 2b61ac44dcc1..25d3b2f79c02 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -10,12 +10,14 @@
 #ifndef _UAPI_IP_SET_H
 #define _UAPI_IP_SET_H
 
-
 #include <linux/types.h>
 
 /* The protocol version */
 #define IPSET_PROTOCOL		6
 
+/* The maximum permissible comment length we will accept over netlink */
+#define IPSET_MAX_COMMENT_SIZE	255
+
 /* The max length of strings including NUL: set and type identifiers */
 #define IPSET_MAXNAMELEN	32
 
@@ -110,6 +112,7 @@ enum {
 	IPSET_ATTR_IFACE,
 	IPSET_ATTR_BYTES,
 	IPSET_ATTR_PACKETS,
+	IPSET_ATTR_COMMENT,
 	__IPSET_ATTR_ADT_MAX,
 };
 #define IPSET_ATTR_ADT_MAX	(__IPSET_ATTR_ADT_MAX - 1)
@@ -140,6 +143,7 @@ enum ipset_errno {
 	IPSET_ERR_IPADDR_IPV4,
 	IPSET_ERR_IPADDR_IPV6,
 	IPSET_ERR_COUNTER,
+	IPSET_ERR_COMMENT,
 
 	/* Type specific error codes */
 	IPSET_ERR_TYPE_SPECIFIC = 4352,
@@ -176,6 +180,8 @@ enum ipset_cadt_flags {
 	IPSET_FLAG_NOMATCH	= (1 << IPSET_FLAG_BIT_NOMATCH),
 	IPSET_FLAG_BIT_WITH_COUNTERS = 3,
 	IPSET_FLAG_WITH_COUNTERS = (1 << IPSET_FLAG_BIT_WITH_COUNTERS),
+	IPSET_FLAG_BIT_WITH_COMMENT = 4,
+	IPSET_FLAG_WITH_COMMENT = (1 << IPSET_FLAG_BIT_WITH_COMMENT),
 	IPSET_FLAG_CADT_MAX	= 15,
 };
 
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index f35afed3814f..3bf9a3d29dff 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -315,6 +315,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
 }
 EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
 
+typedef void (*destroyer)(void *);
 /* ipset data extension types, in size order */
 
 const struct ip_set_ext_type ip_set_extensions[] = {
@@ -329,6 +330,13 @@ const struct ip_set_ext_type ip_set_extensions[] = {
 		.len	= sizeof(unsigned long),
 		.align	= __alignof__(unsigned long),
 	},
+	[IPSET_EXT_ID_COMMENT] = {
+		.type	 = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY,
+		.flag	 = IPSET_FLAG_WITH_COMMENT,
+		.len	 = sizeof(struct ip_set_comment),
+		.align	 = __alignof__(struct ip_set_comment),
+		.destroy = (destroyer) ip_set_comment_free,
+	},
 };
 EXPORT_SYMBOL_GPL(ip_set_extensions);
 
@@ -380,6 +388,12 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
 			ext->packets = be64_to_cpu(nla_get_be64(
 						   tb[IPSET_ATTR_PACKETS]));
 	}
+	if (tb[IPSET_ATTR_COMMENT]) {
+		if (!(set->extensions & IPSET_EXT_COMMENT))
+			return -IPSET_ERR_COMMENT;
+		ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]);
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ip_set_get_extensions);
-- 
cgit v1.2.3


From 91cb498e6a34b429a032f8cfbb57dde28cd20e0c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 4 Sep 2013 20:57:48 +0200
Subject: netfilter: cttimeout: allow to set/get default protocol timeouts

Default timeouts are currently set via proc/sysctl interface, the
typical pattern is a file name like:

/proc/sys/net/netfilter/nf_conntrack_PROTOCOL_timeout_STATE

This results in one entry per default protocol state timeout.
This patch simplifies this by allowing to set default protocol
timeouts via cttimeout netlink interface.

This should allow us to get rid of the existing proc/sysctl code
in the midterm.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_cttimeout.h |   2 +
 net/netfilter/nfnetlink_cttimeout.c                | 161 ++++++++++++++++++++-
 2 files changed, 155 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
index a2810a7c5e30..1ab0b97b3a1e 100644
--- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
+++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
@@ -6,6 +6,8 @@ enum ctnl_timeout_msg_types {
 	IPCTNL_MSG_TIMEOUT_NEW,
 	IPCTNL_MSG_TIMEOUT_GET,
 	IPCTNL_MSG_TIMEOUT_DELETE,
+	IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
+	IPCTNL_MSG_TIMEOUT_DEFAULT_GET,
 
 	IPCTNL_MSG_TIMEOUT_MAX
 };
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 50580494148d..476accd17145 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -49,10 +49,8 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
 };
 
 static int
-ctnl_timeout_parse_policy(struct ctnl_timeout *timeout,
-			  struct nf_conntrack_l4proto *l4proto,
-			  struct net *net,
-			  const struct nlattr *attr)
+ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto,
+			  struct net *net, const struct nlattr *attr)
 {
 	int ret = 0;
 
@@ -64,8 +62,7 @@ ctnl_timeout_parse_policy(struct ctnl_timeout *timeout,
 		if (ret < 0)
 			return ret;
 
-		ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net,
-							  &timeout->data);
+		ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
 	}
 	return ret;
 }
@@ -123,7 +120,8 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
 				goto err_proto_put;
 			}
 
-			ret = ctnl_timeout_parse_policy(matching, l4proto, net,
+			ret = ctnl_timeout_parse_policy(&matching->data,
+							l4proto, net,
 							cda[CTA_TIMEOUT_DATA]);
 			return ret;
 		}
@@ -138,7 +136,7 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
 		goto err_proto_put;
 	}
 
-	ret = ctnl_timeout_parse_policy(timeout, l4proto, net,
+	ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net,
 					cda[CTA_TIMEOUT_DATA]);
 	if (ret < 0)
 		goto err;
@@ -342,6 +340,147 @@ cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb,
 	return ret;
 }
 
+static int
+cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb,
+		      const struct nlmsghdr *nlh,
+		      const struct nlattr * const cda[])
+{
+	__u16 l3num;
+	__u8 l4num;
+	struct nf_conntrack_l4proto *l4proto;
+	struct net *net = sock_net(skb->sk);
+	unsigned int *timeouts;
+	int ret;
+
+	if (!cda[CTA_TIMEOUT_L3PROTO] ||
+	    !cda[CTA_TIMEOUT_L4PROTO] ||
+	    !cda[CTA_TIMEOUT_DATA])
+		return -EINVAL;
+
+	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+	/* This protocol is not supported, skip. */
+	if (l4proto->l4proto != l4num) {
+		ret = -EOPNOTSUPP;
+		goto err;
+	}
+
+	timeouts = l4proto->get_timeouts(net);
+
+	ret = ctnl_timeout_parse_policy(timeouts, l4proto, net,
+					cda[CTA_TIMEOUT_DATA]);
+	if (ret < 0)
+		goto err;
+
+	nf_ct_l4proto_put(l4proto);
+	return 0;
+err:
+	nf_ct_l4proto_put(l4proto);
+	return ret;
+}
+
+static int
+cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
+			    u32 seq, u32 type, int event,
+			    struct nf_conntrack_l4proto *l4proto)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned int flags = portid ? NLM_F_MULTI : 0;
+
+	event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto nlmsg_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = AF_UNSPEC;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) ||
+	    nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
+		goto nla_put_failure;
+
+	if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
+		struct nlattr *nest_parms;
+		unsigned int *timeouts = l4proto->get_timeouts(net);
+		int ret;
+
+		nest_parms = nla_nest_start(skb,
+					    CTA_TIMEOUT_DATA | NLA_F_NESTED);
+		if (!nest_parms)
+			goto nla_put_failure;
+
+		ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts);
+		if (ret < 0)
+			goto nla_put_failure;
+
+		nla_nest_end(skb, nest_parms);
+	}
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb,
+				 const struct nlmsghdr *nlh,
+				 const struct nlattr * const cda[])
+{
+	__u16 l3num;
+	__u8 l4num;
+	struct nf_conntrack_l4proto *l4proto;
+	struct net *net = sock_net(skb->sk);
+	struct sk_buff *skb2;
+	int ret, err;
+
+	if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO])
+		return -EINVAL;
+
+	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+	/* This protocol is not supported, skip. */
+	if (l4proto->l4proto != l4num) {
+		err = -EOPNOTSUPP;
+		goto err;
+	}
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid,
+					  nlh->nlmsg_seq,
+					  NFNL_MSG_TYPE(nlh->nlmsg_type),
+					  IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
+					  l4proto);
+	if (ret <= 0) {
+		kfree_skb(skb2);
+		err = -ENOMEM;
+		goto err;
+	}
+	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	if (ret > 0)
+		ret = 0;
+
+	/* this avoids a loop in nfnetlink. */
+	return ret == -EAGAIN ? -ENOBUFS : ret;
+err:
+	nf_ct_l4proto_put(l4proto);
+	return err;
+}
+
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 static struct ctnl_timeout *ctnl_timeout_find_get(const char *name)
 {
@@ -384,6 +523,12 @@ static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
 	[IPCTNL_MSG_TIMEOUT_DELETE]	= { .call = cttimeout_del_timeout,
 					    .attr_count = CTA_TIMEOUT_MAX,
 					    .policy = cttimeout_nla_policy },
+	[IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set,
+					    .attr_count = CTA_TIMEOUT_MAX,
+					    .policy = cttimeout_nla_policy },
+	[IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get,
+					    .attr_count = CTA_TIMEOUT_MAX,
+					    .policy = cttimeout_nla_policy },
 };
 
 static const struct nfnetlink_subsystem cttimeout_subsys = {
-- 
cgit v1.2.3


From 42c4e0c77ac91505ab94284b14025e3a0865c0a5 Mon Sep 17 00:00:00 2001
From: Anup Patel <anup.patel@linaro.org>
Date: Mon, 30 Sep 2013 14:20:07 +0530
Subject: ARM/ARM64: KVM: Implement KVM_ARM_PREFERRED_TARGET ioctl

For implementing CPU=host, we need a mechanism for querying
preferred VCPU target type on underlying Host.

This patch implements KVM_ARM_PREFERRED_TARGET vm ioctl which
returns struct kvm_vcpu_init instance containing information
about preferred VCPU target type and target specific features
available for it.

Signed-off-by: Anup Patel <anup.patel@linaro.org>
Signed-off-by: Pranavkumar Sawargaonkar <pranavkumar@linaro.org>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/kvm/arm.c       | 13 +++++++++++++
 include/uapi/linux/kvm.h |  1 +
 2 files changed, 14 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9c697db2787e..cc5adb9349ef 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -797,6 +797,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			return -EFAULT;
 		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
 	}
+	case KVM_ARM_PREFERRED_TARGET: {
+		int err;
+		struct kvm_vcpu_init init;
+
+		err = kvm_vcpu_preferred_target(&init);
+		if (err)
+			return err;
+
+		if (copy_to_user(argp, &init, sizeof(init)))
+			return -EFAULT;
+
+		return 0;
+	}
 	default:
 		return -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 99c25338ede8..e32e776f20c0 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1012,6 +1012,7 @@ struct kvm_s390_ucas_mapping {
 /* VM is being stopped by host */
 #define KVM_KVMCLOCK_CTRL	  _IO(KVMIO,   0xad)
 #define KVM_ARM_VCPU_INIT	  _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
+#define KVM_ARM_PREFERRED_TARGET  _IOR(KVMIO,  0xaf, struct kvm_vcpu_init)
 #define KVM_GET_REG_LIST	  _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
-- 
cgit v1.2.3


From 32819dc1834866cb9547cb75f81af9edd58d33cd Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@redhat.com>
Date: Wed, 2 Oct 2013 13:39:25 +0200
Subject: bonding: modify the old and add new xmit hash policies

This patch adds two new hash policy modes which use skb_flow_dissect:
3 - Encapsulated layer 2+3
4 - Encapsulated layer 3+4
There should be a good improvement for tunnel users in those modes.
It also changes the old hash functions to:
hash ^= (__force u32)flow.dst ^ (__force u32)flow.src;
hash ^= (hash >> 16);
hash ^= (hash >> 8);

Where hash will be initialized either to L2 hash, that is
SRCMAC[5] XOR DSTMAC[5], or to flow->ports which should be extracted
from the upper layer. Flow's dst and src are also extracted based on the
xmit policy either directly from the buffer or by using skb_flow_dissect,
but in both cases if the protocol is IPv6 then dst and src are obtained by
ipv6_addr_hash() on the real addresses. In case of a non-dissectable
packet, the algorithms fall back to L2 hashing.
The bond_set_mode_ops() function is now obsolete and thus deleted
because it was used only to set the proper hash policy. Also we trim a
pointer from struct bonding because we no longer need to keep the hash
function, now there's only a single hash function - bond_xmit_hash that
works based on bond->params.xmit_policy.

The hash function and skb_flow_dissect were suggested by Eric Dumazet.
The layer names were suggested by Andy Gospodarek, because I suck at
semantics.

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_3ad.c   |   2 +-
 drivers/net/bonding/bond_main.c  | 197 ++++++++++++++-------------------------
 drivers/net/bonding/bond_sysfs.c |   2 -
 drivers/net/bonding/bonding.h    |   3 +-
 include/uapi/linux/if_bonding.h  |   2 +
 5 files changed, 72 insertions(+), 134 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index c62606a67f6a..ea3e64e22e22 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2403,7 +2403,7 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 		goto out;
 	}
 
-	slave_agg_no = bond->xmit_hash_policy(skb, slaves_in_agg);
+	slave_agg_no = bond_xmit_hash(bond, skb, slaves_in_agg);
 	first_ok_slave = NULL;
 
 	bond_for_each_slave(bond, slave, iter) {
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index fe8a94f9d7db..dfb4f6dd5de0 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -78,6 +78,7 @@
 #include <net/netns/generic.h>
 #include <net/pkt_sched.h>
 #include <linux/rculist.h>
+#include <net/flow_keys.h>
 #include "bonding.h"
 #include "bond_3ad.h"
 #include "bond_alb.h"
@@ -159,7 +160,8 @@ MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on
 module_param(xmit_hash_policy, charp, 0);
 MODULE_PARM_DESC(xmit_hash_policy, "balance-xor and 802.3ad hashing method; "
 				   "0 for layer 2 (default), 1 for layer 3+4, "
-				   "2 for layer 2+3");
+				   "2 for layer 2+3, 3 for encap layer 2+3, "
+				   "4 for encap layer 3+4");
 module_param(arp_interval, int, 0);
 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
 module_param_array(arp_ip_target, charp, NULL, 0);
@@ -217,6 +219,8 @@ const struct bond_parm_tbl xmit_hashtype_tbl[] = {
 {	"layer2",		BOND_XMIT_POLICY_LAYER2},
 {	"layer3+4",		BOND_XMIT_POLICY_LAYER34},
 {	"layer2+3",		BOND_XMIT_POLICY_LAYER23},
+{	"encap2+3",		BOND_XMIT_POLICY_ENCAP23},
+{	"encap3+4",		BOND_XMIT_POLICY_ENCAP34},
 {	NULL,			-1},
 };
 
@@ -3035,99 +3039,85 @@ static struct notifier_block bond_netdev_notifier = {
 
 /*---------------------------- Hashing Policies -----------------------------*/
 
-/*
- * Hash for the output device based upon layer 2 data
- */
-static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
+/* L2 hash helper */
+static inline u32 bond_eth_hash(struct sk_buff *skb)
 {
 	struct ethhdr *data = (struct ethhdr *)skb->data;
 
 	if (skb_headlen(skb) >= offsetof(struct ethhdr, h_proto))
-		return (data->h_dest[5] ^ data->h_source[5]) % count;
+		return data->h_dest[5] ^ data->h_source[5];
 
 	return 0;
 }
 
-/*
- * Hash for the output device based upon layer 2 and layer 3 data. If
- * the packet is not IP, fall back on bond_xmit_hash_policy_l2()
- */
-static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
+/* Extract the appropriate headers based on bond's xmit policy */
+static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
+			      struct flow_keys *fk)
 {
-	const struct ethhdr *data;
+	const struct ipv6hdr *iph6;
 	const struct iphdr *iph;
-	const struct ipv6hdr *ipv6h;
-	u32 v6hash;
-	const __be32 *s, *d;
+	int noff, proto = -1;
 
-	if (skb->protocol == htons(ETH_P_IP) &&
-	    pskb_network_may_pull(skb, sizeof(*iph))) {
+	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23)
+		return skb_flow_dissect(skb, fk);
+
+	fk->ports = 0;
+	noff = skb_network_offset(skb);
+	if (skb->protocol == htons(ETH_P_IP)) {
+		if (!pskb_may_pull(skb, noff + sizeof(*iph)))
+			return false;
 		iph = ip_hdr(skb);
-		data = (struct ethhdr *)skb->data;
-		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
-			(data->h_dest[5] ^ data->h_source[5])) % count;
-	} else if (skb->protocol == htons(ETH_P_IPV6) &&
-		   pskb_network_may_pull(skb, sizeof(*ipv6h))) {
-		ipv6h = ipv6_hdr(skb);
-		data = (struct ethhdr *)skb->data;
-		s = &ipv6h->saddr.s6_addr32[0];
-		d = &ipv6h->daddr.s6_addr32[0];
-		v6hash = (s[1] ^ d[1]) ^ (s[2] ^ d[2]) ^ (s[3] ^ d[3]);
-		v6hash ^= (v6hash >> 24) ^ (v6hash >> 16) ^ (v6hash >> 8);
-		return (v6hash ^ data->h_dest[5] ^ data->h_source[5]) % count;
-	}
-
-	return bond_xmit_hash_policy_l2(skb, count);
+		fk->src = iph->saddr;
+		fk->dst = iph->daddr;
+		noff += iph->ihl << 2;
+		if (!ip_is_fragment(iph))
+			proto = iph->protocol;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (!pskb_may_pull(skb, noff + sizeof(*iph6)))
+			return false;
+		iph6 = ipv6_hdr(skb);
+		fk->src = (__force __be32)ipv6_addr_hash(&iph6->saddr);
+		fk->dst = (__force __be32)ipv6_addr_hash(&iph6->daddr);
+		noff += sizeof(*iph6);
+		proto = iph6->nexthdr;
+	} else {
+		return false;
+	}
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0)
+		fk->ports = skb_flow_get_ports(skb, noff, proto);
+
+	return true;
 }
 
-/*
- * Hash for the output device based upon layer 3 and layer 4 data. If
- * the packet is a frag or not TCP or UDP, just use layer 3 data.  If it is
- * altogether not IP, fall back on bond_xmit_hash_policy_l2()
+/**
+ * bond_xmit_hash - generate a hash value based on the xmit policy
+ * @bond: bonding device
+ * @skb: buffer to use for headers
+ * @count: modulo value
+ *
+ * This function will extract the necessary headers from the skb buffer and use
+ * them to generate a hash based on the xmit_policy set in the bonding device
+ * which will be reduced modulo count before returning.
  */
-static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
+int bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, int count)
 {
-	u32 layer4_xor = 0;
-	const struct iphdr *iph;
-	const struct ipv6hdr *ipv6h;
-	const __be32 *s, *d;
-	const __be16 *l4 = NULL;
-	__be16 _l4[2];
-	int noff = skb_network_offset(skb);
-	int poff;
-
-	if (skb->protocol == htons(ETH_P_IP) &&
-	    pskb_may_pull(skb, noff + sizeof(*iph))) {
-		iph = ip_hdr(skb);
-		poff = proto_ports_offset(iph->protocol);
+	struct flow_keys flow;
+	u32 hash;
 
-		if (!ip_is_fragment(iph) && poff >= 0) {
-			l4 = skb_header_pointer(skb, noff + (iph->ihl << 2) + poff,
-						sizeof(_l4), &_l4);
-			if (l4)
-				layer4_xor = ntohs(l4[0] ^ l4[1]);
-		}
-		return (layer4_xor ^
-			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
-	} else if (skb->protocol == htons(ETH_P_IPV6) &&
-		   pskb_may_pull(skb, noff + sizeof(*ipv6h))) {
-		ipv6h = ipv6_hdr(skb);
-		poff = proto_ports_offset(ipv6h->nexthdr);
-		if (poff >= 0) {
-			l4 = skb_header_pointer(skb, noff + sizeof(*ipv6h) + poff,
-						sizeof(_l4), &_l4);
-			if (l4)
-				layer4_xor = ntohs(l4[0] ^ l4[1]);
-		}
-		s = &ipv6h->saddr.s6_addr32[0];
-		d = &ipv6h->daddr.s6_addr32[0];
-		layer4_xor ^= (s[1] ^ d[1]) ^ (s[2] ^ d[2]) ^ (s[3] ^ d[3]);
-		layer4_xor ^= (layer4_xor >> 24) ^ (layer4_xor >> 16) ^
-			       (layer4_xor >> 8);
-		return layer4_xor % count;
-	}
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
+	    !bond_flow_dissect(bond, skb, &flow))
+		return bond_eth_hash(skb) % count;
+
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
+	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23)
+		hash = bond_eth_hash(skb);
+	else
+		hash = (__force u32)flow.ports;
+	hash ^= (__force u32)flow.dst ^ (__force u32)flow.src;
+	hash ^= (hash >> 16);
+	hash ^= (hash >> 8);
 
-	return bond_xmit_hash_policy_l2(skb, count);
+	return hash % count;
 }
 
 /*-------------------------- Device entry points ----------------------------*/
@@ -3721,8 +3711,7 @@ static int bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_d
 	return NETDEV_TX_OK;
 }
 
-/*
- * In bond_xmit_xor() , we determine the output device by using a pre-
+/* In bond_xmit_xor() , we determine the output device by using a pre-
  * determined xmit_hash_policy(), If the selected device is not enabled,
  * find the next active slave.
  */
@@ -3730,8 +3719,7 @@ static int bond_xmit_xor(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
-	bond_xmit_slave_id(bond, skb,
-			   bond->xmit_hash_policy(skb, bond->slave_cnt));
+	bond_xmit_slave_id(bond, skb, bond_xmit_hash(bond, skb, bond->slave_cnt));
 
 	return NETDEV_TX_OK;
 }
@@ -3768,22 +3756,6 @@ static int bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev)
 
 /*------------------------- Device initialization ---------------------------*/
 
-static void bond_set_xmit_hash_policy(struct bonding *bond)
-{
-	switch (bond->params.xmit_policy) {
-	case BOND_XMIT_POLICY_LAYER23:
-		bond->xmit_hash_policy = bond_xmit_hash_policy_l23;
-		break;
-	case BOND_XMIT_POLICY_LAYER34:
-		bond->xmit_hash_policy = bond_xmit_hash_policy_l34;
-		break;
-	case BOND_XMIT_POLICY_LAYER2:
-	default:
-		bond->xmit_hash_policy = bond_xmit_hash_policy_l2;
-		break;
-	}
-}
-
 /*
  * Lookup the slave that corresponds to a qid
  */
@@ -3894,38 +3866,6 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	return ret;
 }
 
-/*
- * set bond mode specific net device operations
- */
-void bond_set_mode_ops(struct bonding *bond, int mode)
-{
-	struct net_device *bond_dev = bond->dev;
-
-	switch (mode) {
-	case BOND_MODE_ROUNDROBIN:
-		break;
-	case BOND_MODE_ACTIVEBACKUP:
-		break;
-	case BOND_MODE_XOR:
-		bond_set_xmit_hash_policy(bond);
-		break;
-	case BOND_MODE_BROADCAST:
-		break;
-	case BOND_MODE_8023AD:
-		bond_set_xmit_hash_policy(bond);
-		break;
-	case BOND_MODE_ALB:
-		/* FALLTHRU */
-	case BOND_MODE_TLB:
-		break;
-	default:
-		/* Should never happen, mode already checked */
-		pr_err("%s: Error: Unknown bonding mode %d\n",
-		       bond_dev->name, mode);
-		break;
-	}
-}
-
 static int bond_ethtool_get_settings(struct net_device *bond_dev,
 				     struct ethtool_cmd *ecmd)
 {
@@ -4027,7 +3967,6 @@ static void bond_setup(struct net_device *bond_dev)
 	ether_setup(bond_dev);
 	bond_dev->netdev_ops = &bond_netdev_ops;
 	bond_dev->ethtool_ops = &bond_ethtool_ops;
-	bond_set_mode_ops(bond, bond->params.mode);
 
 	bond_dev->destructor = bond_destructor;
 
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index e06c644470b1..e9249527e7e7 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -318,7 +318,6 @@ static ssize_t bonding_store_mode(struct device *d,
 	/* don't cache arp_validate between modes */
 	bond->params.arp_validate = BOND_ARP_VALIDATE_NONE;
 	bond->params.mode = new_value;
-	bond_set_mode_ops(bond, bond->params.mode);
 	pr_info("%s: setting mode to %s (%d).\n",
 		bond->dev->name, bond_mode_tbl[new_value].modename,
 		new_value);
@@ -358,7 +357,6 @@ static ssize_t bonding_store_xmit_hash(struct device *d,
 		ret = -EINVAL;
 	} else {
 		bond->params.xmit_policy = new_value;
-		bond_set_mode_ops(bond, bond->params.mode);
 		pr_info("%s: setting xmit hash policy to %s (%d).\n",
 			bond->dev->name,
 			xmit_hashtype_tbl[new_value].modename, new_value);
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 9a26fbd82645..0bd04fbda8e9 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -217,7 +217,6 @@ struct bonding {
 	char     proc_file_name[IFNAMSIZ];
 #endif /* CONFIG_PROC_FS */
 	struct   list_head bond_list;
-	int      (*xmit_hash_policy)(struct sk_buff *, int);
 	u16      rr_tx_counter;
 	struct   ad_bond_info ad_info;
 	struct   alb_bond_info alb_info;
@@ -409,7 +408,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev);
 void bond_mii_monitor(struct work_struct *);
 void bond_loadbalance_arp_mon(struct work_struct *);
 void bond_activebackup_arp_mon(struct work_struct *);
-void bond_set_mode_ops(struct bonding *bond, int mode);
+int bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, int count);
 int bond_parse_parm(const char *mode_arg, const struct bond_parm_tbl *tbl);
 void bond_select_active_slave(struct bonding *bond);
 void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index a17edda8a781..9635a62f6f89 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -91,6 +91,8 @@
 #define BOND_XMIT_POLICY_LAYER2		0 /* layer 2 (MAC only), default */
 #define BOND_XMIT_POLICY_LAYER34	1 /* layer 3+4 (IP ^ (TCP || UDP)) */
 #define BOND_XMIT_POLICY_LAYER23	2 /* layer 2+3 (IP ^ MAC) */
+#define BOND_XMIT_POLICY_ENCAP23	3 /* encapsulated layer 2+3 */
+#define BOND_XMIT_POLICY_ENCAP34	4 /* encapsulated layer 3+4 */
 
 typedef struct ifbond {
 	__s32 bond_mode;
-- 
cgit v1.2.3


From fdfbbd07e91f8fe387140776f3fd94605f0c89e5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 20 Sep 2013 07:40:39 -0700
Subject: perf: Add generic transaction flags

Add a generic qualifier for transaction events, as a new sample
type that returns a flag word. This is particularly useful
for qualifying aborts: to distinguish aborts which happen
due to asynchronous events (like conflicts caused by another
CPU) versus instructions that lead to an abort.

The tuning strategies are very different for those cases,
so it's important to distinguish them easily and early.

Since it's inconvenient and inflexible to filter for this
in the kernel we report all the events out and allow
some post processing in user space.

The flags are based on the Intel TSX events, but should be fairly
generic and mostly applicable to other HTM architectures too. In addition
to various flag words there's also reserved space to report an
program supplied abort code. For TSX this is used to distinguish specific
classes of aborts, like a lock busy abort when doing lock elision.

Flags:

Elision and generic transactions 		   (ELISION vs TRANSACTION)
(HLE vs RTM on TSX; IBM etc.  would likely only use TRANSACTION)
Aborts caused by current thread vs aborts caused by others (SYNC vs ASYNC)
Retryable transaction				   (RETRY)
Conflicts with other threads			   (CONFLICT)
Transaction write capacity overflow		   (CAPACITY WRITE)
Transaction read capacity overflow		   (CAPACITY READ)

Transactions implicitely aborted can also return an abort code.
This can be used to signal specific events to the profiler. A common
case is abort on lock busy in a RTM eliding library (code 0xff)
To handle this case we include the TSX abort code

Common example aborts in TSX would be:

- Data conflict with another thread on memory read.
                                      Flags: TRANSACTION|ASYNC|CONFLICT
- executing a WRMSR in a transaction. Flags: TRANSACTION|SYNC
- HLE transaction in user space is too large
                                      Flags: ELISION|SYNC|CAPACITY-WRITE

The only flag that is somewhat TSX specific is ELISION.

This adds the perf core glue needed for reporting the new flag word out.

v2: Add MEM/MISC
v3: Move transaction to the end
v4: Separate capacity-read/write and remove misc
v5: Remove _SAMPLE. Move abort flags to 32bit. Rename
    transaction to txn
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1379688044-14173-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h      |  5 +++++
 include/uapi/linux/perf_event.h | 25 ++++++++++++++++++++++++-
 kernel/events/core.c            |  6 ++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c8ba627c1d60..2e069d1288df 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -584,6 +584,10 @@ struct perf_sample_data {
 	struct perf_regs_user		regs_user;
 	u64				stack_user_size;
 	u64				weight;
+	/*
+	 * Transaction flags for abort events:
+	 */
+	u64				txn;
 };
 
 static inline void perf_sample_data_init(struct perf_sample_data *data,
@@ -599,6 +603,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
 	data->stack_user_size = 0;
 	data->weight = 0;
 	data->data_src.val = 0;
+	data->txn = 0;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 009a655a5d35..da48837d617d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -136,8 +136,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_WEIGHT			= 1U << 14,
 	PERF_SAMPLE_DATA_SRC			= 1U << 15,
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
+	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 
-	PERF_SAMPLE_MAX = 1U << 17,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 18,		/* non-ABI */
 };
 
 /*
@@ -180,6 +181,28 @@ enum perf_sample_regs_abi {
 	PERF_SAMPLE_REGS_ABI_64		= 2,
 };
 
+/*
+ * Values for the memory transaction event qualifier, mostly for
+ * abort events. Multiple bits can be set.
+ */
+enum {
+	PERF_TXN_ELISION        = (1 << 0), /* From elision */
+	PERF_TXN_TRANSACTION    = (1 << 1), /* From transaction */
+	PERF_TXN_SYNC           = (1 << 2), /* Instruction is related */
+	PERF_TXN_ASYNC          = (1 << 3), /* Instruction not related */
+	PERF_TXN_RETRY          = (1 << 4), /* Retry possible */
+	PERF_TXN_CONFLICT       = (1 << 5), /* Conflict abort */
+	PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */
+	PERF_TXN_CAPACITY_READ  = (1 << 7), /* Capacity read abort */
+
+	PERF_TXN_MAX	        = (1 << 8), /* non-ABI */
+
+	/* bits 32..63 are reserved for the abort code */
+
+	PERF_TXN_ABORT_MASK  = (0xffffffffULL << 32),
+	PERF_TXN_ABORT_SHIFT = 32,
+};
+
 /*
  * The format of the data returned by read() on a perf event fd,
  * as specified by attr.read_format:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b25d65ce7106..c716385f6483 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1201,6 +1201,9 @@ static void perf_event__header_size(struct perf_event *event)
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		size += sizeof(data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		size += sizeof(data->txn);
+
 	event->header_size = size;
 }
 
@@ -4572,6 +4575,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		perf_output_put(handle, data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		perf_output_put(handle, data->txn);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
-- 
cgit v1.2.3


From af190494f9b2e1fb6e1c039e9626c3c334717da1 Mon Sep 17 00:00:00 2001
From: Dasaratharaman Chandramouli <dasaratharaman.chandramouli@intel.com>
Date: Thu, 3 Oct 2013 18:06:23 -0700
Subject: misc: mic: Enable OSPM suspend and resume support.

This patch enables support for OSPM suspend and resume in the MIC
driver. During a host suspend event, the driver performs an
orderly shutdown of the cards if they are online. Upon resume, any
cards that were previously online before suspend are rebooted.
The driver performs an orderly shutdown of the card primarily to
ensure that applications in the card are terminated and mounted
devices are safely un-mounted before the card is powered down in
the event of an OSPM suspend.

The driver makes use of the MIC daemon to accomplish OSPM suspend
and resume. The driver registers a PM notifier per MIC device.
The devices get notified synchronously during PM_SUSPEND_PREPARE and
PM_POST_SUSPEND phases.

During the PM_SUSPEND_PREPARE phase, the driver performs one of the
following three tasks.
1) If the card is 'offline', the driver sets the card to a
   'suspended' state and returns.
2) If the card is 'online', the driver initiates card shutdown by
   setting the card state to suspending. This notifies the MIC
   daemon which invokes shutdown and sets card state to 'suspended'.
   The driver returns after the shutdown is complete.
3) If the card is already being shutdown, possibly by a host user
   space application, the driver sets the card state to 'suspended'
   and returns after the shutdown is complete.

During the PM_POST_SUSPEND phase, the driver simply notifies the
daemon and returns. The daemon boots those cards that were previously
online during the suspend phase.

Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Nikhil Rao <nikhil.rao@intel.com>
Signed-off-by: Harshavardhan R Kharche <harshavardhan.r.kharche@intel.com>
Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com>
Signed-off-by: Dasaratharaman Chandramouli <dasaratharaman.chandramouli@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-mic.txt |  60 +++++++------
 Documentation/mic/mic_overview.txt            |   4 +-
 Documentation/mic/mpssd/mpssd.c               |  27 +++++-
 Documentation/mic/mpssd/mpssd.h               |   1 +
 drivers/misc/mic/host/mic_boot.c              | 120 +++++++++++++++++++++++++-
 drivers/misc/mic/host/mic_device.h            |   8 ++
 drivers/misc/mic/host/mic_main.c              |  63 +++++++++++++-
 drivers/misc/mic/host/mic_sysfs.c             |   7 ++
 include/uapi/linux/mic_common.h               |   2 +
 9 files changed, 257 insertions(+), 35 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-mic.txt b/Documentation/ABI/testing/sysfs-class-mic.txt
index 82cdad3b614a..13f48afc534f 100644
--- a/Documentation/ABI/testing/sysfs-class-mic.txt
+++ b/Documentation/ABI/testing/sysfs-class-mic.txt
@@ -1,6 +1,6 @@
 What:		/sys/class/mic/
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		The mic class directory belongs to Intel MIC devices and
@@ -9,8 +9,8 @@ Description:
 		Integrated Core (MIC) architecture that runs a Linux OS.
 
 What:		/sys/class/mic/mic(x)
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		The directories /sys/class/mic/mic0, /sys/class/mic/mic1 etc.,
@@ -18,33 +18,41 @@ Description:
 		information specific to that MIC device.
 
 What:		/sys/class/mic/mic(x)/family
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		Provides information about the Coprocessor family for an Intel
 		MIC device. For example - "x100"
 
 What:		/sys/class/mic/mic(x)/stepping
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		Provides information about the silicon stepping for an Intel
 		MIC device. For example - "A0" or "B0"
 
 What:		/sys/class/mic/mic(x)/state
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		When read, this entry provides the current state of an Intel
 		MIC device in the context of the card OS. Possible values that
 		will be read are:
-		"offline" - The MIC device is ready to boot the card OS.
+		"offline" - The MIC device is ready to boot the card OS. On
+		reading this entry after an OSPM resume, a "boot" has to be
+		written to this entry if the card was previously shutdown
+		during OSPM suspend.
 		"online" - The MIC device has initiated booting a card OS.
 		"shutting_down" - The card OS is shutting down.
 		"reset_failed" - The MIC device has failed to reset.
+		"suspending" - The MIC device is currently being prepared for
+		suspend. On reading this entry, a "suspend" has to be written
+		to the state sysfs entry to ensure the card is shutdown during
+		OSPM suspend.
+		"suspended" - The MIC device has been suspended.
 
 		When written, this sysfs entry triggers different state change
 		operations depending upon the current state of the card OS.
@@ -54,10 +62,12 @@ Description:
 			sysfs entries.
 		"reset" - Initiates device reset.
 		"shutdown" - Initiates card OS shutdown.
+		"suspend" - Initiates card OS shutdown and also marks the card
+		as suspended.
 
 What:		/sys/class/mic/mic(x)/shutdown_status
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		An Intel MIC device runs a Linux OS during its operation. This
@@ -72,8 +82,8 @@ Description:
 		"restart" - Shutdown because of a restart command.
 
 What:		/sys/class/mic/mic(x)/cmdline
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		An Intel MIC device runs a Linux OS during its operation. Before
@@ -88,8 +98,8 @@ Description:
 		line back to this entry.
 
 What:		/sys/class/mic/mic(x)/firmware
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		When read, this sysfs entry provides the path name under
@@ -98,8 +108,8 @@ Description:
 		firmware image location under /lib/firmware/.
 
 What:		/sys/class/mic/mic(x)/ramdisk
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		When read, this sysfs entry provides the path name under
@@ -108,8 +118,8 @@ Description:
 		the ramdisk image location under /lib/firmware/.
 
 What:		/sys/class/mic/mic(x)/bootmode
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		When read, this sysfs entry provides the current bootmode for
@@ -119,8 +129,8 @@ Description:
 		b) elf - Boot an elf image for flash updates.
 
 What:		/sys/class/mic/mic(x)/log_buf_addr
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		An Intel MIC device runs a Linux OS during its operation. For
@@ -133,8 +143,8 @@ Description:
 		file of the card OS.
 
 What:		/sys/class/mic/mic(x)/log_buf_len
-Date:		August 2013
-KernelVersion:	3.11
+Date:		October 2013
+KernelVersion:	3.13
 Contact:	Sudeep Dutt <sudeep.dutt@intel.com>
 Description:
 		An Intel MIC device runs a Linux OS during its operation. For
diff --git a/Documentation/mic/mic_overview.txt b/Documentation/mic/mic_overview.txt
index c4424ed1b746..b41929224804 100644
--- a/Documentation/mic/mic_overview.txt
+++ b/Documentation/mic/mic_overview.txt
@@ -4,7 +4,9 @@ that runs a Linux OS. It is a PCIe endpoint in a platform and therefore
 implements the three required standard address spaces i.e. configuration,
 memory and I/O. The host OS loads a device driver as is typical for
 PCIe devices. The card itself runs a bootstrap after reset that
-transfers control to the card OS downloaded from the host driver.
+transfers control to the card OS downloaded from the host driver. The
+host driver supports OSPM suspend and resume operations. It shuts down
+the card during suspend and reboots the card OS during resume.
 The card OS as shipped by Intel is a Linux kernel with modifications
 for the X100 devices.
 
diff --git a/Documentation/mic/mpssd/mpssd.c b/Documentation/mic/mpssd/mpssd.c
index 82c6bc2e3cb6..0c980ad40b17 100644
--- a/Documentation/mic/mpssd/mpssd.c
+++ b/Documentation/mic/mpssd/mpssd.c
@@ -1295,7 +1295,13 @@ reset(struct mic_info *mic)
 			goto retry;
 		mpsslog("%s: %s %d state %s\n",
 			mic->name, __func__, __LINE__, state);
-		if ((!strcmp(state, "offline"))) {
+
+		/*
+		 * If the shutdown was initiated by OSPM, the state stays
+		 * in "suspended" which is also a valid condition for reset.
+		 */
+		if ((!strcmp(state, "offline")) ||
+		    (!strcmp(state, "suspended"))) {
 			free(state);
 			break;
 		}
@@ -1334,6 +1340,10 @@ static int get_mic_state(struct mic_info *mic, char *state)
 		return MIC_SHUTTING_DOWN;
 	if (!strcmp(state, "reset_failed"))
 		return MIC_RESET_FAILED;
+	if (!strcmp(state, "suspending"))
+		return MIC_SUSPENDING;
+	if (!strcmp(state, "suspended"))
+		return MIC_SUSPENDED;
 	mpsslog("%s: BUG invalid state %s\n", mic->name, state);
 	/* Invalid state */
 	assert(0);
@@ -1418,6 +1428,17 @@ retry:
 		case MIC_SHUTTING_DOWN:
 			mic_handle_shutdown(mic);
 			goto close_error;
+		case MIC_SUSPENDING:
+			mic->boot_on_resume = 1;
+			setsysfs(mic->name, "state", "suspend");
+			mic_handle_shutdown(mic);
+			goto close_error;
+		case MIC_OFFLINE:
+			if (mic->boot_on_resume) {
+				setsysfs(mic->name, "state", "boot");
+				mic->boot_on_resume = 0;
+			}
+			break;
 		default:
 			break;
 		}
@@ -1621,11 +1642,9 @@ init_mic_list(void)
 
 	while ((file = readdir(dp)) != NULL) {
 		if (!strncmp(file->d_name, "mic", 3)) {
-			mic->next = malloc(sizeof(struct mic_info));
+			mic->next = calloc(1, sizeof(struct mic_info));
 			if (mic->next) {
 				mic = mic->next;
-				mic->next = NULL;
-				memset(mic, 0, sizeof(struct mic_info));
 				mic->id = atoi(&file->d_name[3]);
 				mic->name = malloc(strlen(file->d_name) + 16);
 				if (mic->name)
diff --git a/Documentation/mic/mpssd/mpssd.h b/Documentation/mic/mpssd/mpssd.h
index ccd589ff9146..f5f18b15d9a0 100644
--- a/Documentation/mic/mpssd/mpssd.h
+++ b/Documentation/mic/mpssd/mpssd.h
@@ -91,6 +91,7 @@ struct mic_info {
 	struct mic_net_info	mic_net;
 	struct mic_virtblk_info	mic_virtblk;
 	int		restart;
+	int		boot_on_resume;
 	struct mic_info *next;
 };
 
diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c
index 60c54d5c43c2..d56b8bf4eaf6 100644
--- a/drivers/misc/mic/host/mic_boot.c
+++ b/drivers/misc/mic/host/mic_boot.c
@@ -37,12 +37,13 @@ static void mic_reset(struct mic_device *mdev)
 
 #define MIC_RESET_TO (45)
 
+	INIT_COMPLETION(mdev->reset_wait);
 	mdev->ops->reset_fw_ready(mdev);
 	mdev->ops->reset(mdev);
 
 	for (i = 0; i < MIC_RESET_TO; i++) {
 		if (mdev->ops->is_fw_ready(mdev))
-			return;
+			goto done;
 		/*
 		 * Resets typically take 10s of seconds to complete.
 		 * Since an MMIO read is required to check if the
@@ -51,6 +52,8 @@ static void mic_reset(struct mic_device *mdev)
 		msleep(1000);
 	}
 	mic_set_state(mdev, MIC_RESET_FAILED);
+done:
+	complete_all(&mdev->reset_wait);
 }
 
 /* Initialize the MIC bootparams */
@@ -123,7 +126,8 @@ void mic_stop(struct mic_device *mdev, bool force)
 		if (MIC_RESET_FAILED == mdev->state)
 			goto unlock;
 		mic_set_shutdown_status(mdev, MIC_NOP);
-		mic_set_state(mdev, MIC_OFFLINE);
+		if (MIC_SUSPENDED != mdev->state)
+			mic_set_state(mdev, MIC_OFFLINE);
 	}
 unlock:
 	mutex_unlock(&mdev->mic_mutex);
@@ -165,7 +169,14 @@ void mic_shutdown_work(struct work_struct *work)
 	mutex_lock(&mdev->mic_mutex);
 	mic_set_shutdown_status(mdev, bootparam->shutdown_status);
 	bootparam->shutdown_status = 0;
-	if (MIC_SHUTTING_DOWN != mdev->state)
+
+	/*
+	 * if state is MIC_SUSPENDED, OSPM suspend is in progress. We do not
+	 * change the state here so as to prevent users from booting the card
+	 * during and after the suspend operation.
+	 */
+	if (MIC_SHUTTING_DOWN != mdev->state &&
+	    MIC_SUSPENDED != mdev->state)
 		mic_set_state(mdev, MIC_SHUTTING_DOWN);
 	mutex_unlock(&mdev->mic_mutex);
 }
@@ -183,3 +194,106 @@ void mic_reset_trigger_work(struct work_struct *work)
 
 	mic_stop(mdev, false);
 }
+
+/**
+ * mic_complete_resume - Complete MIC Resume after an OSPM suspend/hibernate
+ * event.
+ * @mdev: pointer to mic_device instance
+ *
+ * RETURNS: None.
+ */
+void mic_complete_resume(struct mic_device *mdev)
+{
+	if (mdev->state != MIC_SUSPENDED) {
+		dev_warn(mdev->sdev->parent, "state %d should be %d\n",
+			 mdev->state, MIC_SUSPENDED);
+		return;
+	}
+
+	/* Make sure firmware is ready */
+	if (!mdev->ops->is_fw_ready(mdev))
+		mic_stop(mdev, true);
+
+	mutex_lock(&mdev->mic_mutex);
+	mic_set_state(mdev, MIC_OFFLINE);
+	mutex_unlock(&mdev->mic_mutex);
+}
+
+/**
+ * mic_prepare_suspend - Handle suspend notification for the MIC device.
+ * @mdev: pointer to mic_device instance
+ *
+ * RETURNS: None.
+ */
+void mic_prepare_suspend(struct mic_device *mdev)
+{
+	int rc;
+
+#define MIC_SUSPEND_TIMEOUT (60 * HZ)
+
+	mutex_lock(&mdev->mic_mutex);
+	switch (mdev->state) {
+	case MIC_OFFLINE:
+		/*
+		 * Card is already offline. Set state to MIC_SUSPENDED
+		 * to prevent users from booting the card.
+		 */
+		mic_set_state(mdev, MIC_SUSPENDED);
+		mutex_unlock(&mdev->mic_mutex);
+		break;
+	case MIC_ONLINE:
+		/*
+		 * Card is online. Set state to MIC_SUSPENDING and notify
+		 * MIC user space daemon which will issue card
+		 * shutdown and reset.
+		 */
+		mic_set_state(mdev, MIC_SUSPENDING);
+		mutex_unlock(&mdev->mic_mutex);
+		rc = wait_for_completion_timeout(&mdev->reset_wait,
+						MIC_SUSPEND_TIMEOUT);
+		/* Force reset the card if the shutdown completion timed out */
+		if (!rc) {
+			mutex_lock(&mdev->mic_mutex);
+			mic_set_state(mdev, MIC_SUSPENDED);
+			mutex_unlock(&mdev->mic_mutex);
+			mic_stop(mdev, true);
+		}
+		break;
+	case MIC_SHUTTING_DOWN:
+		/*
+		 * Card is shutting down. Set state to MIC_SUSPENDED
+		 * to prevent further boot of the card.
+		 */
+		mic_set_state(mdev, MIC_SUSPENDED);
+		mutex_unlock(&mdev->mic_mutex);
+		rc = wait_for_completion_timeout(&mdev->reset_wait,
+						MIC_SUSPEND_TIMEOUT);
+		/* Force reset the card if the shutdown completion timed out */
+		if (!rc)
+			mic_stop(mdev, true);
+		break;
+	default:
+		mutex_unlock(&mdev->mic_mutex);
+		break;
+	}
+}
+
+/**
+ * mic_suspend - Initiate MIC suspend. Suspend merely issues card shutdown.
+ * @mdev: pointer to mic_device instance
+ *
+ * RETURNS: None.
+ */
+void mic_suspend(struct mic_device *mdev)
+{
+	struct mic_bootparam *bootparam = mdev->dp;
+	s8 db = bootparam->h2c_shutdown_db;
+
+	mutex_lock(&mdev->mic_mutex);
+	if (MIC_SUSPENDING == mdev->state && db != -1) {
+		bootparam->shutdown_card = 1;
+		mdev->ops->send_intr(mdev, db);
+		mic_set_state(mdev, MIC_SUSPENDED);
+	}
+	mutex_unlock(&mdev->mic_mutex);
+}
diff --git a/drivers/misc/mic/host/mic_device.h b/drivers/misc/mic/host/mic_device.h
index dcba2a59e77f..3574cc375bb9 100644
--- a/drivers/misc/mic/host/mic_device.h
+++ b/drivers/misc/mic/host/mic_device.h
@@ -23,6 +23,7 @@
 
 #include <linux/cdev.h>
 #include <linux/idr.h>
+#include <linux/notifier.h>
 
 #include "mic_intr.h"
 
@@ -75,6 +76,7 @@ enum mic_stepping {
  * @state: MIC state.
  * @shutdown_status: MIC status reported by card for shutdown/crashes.
  * @state_sysfs: Sysfs dirent for notifying ring 3 about MIC state changes.
+ * @reset_wait: Waitqueue for sleeping while reset completes.
  * @log_buf_addr: Log buffer address for MIC.
  * @log_buf_len: Log buffer length address for MIC.
  * @dp: virtio device page
@@ -83,6 +85,7 @@ enum mic_stepping {
  * @shutdown_cookie: shutdown cookie.
  * @cdev: Character device for MIC.
  * @vdev_list: list of virtio devices.
+ * @pm_notifier: Handles PM notifications from the OS.
  */
 struct mic_device {
 	struct mic_mw mmio;
@@ -110,6 +113,7 @@ struct mic_device {
 	u8 state;
 	u8 shutdown_status;
 	struct sysfs_dirent *state_sysfs;
+	struct completion reset_wait;
 	void *log_buf_addr;
 	int *log_buf_len;
 	void *dp;
@@ -118,6 +122,7 @@ struct mic_device {
 	struct mic_irq *shutdown_cookie;
 	struct cdev cdev;
 	struct list_head vdev_list;
+	struct notifier_block pm_notifier;
 };
 
 /**
@@ -192,4 +197,7 @@ void mic_create_debug_dir(struct mic_device *dev);
 void mic_delete_debug_dir(struct mic_device *dev);
 void __init mic_init_debugfs(void);
 void mic_exit_debugfs(void);
+void mic_prepare_suspend(struct mic_device *mdev);
+void mic_complete_resume(struct mic_device *mdev);
+void mic_suspend(struct mic_device *mdev);
 #endif
diff --git a/drivers/misc/mic/host/mic_main.c b/drivers/misc/mic/host/mic_main.c
index ca06aa9b7114..b3520859abd3 100644
--- a/drivers/misc/mic/host/mic_main.c
+++ b/drivers/misc/mic/host/mic_main.c
@@ -26,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/poll.h>
+#include <linux/suspend.h>
 
 #include <linux/mic_common.h>
 #include "../common/mic_dev.h"
@@ -186,6 +187,43 @@ static enum mic_hw_family mic_get_family(struct pci_dev *pdev)
 	return family;
 }
 
+/**
+* mic_pm_notifier: Notifier callback function that handles
+* PM notifications.
+*
+* @notifier_block: The notifier structure.
+* @pm_event: The event for which the driver was notified.
+* @unused: Meaningless. Always NULL.
+*
+* returns NOTIFY_DONE
+*/
+static int mic_pm_notifier(struct notifier_block *notifier,
+		unsigned long pm_event, void *unused)
+{
+	struct mic_device *mdev = container_of(notifier,
+		struct mic_device, pm_notifier);
+
+	switch (pm_event) {
+	case PM_HIBERNATION_PREPARE:
+		/* Fall through */
+	case PM_SUSPEND_PREPARE:
+		mic_prepare_suspend(mdev);
+		break;
+	case PM_POST_HIBERNATION:
+		/* Fall through */
+	case PM_POST_SUSPEND:
+		/* Fall through */
+	case PM_POST_RESTORE:
+		mic_complete_resume(mdev);
+		break;
+	case PM_RESTORE_PREPARE:
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
 /**
  * mic_device_init - Allocates and initializes the MIC device structure
  *
@@ -194,9 +232,11 @@ static enum mic_hw_family mic_get_family(struct pci_dev *pdev)
  *
  * returns none.
  */
-static void
+static int
 mic_device_init(struct mic_device *mdev, struct pci_dev *pdev)
 {
+	int rc;
+
 	mdev->family = mic_get_family(pdev);
 	mdev->stepping = pdev->revision;
 	mic_ops_init(mdev);
@@ -205,7 +245,20 @@ mic_device_init(struct mic_device *mdev, struct pci_dev *pdev)
 	mdev->irq_info.next_avail_src = 0;
 	INIT_WORK(&mdev->reset_trigger_work, mic_reset_trigger_work);
 	INIT_WORK(&mdev->shutdown_work, mic_shutdown_work);
+	init_completion(&mdev->reset_wait);
 	INIT_LIST_HEAD(&mdev->vdev_list);
+	mdev->pm_notifier.notifier_call = mic_pm_notifier;
+	rc = register_pm_notifier(&mdev->pm_notifier);
+	if (rc) {
+		dev_err(&pdev->dev, "register_pm_notifier failed rc %d\n",
+			rc);
+		goto register_pm_notifier_fail;
+	}
+	return 0;
+register_pm_notifier_fail:
+	flush_work(&mdev->shutdown_work);
+	flush_work(&mdev->reset_trigger_work);
+	return rc;
 }
 
 /**
@@ -224,6 +277,7 @@ static void mic_device_uninit(struct mic_device *mdev)
 	kfree(mdev->bootmode);
 	flush_work(&mdev->reset_trigger_work);
 	flush_work(&mdev->shutdown_work);
+	unregister_pm_notifier(&mdev->pm_notifier);
 }
 
 /**
@@ -253,7 +307,11 @@ static int mic_probe(struct pci_dev *pdev,
 		goto ida_fail;
 	}
 
-	mic_device_init(mdev, pdev);
+	rc = mic_device_init(mdev, pdev);
+	if (rc) {
+		dev_err(&pdev->dev, "mic_device_init failed rc %d\n", rc);
+		goto device_init_fail;
+	}
 
 	rc = pci_enable_device(pdev);
 	if (rc) {
@@ -376,6 +434,7 @@ disable_device:
 	pci_disable_device(pdev);
 uninit_device:
 	mic_device_uninit(mdev);
+device_init_fail:
 	ida_simple_remove(&g_mic_ida, mdev->id);
 ida_fail:
 	kfree(mdev);
diff --git a/drivers/misc/mic/host/mic_sysfs.c b/drivers/misc/mic/host/mic_sysfs.c
index 75746adfb155..6dd864e4a617 100644
--- a/drivers/misc/mic/host/mic_sysfs.c
+++ b/drivers/misc/mic/host/mic_sysfs.c
@@ -33,6 +33,8 @@ static const char * const mic_state_string[] = {
 	[MIC_ONLINE] = "online",
 	[MIC_SHUTTING_DOWN] = "shutting_down",
 	[MIC_RESET_FAILED] = "reset_failed",
+	[MIC_SUSPENDING] = "suspending",
+	[MIC_SUSPENDED] = "suspended",
 };
 
 /*
@@ -156,6 +158,11 @@ state_store(struct device *dev, struct device_attribute *attr,
 		goto done;
 	}
 
+	if (sysfs_streq(buf, "suspend")) {
+		mic_suspend(mdev);
+		goto done;
+	}
+
 	count = -EINVAL;
 done:
 	return count;
diff --git a/include/uapi/linux/mic_common.h b/include/uapi/linux/mic_common.h
index 364ac751bd04..17e7d95e4f53 100644
--- a/include/uapi/linux/mic_common.h
+++ b/include/uapi/linux/mic_common.h
@@ -219,6 +219,8 @@ enum mic_states {
 	MIC_ONLINE,
 	MIC_SHUTTING_DOWN,
 	MIC_RESET_FAILED,
+	MIC_SUSPENDING,
+	MIC_SUSPENDED,
 	MIC_LAST
 };
 
-- 
cgit v1.2.3


From fe1811438a2a229e93beaefa69481d72652795e5 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@do-not-panic.com>
Date: Mon, 7 Oct 2013 16:27:55 -0700
Subject: cfg80211: fix nl80211.h documentation for DFS enum states

The names are prefixed incorrectly on the documentation.

Signed-off-by: Luis R. Rodriguez <mcgrof@do-not-panic.com>
[also remove spurious blank line]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index fde2c021b26d..a58ea652cc24 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3860,13 +3860,12 @@ enum nl80211_radar_event {
  *
  * Channel states used by the DFS code.
  *
- * @IEEE80211_DFS_USABLE: The channel can be used, but channel availability
+ * @NL80211_DFS_USABLE: The channel can be used, but channel availability
  *	check (CAC) must be performed before using it for AP or IBSS.
- * @IEEE80211_DFS_UNAVAILABLE: A radar has been detected on this channel, it
+ * @NL80211_DFS_UNAVAILABLE: A radar has been detected on this channel, it
  *	is therefore marked as not available.
- * @IEEE80211_DFS_AVAILABLE: The channel has been CAC checked and is available.
+ * @NL80211_DFS_AVAILABLE: The channel has been CAC checked and is available.
  */
-
 enum nl80211_dfs_state {
 	NL80211_DFS_USABLE,
 	NL80211_DFS_UNAVAILABLE,
-- 
cgit v1.2.3


From 789fd03331aa1ec45cb58168e2d82525c97c7351 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@do-not-panic.com>
Date: Fri, 4 Oct 2013 18:07:24 -0700
Subject: cfg80211: rename regulatory_hint_11d() to
 regulatory_hint_country_ie()

It is incorrect to refer to this as 11d as 802.11d was just a
proposed amendment, 802.11d was merged to the standard so
use proper terminology.

Signed-off-by: Luis R. Rodriguez <mcgrof@do-not-panic.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 2 +-
 net/wireless/reg.c           | 4 ++--
 net/wireless/reg.h           | 4 ++--
 net/wireless/sme.c           | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index a58ea652cc24..8c0417c222c6 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -988,7 +988,7 @@ enum nl80211_commands {
  * 	to query the CRDA to retrieve one regulatory domain. This attribute can
  * 	also be used by userspace to query the kernel for the currently set
  * 	regulatory domain. We chose an alpha2 as that is also used by the
- * 	IEEE-802.11d country information element to identify a country.
+ * 	IEEE-802.11 country information element to identify a country.
  * 	Users can also simply ask the wireless core to set regulatory domain
  * 	to a specific alpha2.
  * @NL80211_ATTR_REG_RULES: a nested array of regulatory domain regulatory
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index d62cb1e91475..8fbe664fdcf8 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1699,8 +1699,8 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
 }
 EXPORT_SYMBOL(regulatory_hint);
 
-void regulatory_hint_11d(struct wiphy *wiphy, enum ieee80211_band band,
-			 const u8 *country_ie, u8 country_ie_len)
+void regulatory_hint_country_ie(struct wiphy *wiphy, enum ieee80211_band band,
+				const u8 *country_ie, u8 country_ie_len)
 {
 	char alpha2[2];
 	enum environment_cap env = ENVIRON_ANY;
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index af2d5f8a5d82..9677e3c13da9 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -58,7 +58,7 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy,
 				 gfp_t gfp);
 
 /**
- * regulatory_hint_11d - hints a country IE as a regulatory domain
+ * regulatory_hint_country_ie - hints a country IE as a regulatory domain
  * @wiphy: the wireless device giving the hint (used only for reporting
  *	conflicts)
  * @band: the band on which the country IE was received on. This determines
@@ -78,7 +78,7 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy,
  * not observed. For this reason if a triplet is seen with channel
  * information for a band the BSS is not present in it will be ignored.
  */
-void regulatory_hint_11d(struct wiphy *wiphy,
+void regulatory_hint_country_ie(struct wiphy *wiphy,
 			 enum ieee80211_band band,
 			 const u8 *country_ie,
 			 u8 country_ie_len);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 20e86a95dc4e..65f800890d70 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -682,8 +682,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 	 * - country_ie + 2, the start of the country ie data, and
 	 * - and country_ie[1] which is the IE length
 	 */
-	regulatory_hint_11d(wdev->wiphy, bss->channel->band,
-			    country_ie + 2, country_ie[1]);
+	regulatory_hint_country_ie(wdev->wiphy, bss->channel->band,
+				   country_ie + 2, country_ie[1]);
 	kfree(country_ie);
 }
 
-- 
cgit v1.2.3


From c01fc9ada926aaad907989ca2eba40c2a2a73afe Mon Sep 17 00:00:00 2001
From: Sunil Dutt <c_duttus@qti.qualcomm.com>
Date: Wed, 9 Oct 2013 20:45:21 +0530
Subject: cfg80211: pass station supported channel and oper class info

The information of the peer's supported channels and supported operating
classes are required for the driver to perform TDLS off channel
operations. This commit enhances the function nl80211_(new)set_station
to pass this information of the peer to the driver.

Signed-off-by: Sunil Dutt <c_duttus@qti.qualcomm.com>
[return errors for malformed tuples]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  8 ++++++++
 include/uapi/linux/nl80211.h |  9 +++++++++
 net/wireless/nl80211.c       | 46 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 45f6bf591104..5db5fe24eff6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -744,6 +744,10 @@ enum station_parameters_apply_mask {
  * @capability: station capability
  * @ext_capab: extended capabilities of the station
  * @ext_capab_len: number of extended capabilities
+ * @supported_channels: supported channels in IEEE 802.11 format
+ * @supported_channels_len: number of supported channels
+ * @supported_oper_classes: supported oper classes in IEEE 802.11 format
+ * @supported_oper_classes_len: number of supported operating classes
  */
 struct station_parameters {
 	const u8 *supported_rates;
@@ -763,6 +767,10 @@ struct station_parameters {
 	u16 capability;
 	const u8 *ext_capab;
 	u8 ext_capab_len;
+	const u8 *supported_channels;
+	u8 supported_channels_len;
+	const u8 *supported_oper_classes;
+	u8 supported_oper_classes_len;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 8c0417c222c6..f2aef2a7a570 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1496,6 +1496,11 @@ enum nl80211_commands {
  * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32.
  *	As specified in the &enum nl80211_rxmgmt_flags.
  *
+ * @NL80211_ATTR_STA_SUPPORTED_CHANNELS: array of supported channels.
+ *
+ * @NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES: array of supported
+ *      supported operating classes.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1806,6 +1811,10 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_RXMGMT_FLAGS,
 
+	NL80211_ATTR_STA_SUPPORTED_CHANNELS,
+
+	NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2838206ddad3..460638ac2d73 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -354,6 +354,8 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED },
 	[NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_U16 },
 	[NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_U16 },
+	[NL80211_ATTR_STA_SUPPORTED_CHANNELS] = { .type = NLA_BINARY },
+	[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] = { .type = NLA_BINARY },
 };
 
 /* policy for the key attributes */
@@ -3896,9 +3898,45 @@ static int nl80211_parse_sta_wme(struct genl_info *info,
 	return 0;
 }
 
+static int nl80211_parse_sta_channel_info(struct genl_info *info,
+				      struct station_parameters *params)
+{
+	if (info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS]) {
+		params->supported_channels =
+		     nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS]);
+		params->supported_channels_len =
+		     nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS]);
+		/*
+		 * Need to include at least one (first channel, number of
+		 * channels) tuple for each subband, and must have proper
+		 * tuples for the rest of the data as well.
+		 */
+		if (params->supported_channels_len < 2)
+			return -EINVAL;
+		if (params->supported_channels_len % 2)
+			return -EINVAL;
+	}
+
+	if (info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]) {
+		params->supported_oper_classes =
+		 nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]);
+		params->supported_oper_classes_len =
+		  nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]);
+		/*
+		 * The value of the Length field of the Supported Operating
+		 * Classes element is between 2 and 253.
+		 */
+		if (params->supported_oper_classes_len < 2 ||
+		    params->supported_oper_classes_len > 253)
+			return -EINVAL;
+	}
+	return 0;
+}
+
 static int nl80211_set_station_tdls(struct genl_info *info,
 				    struct station_parameters *params)
 {
+	int err;
 	/* Dummy STA entry gets updated once the peer capabilities are known */
 	if (info->attrs[NL80211_ATTR_PEER_AID])
 		params->aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
@@ -3909,6 +3947,10 @@ static int nl80211_set_station_tdls(struct genl_info *info,
 		params->vht_capa =
 			nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
 
+	err = nl80211_parse_sta_channel_info(info, params);
+	if (err)
+		return err;
+
 	return nl80211_parse_sta_wme(info, params);
 }
 
@@ -4089,6 +4131,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 			return -EINVAL;
 	}
 
+	err = nl80211_parse_sta_channel_info(info, &params);
+	if (err)
+		return err;
+
 	err = nl80211_parse_sta_wme(info, &params);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From 96518518cc417bb0a8c80b9fb736202e28acdf96 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 14 Oct 2013 11:00:02 +0200
Subject: netfilter: add nftables

This patch adds nftables which is the intended successor of iptables.
This packet filtering framework reuses the existing netfilter hooks,
the connection tracking system, the NAT subsystem, the transparent
proxying engine, the logging infrastructure and the userspace packet
queueing facilities.

In a nutshell, nftables provides a pseudo-state machine with 4 general
purpose registers of 128 bits and 1 specific purpose register to store
verdicts. This pseudo-machine comes with an extensible instruction set,
a.k.a. "expressions" in the nftables jargon. The expressions included
in this patch provide the basic functionality, they are:

* bitwise: to perform bitwise operations.
* byteorder: to change from host/network endianess.
* cmp: to compare data with the content of the registers.
* counter: to enable counters on rules.
* ct: to store conntrack keys into register.
* exthdr: to match IPv6 extension headers.
* immediate: to load data into registers.
* limit: to limit matching based on packet rate.
* log: to log packets.
* meta: to match metainformation that usually comes with the skbuff.
* nat: to perform Network Address Translation.
* payload: to fetch data from the packet payload and store it into
  registers.
* reject (IPv4 only): to explicitly close connection, eg. TCP RST.

Using this instruction-set, the userspace utility 'nft' can transform
the rules expressed in human-readable text representation (using a
new syntax, inspired by tcpdump) to nftables bytecode.

nftables also inherits the table, chain and rule objects from
iptables, but in a more configurable way, and it also includes the
original datatype-agnostic set infrastructure with mapping support.
This set infrastructure is enhanced in the follow up patch (netfilter:
nf_tables: add netlink set API).

This patch includes the following components:

* the netlink API: net/netfilter/nf_tables_api.c and
  include/uapi/netfilter/nf_tables.h
* the packet filter core: net/netfilter/nf_tables_core.c
* the expressions (described above): net/netfilter/nft_*.c
* the filter tables: arp, IPv4, IPv6 and bridge:
  net/ipv4/netfilter/nf_tables_ipv4.c
  net/ipv6/netfilter/nf_tables_ipv6.c
  net/ipv4/netfilter/nf_tables_arp.c
  net/bridge/netfilter/nf_tables_bridge.c
* the NAT table (IPv4 only):
  net/ipv4/netfilter/nf_table_nat_ipv4.c
* the route table (similar to mangle):
  net/ipv4/netfilter/nf_table_route_ipv4.c
  net/ipv6/netfilter/nf_table_route_ipv6.c
* internal definitions under:
  include/net/netfilter/nf_tables.h
  include/net/netfilter/nf_tables_core.h
* It also includes an skeleton expression:
  net/netfilter/nft_expr_template.c
  and the preliminary implementation of the meta target
  net/netfilter/nft_meta_target.c

It also includes a change in struct nf_hook_ops to add a new
pointer to store private data to the hook, that is used to store
the rule list per chain.

This patch is based on the patch from Patrick McHardy, plus merged
accumulated cleanups, fixes and small enhancements to the nftables
code that has been done since 2009, which are:

From Patrick McHardy:
* nf_tables: adjust netlink handler function signatures
* nf_tables: only retry table lookup after successful table module load
* nf_tables: fix event notification echo and avoid unnecessary messages
* nft_ct: add l3proto support
* nf_tables: pass expression context to nft_validate_data_load()
* nf_tables: remove redundant definition
* nft_ct: fix maxattr initialization
* nf_tables: fix invalid event type in nf_tables_getrule()
* nf_tables: simplify nft_data_init() usage
* nf_tables: build in more core modules
* nf_tables: fix double lookup expression unregistation
* nf_tables: move expression initialization to nf_tables_core.c
* nf_tables: build in payload module
* nf_tables: use NFPROTO constants
* nf_tables: rename pid variables to portid
* nf_tables: save 48 bits per rule
* nf_tables: introduce chain rename
* nf_tables: check for duplicate names on chain rename
* nf_tables: remove ability to specify handles for new rules
* nf_tables: return error for rule change request
* nf_tables: return error for NLM_F_REPLACE without rule handle
* nf_tables: include NLM_F_APPEND/NLM_F_REPLACE flags in rule notification
* nf_tables: fix NLM_F_MULTI usage in netlink notifications
* nf_tables: include NLM_F_APPEND in rule dumps

From Pablo Neira Ayuso:
* nf_tables: fix stack overflow in nf_tables_newrule
* nf_tables: nft_ct: fix compilation warning
* nf_tables: nft_ct: fix crash with invalid packets
* nft_log: group and qthreshold are 2^16
* nf_tables: nft_meta: fix socket uid,gid handling
* nft_counter: allow to restore counters
* nf_tables: fix module autoload
* nf_tables: allow to remove all rules placed in one chain
* nf_tables: use 64-bits rule handle instead of 16-bits
* nf_tables: fix chain after rule deletion
* nf_tables: improve deletion performance
* nf_tables: add missing code in route chain type
* nf_tables: rise maximum number of expressions from 12 to 128
* nf_tables: don't delete table if in use
* nf_tables: fix basechain release

From Tomasz Bursztyka:
* nf_tables: Add support for changing users chain's name
* nf_tables: Change chain's name to be fixed sized
* nf_tables: Add support for replacing a rule by another one
* nf_tables: Update uapi nftables netlink header documentation

From Florian Westphal:
* nft_log: group is u16, snaplen u32

From Phil Oester:
* nf_tables: operational limit match

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                          |   11 +-
 include/net/netfilter/nf_tables.h                  |  301 ++++
 include/net/netfilter/nf_tables_core.h             |   25 +
 include/uapi/linux/netfilter/Kbuild                |    1 +
 include/uapi/linux/netfilter/nf_conntrack_common.h |    4 +
 include/uapi/linux/netfilter/nf_tables.h           |  582 +++++++
 include/uapi/linux/netfilter/nfnetlink.h           |    5 +-
 net/bridge/netfilter/Kconfig                       |    3 +
 net/bridge/netfilter/Makefile                      |    2 +
 net/bridge/netfilter/nf_tables_bridge.c            |   37 +
 net/ipv4/netfilter/Kconfig                         |   16 +
 net/ipv4/netfilter/Makefile                        |    5 +
 net/ipv4/netfilter/nf_table_nat_ipv4.c             |  409 +++++
 net/ipv4/netfilter/nf_table_route_ipv4.c           |   97 ++
 net/ipv4/netfilter/nf_tables_ipv4.c                |   59 +
 net/ipv4/netfilter/nft_reject_ipv4.c               |  117 ++
 net/ipv6/netfilter/Kconfig                         |    8 +
 net/ipv6/netfilter/Makefile                        |    4 +
 net/ipv6/netfilter/nf_table_route_ipv6.c           |   93 ++
 net/ipv6/netfilter/nf_tables_ipv6.c                |   57 +
 net/netfilter/Kconfig                              |   37 +
 net/netfilter/Makefile                             |   16 +
 net/netfilter/nf_tables_api.c                      | 1760 ++++++++++++++++++++
 net/netfilter/nf_tables_core.c                     |  152 ++
 net/netfilter/nft_bitwise.c                        |  140 ++
 net/netfilter/nft_byteorder.c                      |  167 ++
 net/netfilter/nft_cmp.c                            |  146 ++
 net/netfilter/nft_counter.c                        |  107 ++
 net/netfilter/nft_ct.c                             |  252 +++
 net/netfilter/nft_expr_template.c                  |   88 +
 net/netfilter/nft_exthdr.c                         |  127 ++
 net/netfilter/nft_hash.c                           |  348 ++++
 net/netfilter/nft_immediate.c                      |  113 ++
 net/netfilter/nft_limit.c                          |  113 ++
 net/netfilter/nft_log.c                            |  140 ++
 net/netfilter/nft_meta.c                           |  222 +++
 net/netfilter/nft_meta_target.c                    |  117 ++
 net/netfilter/nft_payload.c                        |  137 ++
 net/netfilter/nft_set.c                            |  381 +++++
 39 files changed, 6393 insertions(+), 6 deletions(-)
 create mode 100644 include/net/netfilter/nf_tables.h
 create mode 100644 include/net/netfilter/nf_tables_core.h
 create mode 100644 include/uapi/linux/netfilter/nf_tables.h
 create mode 100644 net/bridge/netfilter/nf_tables_bridge.c
 create mode 100644 net/ipv4/netfilter/nf_table_nat_ipv4.c
 create mode 100644 net/ipv4/netfilter/nf_table_route_ipv4.c
 create mode 100644 net/ipv4/netfilter/nf_tables_ipv4.c
 create mode 100644 net/ipv4/netfilter/nft_reject_ipv4.c
 create mode 100644 net/ipv6/netfilter/nf_table_route_ipv6.c
 create mode 100644 net/ipv6/netfilter/nf_tables_ipv6.c
 create mode 100644 net/netfilter/nf_tables_api.c
 create mode 100644 net/netfilter/nf_tables_core.c
 create mode 100644 net/netfilter/nft_bitwise.c
 create mode 100644 net/netfilter/nft_byteorder.c
 create mode 100644 net/netfilter/nft_cmp.c
 create mode 100644 net/netfilter/nft_counter.c
 create mode 100644 net/netfilter/nft_ct.c
 create mode 100644 net/netfilter/nft_expr_template.c
 create mode 100644 net/netfilter/nft_exthdr.c
 create mode 100644 net/netfilter/nft_hash.c
 create mode 100644 net/netfilter/nft_immediate.c
 create mode 100644 net/netfilter/nft_limit.c
 create mode 100644 net/netfilter/nft_log.c
 create mode 100644 net/netfilter/nft_meta.c
 create mode 100644 net/netfilter/nft_meta_target.c
 create mode 100644 net/netfilter/nft_payload.c
 create mode 100644 net/netfilter/nft_set.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index fef7e67f7101..2077489f9887 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -53,12 +53,13 @@ struct nf_hook_ops {
 	struct list_head list;
 
 	/* User fills in from here down. */
-	nf_hookfn *hook;
-	struct module *owner;
-	u_int8_t pf;
-	unsigned int hooknum;
+	nf_hookfn	*hook;
+	struct module	*owner;
+	void		*priv;
+	u_int8_t	pf;
+	unsigned int	hooknum;
 	/* Hooks are ordered in ascending priority. */
-	int priority;
+	int		priority;
 };
 
 struct nf_sockopt_ops {
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
new file mode 100644
index 000000000000..d26dfa345f49
--- /dev/null
+++ b/include/net/netfilter/nf_tables.h
@@ -0,0 +1,301 @@
+#ifndef _NET_NF_TABLES_H
+#define _NET_NF_TABLES_H
+
+#include <linux/list.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netlink.h>
+
+struct nft_pktinfo {
+	struct sk_buff			*skb;
+	const struct net_device		*in;
+	const struct net_device		*out;
+	u8				hooknum;
+	u8				nhoff;
+	u8				thoff;
+};
+
+struct nft_data {
+	union {
+		u32				data[4];
+		struct {
+			u32			verdict;
+			struct nft_chain	*chain;
+		};
+	};
+} __attribute__((aligned(__alignof__(u64))));
+
+static inline int nft_data_cmp(const struct nft_data *d1,
+			       const struct nft_data *d2,
+			       unsigned int len)
+{
+	return memcmp(d1->data, d2->data, len);
+}
+
+static inline void nft_data_copy(struct nft_data *dst,
+				 const struct nft_data *src)
+{
+	BUILD_BUG_ON(__alignof__(*dst) != __alignof__(u64));
+	*(u64 *)&dst->data[0] = *(u64 *)&src->data[0];
+	*(u64 *)&dst->data[2] = *(u64 *)&src->data[2];
+}
+
+static inline void nft_data_debug(const struct nft_data *data)
+{
+	pr_debug("data[0]=%x data[1]=%x data[2]=%x data[3]=%x\n",
+		 data->data[0], data->data[1],
+		 data->data[2], data->data[3]);
+}
+
+/**
+ *	struct nft_ctx - nf_tables rule context
+ *
+ * 	@afi: address family info
+ * 	@table: the table the chain is contained in
+ * 	@chain: the chain the rule is contained in
+ */
+struct nft_ctx {
+	const struct nft_af_info	*afi;
+	const struct nft_table		*table;
+	const struct nft_chain		*chain;
+};
+
+enum nft_data_types {
+	NFT_DATA_VALUE,
+	NFT_DATA_VERDICT,
+};
+
+struct nft_data_desc {
+	enum nft_data_types		type;
+	unsigned int			len;
+};
+
+extern int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
+			 struct nft_data_desc *desc, const struct nlattr *nla);
+extern void nft_data_uninit(const struct nft_data *data,
+			    enum nft_data_types type);
+extern int nft_data_dump(struct sk_buff *skb, int attr,
+			 const struct nft_data *data,
+			 enum nft_data_types type, unsigned int len);
+
+static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
+{
+	return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE;
+}
+
+extern int nft_validate_input_register(enum nft_registers reg);
+extern int nft_validate_output_register(enum nft_registers reg);
+extern int nft_validate_data_load(const struct nft_ctx *ctx,
+				  enum nft_registers reg,
+				  const struct nft_data *data,
+				  enum nft_data_types type);
+
+/**
+ *	struct nft_expr_ops - nf_tables expression operations
+ *
+ *	@eval: Expression evaluation function
+ *	@init: initialization function
+ *	@destroy: destruction function
+ *	@dump: function to dump parameters
+ *	@list: used internally
+ *	@name: Identifier
+ *	@owner: module reference
+ *	@policy: netlink attribute policy
+ *	@maxattr: highest netlink attribute number
+ *	@size: full expression size, including private data size
+ */
+struct nft_expr;
+struct nft_expr_ops {
+	void				(*eval)(const struct nft_expr *expr,
+						struct nft_data data[NFT_REG_MAX + 1],
+						const struct nft_pktinfo *pkt);
+	int				(*init)(const struct nft_ctx *ctx,
+						const struct nft_expr *expr,
+						const struct nlattr * const tb[]);
+	void				(*destroy)(const struct nft_expr *expr);
+	int				(*dump)(struct sk_buff *skb,
+						const struct nft_expr *expr);
+
+	struct list_head		list;
+	const char			*name;
+	struct module			*owner;
+	const struct nla_policy		*policy;
+	unsigned int			maxattr;
+	unsigned int			size;
+};
+
+#define NFT_EXPR_SIZE(size)		(sizeof(struct nft_expr) + \
+					 ALIGN(size, __alignof__(struct nft_expr)))
+
+/**
+ *	struct nft_expr - nf_tables expression
+ *
+ *	@ops: expression ops
+ *	@data: expression private data
+ */
+struct nft_expr {
+	const struct nft_expr_ops	*ops;
+	unsigned char			data[];
+};
+
+static inline void *nft_expr_priv(const struct nft_expr *expr)
+{
+	return (void *)expr->data;
+}
+
+/**
+ *	struct nft_rule - nf_tables rule
+ *
+ *	@list: used internally
+ *	@rcu_head: used internally for rcu
+ *	@handle: rule handle
+ *	@dlen: length of expression data
+ *	@data: expression data
+ */
+struct nft_rule {
+	struct list_head		list;
+	struct rcu_head			rcu_head;
+	u64				handle:48,
+					dlen:16;
+	unsigned char			data[]
+		__attribute__((aligned(__alignof__(struct nft_expr))));
+};
+
+static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule)
+{
+	return (struct nft_expr *)&rule->data[0];
+}
+
+static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr)
+{
+	return ((void *)expr) + expr->ops->size;
+}
+
+static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
+{
+	return (struct nft_expr *)&rule->data[rule->dlen];
+}
+
+/*
+ * The last pointer isn't really necessary, but the compiler isn't able to
+ * determine that the result of nft_expr_last() is always the same since it
+ * can't assume that the dlen value wasn't changed within calls in the loop.
+ */
+#define nft_rule_for_each_expr(expr, last, rule) \
+	for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \
+	     (expr) != (last); \
+	     (expr) = nft_expr_next(expr))
+
+enum nft_chain_flags {
+	NFT_BASE_CHAIN			= 0x1,
+	NFT_CHAIN_BUILTIN		= 0x2,
+};
+
+/**
+ *	struct nft_chain - nf_tables chain
+ *
+ *	@rules: list of rules in the chain
+ *	@list: used internally
+ *	@rcu_head: used internally
+ *	@handle: chain handle
+ *	@flags: bitmask of enum nft_chain_flags
+ *	@use: number of jump references to this chain
+ *	@level: length of longest path to this chain
+ *	@name: name of the chain
+ */
+struct nft_chain {
+	struct list_head		rules;
+	struct list_head		list;
+	struct rcu_head			rcu_head;
+	u64				handle;
+	u8				flags;
+	u16				use;
+	u16				level;
+	char				name[NFT_CHAIN_MAXNAMELEN];
+};
+
+/**
+ *	struct nft_base_chain - nf_tables base chain
+ *
+ *	@ops: netfilter hook ops
+ *	@chain: the chain
+ */
+struct nft_base_chain {
+	struct nf_hook_ops		ops;
+	struct nft_chain		chain;
+};
+
+static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain)
+{
+	return container_of(chain, struct nft_base_chain, chain);
+}
+
+extern unsigned int nft_do_chain(const struct nf_hook_ops *ops,
+				 struct sk_buff *skb,
+				 const struct net_device *in,
+				 const struct net_device *out,
+				 int (*okfn)(struct sk_buff *));
+
+enum nft_table_flags {
+	NFT_TABLE_BUILTIN		= 0x1,
+};
+
+/**
+ *	struct nft_table - nf_tables table
+ *
+ *	@list: used internally
+ *	@chains: chains in the table
+ *	@sets: sets in the table
+ *	@hgenerator: handle generator state
+ *	@use: number of chain references to this table
+ *	@flags: table flag (see enum nft_table_flags)
+ *	@name: name of the table
+ */
+struct nft_table {
+	struct list_head		list;
+	struct list_head		chains;
+	struct list_head		sets;
+	u64				hgenerator;
+	u32				use;
+	u16				flags;
+	char				name[];
+};
+
+/**
+ *	struct nft_af_info - nf_tables address family info
+ *
+ *	@list: used internally
+ *	@family: address family
+ *	@nhooks: number of hooks in this family
+ *	@owner: module owner
+ *	@tables: used internally
+ *	@hooks: hookfn overrides for packet validation
+ */
+struct nft_af_info {
+	struct list_head		list;
+	int				family;
+	unsigned int			nhooks;
+	struct module			*owner;
+	struct list_head		tables;
+	nf_hookfn			*hooks[NF_MAX_HOOKS];
+};
+
+extern int nft_register_afinfo(struct nft_af_info *);
+extern void nft_unregister_afinfo(struct nft_af_info *);
+
+extern int nft_register_table(struct nft_table *, int family);
+extern void nft_unregister_table(struct nft_table *, int family);
+
+extern int nft_register_expr(struct nft_expr_ops *);
+extern void nft_unregister_expr(struct nft_expr_ops *);
+
+#define MODULE_ALIAS_NFT_FAMILY(family)	\
+	MODULE_ALIAS("nft-afinfo-" __stringify(family))
+
+#define MODULE_ALIAS_NFT_TABLE(family, name) \
+	MODULE_ALIAS("nft-table-" __stringify(family) "-" name)
+
+#define MODULE_ALIAS_NFT_EXPR(name) \
+	MODULE_ALIAS("nft-expr-" name)
+
+#endif /* _NET_NF_TABLES_H */
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
new file mode 100644
index 000000000000..283396c916e0
--- /dev/null
+++ b/include/net/netfilter/nf_tables_core.h
@@ -0,0 +1,25 @@
+#ifndef _NET_NF_TABLES_CORE_H
+#define _NET_NF_TABLES_CORE_H
+
+extern int nf_tables_core_module_init(void);
+extern void nf_tables_core_module_exit(void);
+
+extern int nft_immediate_module_init(void);
+extern void nft_immediate_module_exit(void);
+
+extern int nft_cmp_module_init(void);
+extern void nft_cmp_module_exit(void);
+
+extern int nft_lookup_module_init(void);
+extern void nft_lookup_module_exit(void);
+
+extern int nft_bitwise_module_init(void);
+extern void nft_bitwise_module_exit(void);
+
+extern int nft_byteorder_module_init(void);
+extern void nft_byteorder_module_exit(void);
+
+extern int nft_payload_module_init(void);
+extern void nft_payload_module_exit(void);
+
+#endif /* _NET_NF_TABLES_CORE_H */
diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index 174915420d3f..6ce0b7f566a7 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -5,6 +5,7 @@ header-y += nf_conntrack_ftp.h
 header-y += nf_conntrack_sctp.h
 header-y += nf_conntrack_tcp.h
 header-y += nf_conntrack_tuple_common.h
+header-y += nf_tables.h
 header-y += nf_nat.h
 header-y += nfnetlink.h
 header-y += nfnetlink_acct.h
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 8dd803818ebe..319f47128db8 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -25,6 +25,10 @@ enum ip_conntrack_info {
 	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
 };
 
+#define NF_CT_STATE_INVALID_BIT			(1 << 0)
+#define NF_CT_STATE_BIT(ctinfo)			(1 << ((ctinfo) % IP_CT_IS_REPLY + 1))
+#define NF_CT_STATE_UNTRACKED_BIT		(1 << (IP_CT_NUMBER + 1))
+
 /* Bitset representing status of connection. */
 enum ip_conntrack_status {
 	/* It's an expected connection: bit 0 set.  This bit never changed */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
new file mode 100644
index 000000000000..ec6d84a8ed1e
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -0,0 +1,582 @@
+#ifndef _LINUX_NF_TABLES_H
+#define _LINUX_NF_TABLES_H
+
+#define NFT_CHAIN_MAXNAMELEN 32
+
+enum nft_registers {
+	NFT_REG_VERDICT,
+	NFT_REG_1,
+	NFT_REG_2,
+	NFT_REG_3,
+	NFT_REG_4,
+	__NFT_REG_MAX
+};
+#define NFT_REG_MAX	(__NFT_REG_MAX - 1)
+
+/**
+ * enum nft_verdicts - nf_tables internal verdicts
+ *
+ * @NFT_CONTINUE: continue evaluation of the current rule
+ * @NFT_BREAK: terminate evaluation of the current rule
+ * @NFT_JUMP: push the current chain on the jump stack and jump to a chain
+ * @NFT_GOTO: jump to a chain without pushing the current chain on the jump stack
+ * @NFT_RETURN: return to the topmost chain on the jump stack
+ *
+ * The nf_tables verdicts share their numeric space with the netfilter verdicts.
+ */
+enum nft_verdicts {
+	NFT_CONTINUE	= -1,
+	NFT_BREAK	= -2,
+	NFT_JUMP	= -3,
+	NFT_GOTO	= -4,
+	NFT_RETURN	= -5,
+};
+
+/**
+ * enum nf_tables_msg_types - nf_tables netlink message types
+ *
+ * @NFT_MSG_NEWTABLE: create a new table (enum nft_table_attributes)
+ * @NFT_MSG_GETTABLE: get a table (enum nft_table_attributes)
+ * @NFT_MSG_DELTABLE: delete a table (enum nft_table_attributes)
+ * @NFT_MSG_NEWCHAIN: create a new chain (enum nft_chain_attributes)
+ * @NFT_MSG_GETCHAIN: get a chain (enum nft_chain_attributes)
+ * @NFT_MSG_DELCHAIN: delete a chain (enum nft_chain_attributes)
+ * @NFT_MSG_NEWRULE: create a new rule (enum nft_rule_attributes)
+ * @NFT_MSG_GETRULE: get a rule (enum nft_rule_attributes)
+ * @NFT_MSG_DELRULE: delete a rule (enum nft_rule_attributes)
+ */
+enum nf_tables_msg_types {
+	NFT_MSG_NEWTABLE,
+	NFT_MSG_GETTABLE,
+	NFT_MSG_DELTABLE,
+	NFT_MSG_NEWCHAIN,
+	NFT_MSG_GETCHAIN,
+	NFT_MSG_DELCHAIN,
+	NFT_MSG_NEWRULE,
+	NFT_MSG_GETRULE,
+	NFT_MSG_DELRULE,
+	NFT_MSG_MAX,
+};
+
+enum nft_list_attributes {
+	NFTA_LIST_UNPEC,
+	NFTA_LIST_ELEM,
+	__NFTA_LIST_MAX
+};
+#define NFTA_LIST_MAX		(__NFTA_LIST_MAX - 1)
+
+/**
+ * enum nft_hook_attributes - nf_tables netfilter hook netlink attributes
+ *
+ * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32)
+ * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32)
+ */
+enum nft_hook_attributes {
+	NFTA_HOOK_UNSPEC,
+	NFTA_HOOK_HOOKNUM,
+	NFTA_HOOK_PRIORITY,
+	__NFTA_HOOK_MAX
+};
+#define NFTA_HOOK_MAX		(__NFTA_HOOK_MAX - 1)
+
+/**
+ * enum nft_table_attributes - nf_tables table netlink attributes
+ *
+ * @NFTA_TABLE_NAME: name of the table (NLA_STRING)
+ */
+enum nft_table_attributes {
+	NFTA_TABLE_UNSPEC,
+	NFTA_TABLE_NAME,
+	__NFTA_TABLE_MAX
+};
+#define NFTA_TABLE_MAX		(__NFTA_TABLE_MAX - 1)
+
+/**
+ * enum nft_chain_attributes - nf_tables chain netlink attributes
+ *
+ * @NFTA_CHAIN_TABLE: name of the table containing the chain (NLA_STRING)
+ * @NFTA_CHAIN_HANDLE: numeric handle of the chain (NLA_U64)
+ * @NFTA_CHAIN_NAME: name of the chain (NLA_STRING)
+ * @NFTA_CHAIN_HOOK: hook specification for basechains (NLA_NESTED: nft_hook_attributes)
+ */
+enum nft_chain_attributes {
+	NFTA_CHAIN_UNSPEC,
+	NFTA_CHAIN_TABLE,
+	NFTA_CHAIN_HANDLE,
+	NFTA_CHAIN_NAME,
+	NFTA_CHAIN_HOOK,
+	__NFTA_CHAIN_MAX
+};
+#define NFTA_CHAIN_MAX		(__NFTA_CHAIN_MAX - 1)
+
+/**
+ * enum nft_rule_attributes - nf_tables rule netlink attributes
+ *
+ * @NFTA_RULE_TABLE: name of the table containing the rule (NLA_STRING)
+ * @NFTA_RULE_CHAIN: name of the chain containing the rule (NLA_STRING)
+ * @NFTA_RULE_HANDLE: numeric handle of the rule (NLA_U64)
+ * @NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes)
+ */
+enum nft_rule_attributes {
+	NFTA_RULE_UNSPEC,
+	NFTA_RULE_TABLE,
+	NFTA_RULE_CHAIN,
+	NFTA_RULE_HANDLE,
+	NFTA_RULE_EXPRESSIONS,
+	__NFTA_RULE_MAX
+};
+#define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
+
+enum nft_data_attributes {
+	NFTA_DATA_UNSPEC,
+	NFTA_DATA_VALUE,
+	NFTA_DATA_VERDICT,
+	__NFTA_DATA_MAX
+};
+#define NFTA_DATA_MAX		(__NFTA_DATA_MAX - 1)
+
+/**
+ * enum nft_verdict_attributes - nf_tables verdict netlink attributes
+ *
+ * @NFTA_VERDICT_CODE: nf_tables verdict (NLA_U32: enum nft_verdicts)
+ * @NFTA_VERDICT_CHAIN: jump target chain name (NLA_STRING)
+ */
+enum nft_verdict_attributes {
+	NFTA_VERDICT_UNSPEC,
+	NFTA_VERDICT_CODE,
+	NFTA_VERDICT_CHAIN,
+	__NFTA_VERDICT_MAX
+};
+#define NFTA_VERDICT_MAX	(__NFTA_VERDICT_MAX - 1)
+
+/**
+ * enum nft_expr_attributes - nf_tables expression netlink attributes
+ *
+ * @NFTA_EXPR_NAME: name of the expression type (NLA_STRING)
+ * @NFTA_EXPR_DATA: type specific data (NLA_NESTED)
+ */
+enum nft_expr_attributes {
+	NFTA_EXPR_UNSPEC,
+	NFTA_EXPR_NAME,
+	NFTA_EXPR_DATA,
+	__NFTA_EXPR_MAX
+};
+#define NFTA_EXPR_MAX		(__NFTA_EXPR_MAX - 1)
+
+/**
+ * enum nft_immediate_attributes - nf_tables immediate expression netlink attributes
+ *
+ * @NFTA_IMMEDIATE_DREG: destination register to load data into (NLA_U32)
+ * @NFTA_IMMEDIATE_DATA: data to load (NLA_NESTED: nft_data_attributes)
+ */
+enum nft_immediate_attributes {
+	NFTA_IMMEDIATE_UNSPEC,
+	NFTA_IMMEDIATE_DREG,
+	NFTA_IMMEDIATE_DATA,
+	__NFTA_IMMEDIATE_MAX
+};
+#define NFTA_IMMEDIATE_MAX	(__NFTA_IMMEDIATE_MAX - 1)
+
+/**
+ * enum nft_bitwise_attributes - nf_tables bitwise expression netlink attributes
+ *
+ * @NFTA_BITWISE_SREG: source register (NLA_U32: nft_registers)
+ * @NFTA_BITWISE_DREG: destination register (NLA_U32: nft_registers)
+ * @NFTA_BITWISE_LEN: length of operands (NLA_U32)
+ * @NFTA_BITWISE_MASK: mask value (NLA_NESTED: nft_data_attributes)
+ * @NFTA_BITWISE_XOR: xor value (NLA_NESTED: nft_data_attributes)
+ *
+ * The bitwise expression performs the following operation:
+ *
+ * dreg = (sreg & mask) ^ xor
+ *
+ * which allow to express all bitwise operations:
+ *
+ * 		mask	xor
+ * NOT:		1	1
+ * OR:		0	x
+ * XOR:		1	x
+ * AND:		x	0
+ */
+enum nft_bitwise_attributes {
+	NFTA_BITWISE_UNSPEC,
+	NFTA_BITWISE_SREG,
+	NFTA_BITWISE_DREG,
+	NFTA_BITWISE_LEN,
+	NFTA_BITWISE_MASK,
+	NFTA_BITWISE_XOR,
+	__NFTA_BITWISE_MAX
+};
+#define NFTA_BITWISE_MAX	(__NFTA_BITWISE_MAX - 1)
+
+/**
+ * enum nft_byteorder_ops - nf_tables byteorder operators
+ *
+ * @NFT_BYTEORDER_NTOH: network to host operator
+ * @NFT_BYTEORDER_HTON: host to network opertaor
+ */
+enum nft_byteorder_ops {
+	NFT_BYTEORDER_NTOH,
+	NFT_BYTEORDER_HTON,
+};
+
+/**
+ * enum nft_byteorder_attributes - nf_tables byteorder expression netlink attributes
+ *
+ * @NFTA_BYTEORDER_SREG: source register (NLA_U32: nft_registers)
+ * @NFTA_BYTEORDER_DREG: destination register (NLA_U32: nft_registers)
+ * @NFTA_BYTEORDER_OP: operator (NLA_U32: enum nft_byteorder_ops)
+ * @NFTA_BYTEORDER_LEN: length of the data (NLA_U32)
+ * @NFTA_BYTEORDER_SIZE: data size in bytes (NLA_U32: 2 or 4)
+ */
+enum nft_byteorder_attributes {
+	NFTA_BYTEORDER_UNSPEC,
+	NFTA_BYTEORDER_SREG,
+	NFTA_BYTEORDER_DREG,
+	NFTA_BYTEORDER_OP,
+	NFTA_BYTEORDER_LEN,
+	NFTA_BYTEORDER_SIZE,
+	__NFTA_BYTEORDER_MAX
+};
+#define NFTA_BYTEORDER_MAX	(__NFTA_BYTEORDER_MAX - 1)
+
+/**
+ * enum nft_cmp_ops - nf_tables relational operator
+ *
+ * @NFT_CMP_EQ: equal
+ * @NFT_CMP_NEQ: not equal
+ * @NFT_CMP_LT: less than
+ * @NFT_CMP_LTE: less than or equal to
+ * @NFT_CMP_GT: greater than
+ * @NFT_CMP_GTE: greater than or equal to
+ */
+enum nft_cmp_ops {
+	NFT_CMP_EQ,
+	NFT_CMP_NEQ,
+	NFT_CMP_LT,
+	NFT_CMP_LTE,
+	NFT_CMP_GT,
+	NFT_CMP_GTE,
+};
+
+/**
+ * enum nft_cmp_attributes - nf_tables cmp expression netlink attributes
+ *
+ * @NFTA_CMP_SREG: source register of data to compare (NLA_U32: nft_registers)
+ * @NFTA_CMP_OP: cmp operation (NLA_U32: nft_cmp_ops)
+ * @NFTA_CMP_DATA: data to compare against (NLA_NESTED: nft_data_attributes)
+ */
+enum nft_cmp_attributes {
+	NFTA_CMP_UNSPEC,
+	NFTA_CMP_SREG,
+	NFTA_CMP_OP,
+	NFTA_CMP_DATA,
+	__NFTA_CMP_MAX
+};
+#define NFTA_CMP_MAX		(__NFTA_CMP_MAX - 1)
+
+enum nft_set_elem_flags {
+	NFT_SE_INTERVAL_END	= 0x1,
+};
+
+enum nft_set_elem_attributes {
+	NFTA_SE_UNSPEC,
+	NFTA_SE_KEY,
+	NFTA_SE_DATA,
+	NFTA_SE_FLAGS,
+	__NFTA_SE_MAX
+};
+#define NFTA_SE_MAX		(__NFTA_SE_MAX - 1)
+
+enum nft_set_flags {
+	NFT_SET_INTERVAL	= 0x1,
+	NFT_SET_MAP		= 0x2,
+};
+
+enum nft_set_attributes {
+	NFTA_SET_UNSPEC,
+	NFTA_SET_FLAGS,
+	NFTA_SET_SREG,
+	NFTA_SET_DREG,
+	NFTA_SET_KLEN,
+	NFTA_SET_DLEN,
+	NFTA_SET_ELEMENTS,
+	__NFTA_SET_MAX
+};
+#define NFTA_SET_MAX		(__NFTA_SET_MAX - 1)
+
+enum nft_hash_flags {
+	NFT_HASH_MAP		= 0x1,
+};
+
+enum nft_hash_elem_attributes {
+	NFTA_HE_UNSPEC,
+	NFTA_HE_KEY,
+	NFTA_HE_DATA,
+	__NFTA_HE_MAX
+};
+#define NFTA_HE_MAX		(__NFTA_HE_MAX - 1)
+
+enum nft_hash_attributes {
+	NFTA_HASH_UNSPEC,
+	NFTA_HASH_FLAGS,
+	NFTA_HASH_SREG,
+	NFTA_HASH_DREG,
+	NFTA_HASH_KLEN,
+	NFTA_HASH_ELEMENTS,
+	__NFTA_HASH_MAX
+};
+#define NFTA_HASH_MAX		(__NFTA_HASH_MAX - 1)
+
+/**
+ * enum nft_payload_bases - nf_tables payload expression offset bases
+ *
+ * @NFT_PAYLOAD_LL_HEADER: link layer header
+ * @NFT_PAYLOAD_NETWORK_HEADER: network header
+ * @NFT_PAYLOAD_TRANSPORT_HEADER: transport header
+ */
+enum nft_payload_bases {
+	NFT_PAYLOAD_LL_HEADER,
+	NFT_PAYLOAD_NETWORK_HEADER,
+	NFT_PAYLOAD_TRANSPORT_HEADER,
+};
+
+/**
+ * enum nft_payload_attributes - nf_tables payload expression netlink attributes
+ *
+ * @NFTA_PAYLOAD_DREG: destination register to load data into (NLA_U32: nft_registers)
+ * @NFTA_PAYLOAD_BASE: payload base (NLA_U32: nft_payload_bases)
+ * @NFTA_PAYLOAD_OFFSET: payload offset relative to base (NLA_U32)
+ * @NFTA_PAYLOAD_LEN: payload length (NLA_U32)
+ */
+enum nft_payload_attributes {
+	NFTA_PAYLOAD_UNSPEC,
+	NFTA_PAYLOAD_DREG,
+	NFTA_PAYLOAD_BASE,
+	NFTA_PAYLOAD_OFFSET,
+	NFTA_PAYLOAD_LEN,
+	__NFTA_PAYLOAD_MAX
+};
+#define NFTA_PAYLOAD_MAX	(__NFTA_PAYLOAD_MAX - 1)
+
+/**
+ * enum nft_exthdr_attributes - nf_tables IPv6 extension header expression netlink attributes
+ *
+ * @NFTA_EXTHDR_DREG: destination register (NLA_U32: nft_registers)
+ * @NFTA_EXTHDR_TYPE: extension header type (NLA_U8)
+ * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32)
+ * @NFTA_EXTHDR_LEN: extension header length (NLA_U32)
+ */
+enum nft_exthdr_attributes {
+	NFTA_EXTHDR_UNSPEC,
+	NFTA_EXTHDR_DREG,
+	NFTA_EXTHDR_TYPE,
+	NFTA_EXTHDR_OFFSET,
+	NFTA_EXTHDR_LEN,
+	__NFTA_EXTHDR_MAX
+};
+#define NFTA_EXTHDR_MAX		(__NFTA_EXTHDR_MAX - 1)
+
+/**
+ * enum nft_meta_keys - nf_tables meta expression keys
+ *
+ * @NFT_META_LEN: packet length (skb->len)
+ * @NFT_META_PROTOCOL: packet ethertype protocol (skb->protocol), invalid in OUTPUT
+ * @NFT_META_PRIORITY: packet priority (skb->priority)
+ * @NFT_META_MARK: packet mark (skb->mark)
+ * @NFT_META_IIF: packet input interface index (dev->ifindex)
+ * @NFT_META_OIF: packet output interface index (dev->ifindex)
+ * @NFT_META_IIFNAME: packet input interface name (dev->name)
+ * @NFT_META_OIFNAME: packet output interface name (dev->name)
+ * @NFT_META_IIFTYPE: packet input interface type (dev->type)
+ * @NFT_META_OIFTYPE: packet output interface type (dev->type)
+ * @NFT_META_SKUID: originating socket UID (fsuid)
+ * @NFT_META_SKGID: originating socket GID (fsgid)
+ * @NFT_META_NFTRACE: packet nftrace bit
+ * @NFT_META_RTCLASSID: realm value of packet's route (skb->dst->tclassid)
+ * @NFT_META_SECMARK: packet secmark (skb->secmark)
+ */
+enum nft_meta_keys {
+	NFT_META_LEN,
+	NFT_META_PROTOCOL,
+	NFT_META_PRIORITY,
+	NFT_META_MARK,
+	NFT_META_IIF,
+	NFT_META_OIF,
+	NFT_META_IIFNAME,
+	NFT_META_OIFNAME,
+	NFT_META_IIFTYPE,
+	NFT_META_OIFTYPE,
+	NFT_META_SKUID,
+	NFT_META_SKGID,
+	NFT_META_NFTRACE,
+	NFT_META_RTCLASSID,
+	NFT_META_SECMARK,
+};
+
+/**
+ * enum nft_meta_attributes - nf_tables meta expression netlink attributes
+ *
+ * @NFTA_META_DREG: destination register (NLA_U32)
+ * @NFTA_META_KEY: meta data item to load (NLA_U32: nft_meta_keys)
+ */
+enum nft_meta_attributes {
+	NFTA_META_UNSPEC,
+	NFTA_META_DREG,
+	NFTA_META_KEY,
+	__NFTA_META_MAX
+};
+#define NFTA_META_MAX		(__NFTA_META_MAX - 1)
+
+/**
+ * enum nft_ct_keys - nf_tables ct expression keys
+ *
+ * @NFT_CT_STATE: conntrack state (bitmask of enum ip_conntrack_info)
+ * @NFT_CT_DIRECTION: conntrack direction (enum ip_conntrack_dir)
+ * @NFT_CT_STATUS: conntrack status (bitmask of enum ip_conntrack_status)
+ * @NFT_CT_MARK: conntrack mark value
+ * @NFT_CT_SECMARK: conntrack secmark value
+ * @NFT_CT_EXPIRATION: relative conntrack expiration time in ms
+ * @NFT_CT_HELPER: connection tracking helper assigned to conntrack
+ * @NFT_CT_L3PROTOCOL: conntrack layer 3 protocol
+ * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address)
+ * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address)
+ * @NFT_CT_PROTOCOL: conntrack layer 4 protocol
+ * @NFT_CT_PROTO_SRC: conntrack layer 4 protocol source
+ * @NFT_CT_PROTO_DST: conntrack layer 4 protocol destination
+ */
+enum nft_ct_keys {
+	NFT_CT_STATE,
+	NFT_CT_DIRECTION,
+	NFT_CT_STATUS,
+	NFT_CT_MARK,
+	NFT_CT_SECMARK,
+	NFT_CT_EXPIRATION,
+	NFT_CT_HELPER,
+	NFT_CT_L3PROTOCOL,
+	NFT_CT_SRC,
+	NFT_CT_DST,
+	NFT_CT_PROTOCOL,
+	NFT_CT_PROTO_SRC,
+	NFT_CT_PROTO_DST,
+};
+
+/**
+ * enum nft_ct_attributes - nf_tables ct expression netlink attributes
+ *
+ * @NFTA_CT_DREG: destination register (NLA_U32)
+ * @NFTA_CT_KEY: conntrack data item to load (NLA_U32: nft_ct_keys)
+ * @NFTA_CT_DIRECTION: direction in case of directional keys (NLA_U8)
+ */
+enum nft_ct_attributes {
+	NFTA_CT_UNSPEC,
+	NFTA_CT_DREG,
+	NFTA_CT_KEY,
+	NFTA_CT_DIRECTION,
+	__NFTA_CT_MAX
+};
+#define NFTA_CT_MAX		(__NFTA_CT_MAX - 1)
+
+/**
+ * enum nft_limit_attributes - nf_tables limit expression netlink attributes
+ *
+ * @NFTA_LIMIT_RATE: refill rate (NLA_U64)
+ * @NFTA_LIMIT_UNIT: refill unit (NLA_U64)
+ */
+enum nft_limit_attributes {
+	NFTA_LIMIT_UNSPEC,
+	NFTA_LIMIT_RATE,
+	NFTA_LIMIT_UNIT,
+	__NFTA_LIMIT_MAX
+};
+#define NFTA_LIMIT_MAX		(__NFTA_LIMIT_MAX - 1)
+
+/**
+ * enum nft_counter_attributes - nf_tables counter expression netlink attributes
+ *
+ * @NFTA_COUNTER_BYTES: number of bytes (NLA_U64)
+ * @NFTA_COUNTER_PACKETS: number of packets (NLA_U64)
+ */
+enum nft_counter_attributes {
+	NFTA_COUNTER_UNSPEC,
+	NFTA_COUNTER_BYTES,
+	NFTA_COUNTER_PACKETS,
+	__NFTA_COUNTER_MAX
+};
+#define NFTA_COUNTER_MAX	(__NFTA_COUNTER_MAX - 1)
+
+/**
+ * enum nft_log_attributes - nf_tables log expression netlink attributes
+ *
+ * @NFTA_LOG_GROUP: netlink group to send messages to (NLA_U32)
+ * @NFTA_LOG_PREFIX: prefix to prepend to log messages (NLA_STRING)
+ * @NFTA_LOG_SNAPLEN: length of payload to include in netlink message (NLA_U32)
+ * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U32)
+ */
+enum nft_log_attributes {
+	NFTA_LOG_UNSPEC,
+	NFTA_LOG_GROUP,
+	NFTA_LOG_PREFIX,
+	NFTA_LOG_SNAPLEN,
+	NFTA_LOG_QTHRESHOLD,
+	__NFTA_LOG_MAX
+};
+#define NFTA_LOG_MAX		(__NFTA_LOG_MAX - 1)
+
+/**
+ * enum nft_reject_types - nf_tables reject expression reject types
+ *
+ * @NFT_REJECT_ICMP_UNREACH: reject using ICMP unreachable
+ * @NFT_REJECT_TCP_RST: reject using TCP RST
+ */
+enum nft_reject_types {
+	NFT_REJECT_ICMP_UNREACH,
+	NFT_REJECT_TCP_RST,
+};
+
+/**
+ * enum nft_reject_attributes - nf_tables reject expression netlink attributes
+ *
+ * @NFTA_REJECT_TYPE: packet type to use (NLA_U32: nft_reject_types)
+ * @NFTA_REJECT_ICMP_CODE: ICMP code to use (NLA_U8)
+ */
+enum nft_reject_attributes {
+	NFTA_REJECT_UNSPEC,
+	NFTA_REJECT_TYPE,
+	NFTA_REJECT_ICMP_CODE,
+	__NFTA_REJECT_MAX
+};
+#define NFTA_REJECT_MAX		(__NFTA_REJECT_MAX - 1)
+
+/**
+ * enum nft_nat_types - nf_tables nat expression NAT types
+ *
+ * @NFT_NAT_SNAT: source NAT
+ * @NFT_NAT_DNAT: destination NAT
+ */
+enum nft_nat_types {
+	NFT_NAT_SNAT,
+	NFT_NAT_DNAT,
+};
+
+/**
+ * enum nft_nat_attributes - nf_tables nat expression netlink attributes
+ *
+ * @NFTA_NAT_TYPE: NAT type (NLA_U32: nft_nat_types)
+ * @NFTA_NAT_ADDR_MIN: source register of address range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_ADDR_MAX: source register of address range end (NLA_U32: nft_registers)
+ * @NFTA_NAT_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
+ */
+enum nft_nat_attributes {
+	NFTA_NAT_UNSPEC,
+	NFTA_NAT_TYPE,
+	NFTA_NAT_ADDR_MIN,
+	NFTA_NAT_ADDR_MAX,
+	NFTA_NAT_PROTO_MIN,
+	NFTA_NAT_PROTO_MAX,
+	__NFTA_NAT_MAX
+};
+#define NFTA_NAT_MAX		(__NFTA_NAT_MAX - 1)
+
+#endif /* _LINUX_NF_TABLES_H */
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index 4a4efafad5f4..d276c3bd55b8 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -18,6 +18,8 @@ enum nfnetlink_groups {
 #define NFNLGRP_CONNTRACK_EXP_UPDATE	NFNLGRP_CONNTRACK_EXP_UPDATE
 	NFNLGRP_CONNTRACK_EXP_DESTROY,
 #define NFNLGRP_CONNTRACK_EXP_DESTROY	NFNLGRP_CONNTRACK_EXP_DESTROY
+	NFNLGRP_NFTABLES,
+#define NFNLGRP_NFTABLES                NFNLGRP_NFTABLES
 	__NFNLGRP_MAX,
 };
 #define NFNLGRP_MAX	(__NFNLGRP_MAX - 1)
@@ -51,6 +53,7 @@ struct nfgenmsg {
 #define NFNL_SUBSYS_ACCT		7
 #define NFNL_SUBSYS_CTNETLINK_TIMEOUT	8
 #define NFNL_SUBSYS_CTHELPER		9
-#define NFNL_SUBSYS_COUNT		10
+#define NFNL_SUBSYS_NFTABLES		10
+#define NFNL_SUBSYS_COUNT		11
 
 #endif /* _UAPI_NFNETLINK_H */
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index a9aff9c7d027..68f8128147be 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -1,6 +1,9 @@
 #
 # Bridge netfilter configuration
 #
+#
+config NF_TABLES_BRIDGE
+	tristate "Ethernet Bridge nf_tables support"
 
 menuconfig BRIDGE_NF_EBTABLES
 	tristate "Ethernet Bridge tables (ebtables) support"
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 0718699540b0..ea7629f58b3d 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -2,6 +2,8 @@
 # Makefile for the netfilter modules for Link Layer filtering on a bridge.
 #
 
+obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o
+
 obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o
 
 # tables
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
new file mode 100644
index 000000000000..bc5c21c911c0
--- /dev/null
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netfilter/nf_tables.h>
+
+static struct nft_af_info nft_af_bridge __read_mostly = {
+	.family		= NFPROTO_BRIDGE,
+	.nhooks		= NF_BR_NUMHOOKS,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nf_tables_bridge_init(void)
+{
+	return nft_register_afinfo(&nft_af_bridge);
+}
+
+static void __exit nf_tables_bridge_exit(void)
+{
+	nft_unregister_afinfo(&nft_af_bridge);
+}
+
+module_init(nf_tables_bridge_init);
+module_exit(nf_tables_bridge_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_BRIDGE);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1657e39b291f..eb1d56ece361 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,6 +36,22 @@ config NF_CONNTRACK_PROC_COMPAT
 
 	  If unsure, say Y.
 
+config NF_TABLES_IPV4
+	depends on NF_TABLES
+	tristate "IPv4 nf_tables support"
+
+config NFT_REJECT_IPV4
+	depends on NF_TABLES_IPV4
+	tristate "nf_tables IPv4 reject support"
+
+config NF_TABLE_ROUTE_IPV4
+	depends on NF_TABLES_IPV4
+	tristate "IPv4 nf_tables route table support"
+
+config NF_TABLE_NAT_IPV4
+	depends on NF_TABLES_IPV4
+	tristate "IPv4 nf_tables nat table support"
+
 config IP_NF_IPTABLES
 	tristate "IP tables support (required for filtering/masq/NAT)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3622b248b6dd..b2f01cd2cd65 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -27,6 +27,11 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
 # NAT protocols (nf_nat)
 obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
 
+obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
+obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NF_TABLE_ROUTE_IPV4) += nf_table_route_ipv4.o
+obj-$(CONFIG_NF_TABLE_NAT_IPV4) += nf_table_nat_ipv4.o
+
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
 
diff --git a/net/ipv4/netfilter/nf_table_nat_ipv4.c b/net/ipv4/netfilter/nf_table_nat_ipv4.c
new file mode 100644
index 000000000000..2a6f184c10bd
--- /dev/null
+++ b/net/ipv4/netfilter/nf_table_nat_ipv4.c
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+struct nft_nat {
+	enum nft_registers	sreg_addr_min:8;
+	enum nft_registers	sreg_addr_max:8;
+	enum nft_registers	sreg_proto_min:8;
+	enum nft_registers	sreg_proto_max:8;
+	enum nf_nat_manip_type	type;
+};
+
+static void nft_nat_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+	struct nf_nat_range range;
+
+	memset(&range, 0, sizeof(range));
+	if (priv->sreg_addr_min) {
+		range.min_addr.ip = data[priv->sreg_addr_min].data[0];
+		range.max_addr.ip = data[priv->sreg_addr_max].data[0];
+		range.flags |= NF_NAT_RANGE_MAP_IPS;
+	}
+
+	if (priv->sreg_proto_min) {
+		range.min_proto.all = data[priv->sreg_proto_min].data[0];
+		range.max_proto.all = data[priv->sreg_proto_max].data[0];
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+	}
+
+	data[NFT_REG_VERDICT].verdict =
+		nf_nat_setup_info(ct, &range, priv->type);
+}
+
+static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
+	[NFTA_NAT_ADDR_MIN]	= { .type = NLA_U32 },
+	[NFTA_NAT_ADDR_MAX]	= { .type = NLA_U32 },
+	[NFTA_NAT_PROTO_MIN]	= { .type = NLA_U32 },
+	[NFTA_NAT_PROTO_MAX]	= { .type = NLA_U32 },
+	[NFTA_NAT_TYPE]		= { .type = NLA_U32 },
+};
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_nat *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_NAT_TYPE] == NULL)
+		return -EINVAL;
+
+	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
+	case NFT_NAT_SNAT:
+		priv->type = NF_NAT_MANIP_SRC;
+		break;
+	case NFT_NAT_DNAT:
+		priv->type = NF_NAT_MANIP_DST;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (tb[NFTA_NAT_ADDR_MIN]) {
+		priv->sreg_addr_min = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MIN]));
+		err = nft_validate_input_register(priv->sreg_addr_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_ADDR_MAX]) {
+		priv->sreg_addr_max = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MAX]));
+		err = nft_validate_input_register(priv->sreg_addr_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_addr_max = priv->sreg_addr_min;
+
+	if (tb[NFTA_NAT_PROTO_MIN]) {
+		priv->sreg_proto_min = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MIN]));
+		err = nft_validate_input_register(priv->sreg_proto_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_PROTO_MAX]) {
+		priv->sreg_proto_max = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MAX]));
+		err = nft_validate_input_register(priv->sreg_proto_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_proto_max = priv->sreg_proto_min;
+
+	return 0;
+}
+
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+
+	switch (priv->type) {
+	case NF_NAT_MANIP_SRC:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
+			goto nla_put_failure;
+		break;
+	case NF_NAT_MANIP_DST:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
+			goto nla_put_failure;
+		break;
+	}
+
+	if (nla_put_be32(skb, NFTA_NAT_ADDR_MIN, htonl(priv->sreg_addr_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_ADDR_MAX, htonl(priv->sreg_addr_max)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_PROTO_MIN, htonl(priv->sreg_proto_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_PROTO_MAX, htonl(priv->sreg_proto_max)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_nat_ops __read_mostly = {
+	.name		= "nat",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_nat_eval,
+	.init		= nft_nat_init,
+	.dump		= nft_nat_dump,
+	.policy		= nft_nat_policy,
+	.maxattr	= NFTA_NAT_MAX,
+};
+
+/*
+ * NAT table
+ */
+
+static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
+			      struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_nat *nat;
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	unsigned int ret;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
+
+	nat = nfct_nat(ct);
+	if (nat == NULL) {
+		/* Conntrack module was loaded late, can't add extension. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL)
+			return NF_ACCEPT;
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED + IP_CT_IS_REPLY:
+		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   ops->hooknum))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall through */
+	case IP_CT_NEW:
+		if (nf_nat_initialized(ct, maniptype))
+			break;
+
+		ret = nft_do_chain(ops, skb, in, out, okfn);
+		if (ret != NF_ACCEPT)
+			return ret;
+		if (!nf_nat_initialized(ct, maniptype)) {
+			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+			if (ret != NF_ACCEPT)
+				return ret;
+		}
+	default:
+		break;
+	}
+
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+}
+
+static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	__be32 daddr = ip_hdr(skb)->daddr;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    ip_hdr(skb)->daddr != daddr) {
+		skb_dst_drop(skb);
+	}
+	return ret;
+}
+
+static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo __maybe_unused;
+	const struct nf_conn *ct __maybe_unused;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.src.u3.ip !=
+		    ct->tuplehash[!dir].tuple.dst.u3.ip ||
+		    ct->tuplehash[dir].tuple.src.u.all !=
+		    ct->tuplehash[!dir].tuple.dst.u.all)
+			return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
+								ret : NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+		    ct->tuplehash[!dir].tuple.src.u3.ip) {
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (nf_xfrm_me_harder(skb, AF_INET))
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+static struct nft_base_chain nf_chain_nat_prerouting __read_mostly = {
+	.chain	= {
+		.name		= "PREROUTING",
+		.rules		= LIST_HEAD_INIT(nf_chain_nat_prerouting.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_nat_prerouting,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_NAT_DST,
+		.priv		= &nf_chain_nat_prerouting.chain,
+	},
+};
+
+static struct nft_base_chain nf_chain_nat_postrouting __read_mostly = {
+	.chain	= {
+		.name		= "POSTROUTING",
+		.rules		= LIST_HEAD_INIT(nf_chain_nat_postrouting.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_nat_postrouting,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_NAT_SRC,
+		.priv		= &nf_chain_nat_postrouting.chain,
+	},
+};
+
+static struct nft_base_chain nf_chain_nat_output __read_mostly = {
+	.chain	= {
+		.name		= "OUTPUT",
+		.rules		= LIST_HEAD_INIT(nf_chain_nat_output.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_nat_output,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST,
+		.priv		= &nf_chain_nat_output.chain,
+	},
+};
+
+static struct nft_base_chain nf_chain_nat_input __read_mostly = {
+	.chain	= {
+		.name		= "INPUT",
+		.rules		= LIST_HEAD_INIT(nf_chain_nat_input.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_nat_fn,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC,
+		.priv		= &nf_chain_nat_input.chain,
+	},
+};
+
+
+static struct nft_table nf_table_nat_ipv4 __read_mostly = {
+	.name	= "nat",
+	.chains	= LIST_HEAD_INIT(nf_table_nat_ipv4.chains),
+};
+
+static int __init nf_table_nat_init(void)
+{
+	int err;
+
+	list_add_tail(&nf_chain_nat_prerouting.chain.list,
+		      &nf_table_nat_ipv4.chains);
+	list_add_tail(&nf_chain_nat_postrouting.chain.list,
+		      &nf_table_nat_ipv4.chains);
+	list_add_tail(&nf_chain_nat_output.chain.list,
+		      &nf_table_nat_ipv4.chains);
+	list_add_tail(&nf_chain_nat_input.chain.list,
+		      &nf_table_nat_ipv4.chains);
+
+	err = nft_register_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
+	if (err < 0)
+		goto err1;
+
+	err = nft_register_expr(&nft_nat_ops);
+	if (err < 0)
+		goto err2;
+
+	return 0;
+
+err2:
+	nft_unregister_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
+err1:
+	return err;
+}
+
+static void __exit nf_table_nat_exit(void)
+{
+	nft_unregister_expr(&nft_nat_ops);
+	nft_unregister_table(&nf_table_nat_ipv4, AF_INET);
+}
+
+module_init(nf_table_nat_init);
+module_exit(nf_table_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_TABLE(AF_INET, "nat");
+MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv4/netfilter/nf_table_route_ipv4.c b/net/ipv4/netfilter/nf_table_route_ipv4.c
new file mode 100644
index 000000000000..4f257a1ed661
--- /dev/null
+++ b/net/ipv4/netfilter/nf_table_route_ipv4.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
+					int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	u32 mark;
+	__be32 saddr, daddr;
+	u_int8_t tos;
+	const struct iphdr *iph;
+
+	/* root is playing with raw sockets. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	mark = skb->mark;
+	iph = ip_hdr(skb);
+	saddr = iph->saddr;
+	daddr = iph->daddr;
+	tos = iph->tos;
+
+	ret = nft_do_chain(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_QUEUE) {
+		iph = ip_hdr(skb);
+
+		if (iph->saddr != saddr ||
+		    iph->daddr != daddr ||
+		    skb->mark != mark ||
+		    iph->tos != tos)
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+	}
+	return ret;
+}
+
+static struct nft_base_chain nf_chain_route_output __read_mostly = {
+	.chain	= {
+		.name		= "OUTPUT",
+		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_route_table_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_MANGLE,
+		.priv		= &nf_chain_route_output.chain,
+	},
+};
+
+static struct nft_table nf_table_route_ipv4 __read_mostly = {
+	.name	= "route",
+	.chains	= LIST_HEAD_INIT(nf_table_route_ipv4.chains),
+};
+
+static int __init nf_table_route_init(void)
+{
+	list_add_tail(&nf_chain_route_output.chain.list,
+		      &nf_table_route_ipv4.chains);
+	return nft_register_table(&nf_table_route_ipv4, NFPROTO_IPV4);
+}
+
+static void __exit nf_table_route_exit(void)
+{
+	nft_unregister_table(&nf_table_route_ipv4, NFPROTO_IPV4);
+}
+
+module_init(nf_table_route_init);
+module_exit(nf_table_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_TABLE(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
new file mode 100644
index 000000000000..63d0a3bf53d3
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/ip.h>
+
+static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
+				    struct sk_buff *skb,
+				    const struct net_device *in,
+				    const struct net_device *out,
+				    int (*okfn)(struct sk_buff *))
+{
+	if (unlikely(skb->len < sizeof(struct iphdr) ||
+		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
+		if (net_ratelimit())
+			pr_info("nf_tables_ipv4: ignoring short SOCK_RAW "
+				"packet\n");
+		return NF_ACCEPT;
+	}
+
+	return nft_do_chain(ops, skb, in, out, okfn);
+}
+
+static struct nft_af_info nft_af_ipv4 __read_mostly = {
+	.family		= NFPROTO_IPV4,
+	.nhooks		= NF_INET_NUMHOOKS,
+	.owner		= THIS_MODULE,
+	.hooks		= {
+		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
+	},
+};
+
+static int __init nf_tables_ipv4_init(void)
+{
+	return nft_register_afinfo(&nft_af_ipv4);
+}
+
+static void __exit nf_tables_ipv4_exit(void)
+{
+	nft_unregister_afinfo(&nft_af_ipv4);
+}
+
+module_init(nf_tables_ipv4_init);
+module_exit(nf_tables_ipv4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_INET);
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
new file mode 100644
index 000000000000..b4ee8d3bb1e4
--- /dev/null
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/icmp.h>
+
+struct nft_reject {
+	enum nft_reject_types	type:8;
+	u8			icmp_code;
+};
+
+static void nft_reject_eval(const struct nft_expr *expr,
+			      struct nft_data data[NFT_REG_MAX + 1],
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_reject *priv = nft_expr_priv(expr);
+
+	switch (priv->type) {
+	case NFT_REJECT_ICMP_UNREACH:
+		icmp_send(pkt->skb, ICMP_DEST_UNREACH, priv->icmp_code, 0);
+		break;
+	case NFT_REJECT_TCP_RST:
+		break;
+	}
+
+	data[NFT_REG_VERDICT].verdict = NF_DROP;
+}
+
+static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
+	[NFTA_REJECT_TYPE]		= { .type = NLA_U32 },
+	[NFTA_REJECT_ICMP_CODE]		= { .type = NLA_U8 },
+};
+
+static int nft_reject_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_reject *priv = nft_expr_priv(expr);
+
+	if (tb[NFTA_REJECT_TYPE] == NULL)
+		return -EINVAL;
+
+	priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
+	switch (priv->type) {
+	case NFT_REJECT_ICMP_UNREACH:
+		if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
+			return -EINVAL;
+		priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+	case NFT_REJECT_TCP_RST:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_reject *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_REJECT_TYPE, priv->type))
+		goto nla_put_failure;
+
+	switch (priv->type) {
+	case NFT_REJECT_ICMP_UNREACH:
+		if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
+			goto nla_put_failure;
+		break;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops reject_ops __read_mostly = {
+	.name		= "reject",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_reject_eval,
+	.init		= nft_reject_init,
+	.dump		= nft_reject_dump,
+	.policy		= nft_reject_policy,
+	.maxattr	= NFTA_REJECT_MAX,
+};
+
+static int __init nft_reject_module_init(void)
+{
+	return nft_register_expr(&reject_ops);
+}
+
+static void __exit nft_reject_module_exit(void)
+{
+	nft_unregister_expr(&reject_ops);
+}
+
+module_init(nft_reject_module_init);
+module_exit(nft_reject_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("reject");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index a7f842b29b67..5677e38eeca3 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -25,6 +25,14 @@ config NF_CONNTRACK_IPV6
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NF_TABLES_IPV6
+	depends on NF_TABLES
+	tristate "IPv6 nf_tables support"
+
+config NF_TABLE_ROUTE_IPV6
+	depends on NF_TABLES_IPV6
+	tristate "IPv6 nf_tables route table support"
+
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
 	depends on INET && IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 2b53738f798c..956af4492d10 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -23,6 +23,10 @@ obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
 nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
 obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 
+# nf_tables
+obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
+obj-$(CONFIG_NF_TABLE_ROUTE_IPV6) += nf_table_route_ipv6.o
+
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
 obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
diff --git a/net/ipv6/netfilter/nf_table_route_ipv6.c b/net/ipv6/netfilter/nf_table_route_ipv6.c
new file mode 100644
index 000000000000..48ac65c7b398
--- /dev/null
+++ b/net/ipv6/netfilter/nf_table_route_ipv6.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/route.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
+					int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	struct in6_addr saddr, daddr;
+	u_int8_t hop_limit;
+	u32 mark, flowlabel;
+
+	/* save source/dest address, mark, hoplimit, flowlabel, priority */
+	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
+	memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
+	mark = skb->mark;
+	hop_limit = ipv6_hdr(skb)->hop_limit;
+
+	/* flowlabel and prio (includes version, which shouldn't change either */
+	flowlabel = *((u32 *)ipv6_hdr(skb));
+
+	ret = nft_do_chain(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_QUEUE &&
+	    (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
+	     memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
+	     skb->mark != mark ||
+	     ipv6_hdr(skb)->hop_limit != hop_limit ||
+	     flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
+		return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP;
+
+	return ret;
+}
+
+static struct nft_base_chain nf_chain_route_output __read_mostly = {
+	.chain	= {
+		.name		= "OUTPUT",
+		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_route_table_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_MANGLE,
+		.priv		= &nf_chain_route_output.chain,
+	},
+};
+
+static struct nft_table nf_table_route_ipv6 __read_mostly = {
+	.name	= "route",
+	.chains	= LIST_HEAD_INIT(nf_table_route_ipv6.chains),
+};
+
+static int __init nf_table_route_init(void)
+{
+	list_add_tail(&nf_chain_route_output.chain.list,
+		      &nf_table_route_ipv6.chains);
+	return nft_register_table(&nf_table_route_ipv6, NFPROTO_IPV6);
+}
+
+static void __exit nf_table_route_exit(void)
+{
+	nft_unregister_table(&nf_table_route_ipv6, NFPROTO_IPV6);
+}
+
+module_init(nf_table_route_init);
+module_exit(nf_table_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_TABLE(AF_INET6, "route");
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
new file mode 100644
index 000000000000..e0717cea4913
--- /dev/null
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+
+static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
+				    struct sk_buff *skb,
+				    const struct net_device *in,
+				    const struct net_device *out,
+				    int (*okfn)(struct sk_buff *))
+{
+	if (unlikely(skb->len < sizeof(struct ipv6hdr))) {
+		if (net_ratelimit())
+			pr_info("nf_tables_ipv6: ignoring short SOCK_RAW "
+				"packet\n");
+		return NF_ACCEPT;
+	}
+
+	return nft_do_chain(ops, skb, in, out, okfn);
+}
+
+static struct nft_af_info nft_af_ipv6 __read_mostly = {
+	.family		= NFPROTO_IPV6,
+	.nhooks		= NF_INET_NUMHOOKS,
+	.owner		= THIS_MODULE,
+	.hooks		= {
+		[NF_INET_LOCAL_OUT]	= nft_ipv6_output,
+	},
+};
+
+static int __init nf_tables_ipv6_init(void)
+{
+	return nft_register_afinfo(&nft_af_ipv6);
+}
+
+static void __exit nf_tables_ipv6_exit(void)
+{
+	nft_unregister_afinfo(&nft_af_ipv6);
+}
+
+module_init(nf_tables_ipv6_init);
+module_exit(nf_tables_ipv6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_INET6);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 6e839b6dff2b..c271e1af93b5 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -413,6 +413,43 @@ config NETFILTER_SYNPROXY
 
 endif # NF_CONNTRACK
 
+config NF_TABLES
+	depends on NETFILTER_NETLINK
+	tristate "Netfilter nf_tables support"
+
+config NFT_EXTHDR
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables IPv6 exthdr module"
+
+config NFT_META
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables meta module"
+
+config NFT_CT
+	depends on NF_TABLES
+	depends on NF_CONNTRACK
+	tristate "Netfilter nf_tables conntrack module"
+
+config NFT_SET
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables set module"
+
+config NFT_HASH
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables hash module"
+
+config NFT_COUNTER
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables counter module"
+
+config NFT_LOG
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables log module"
+
+config NFT_LIMIT
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables limit module"
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index c3a0a12907f6..1ca3f3932826 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -64,6 +64,22 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 # SYNPROXY
 obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
 
+# nf_tables
+nf_tables-objs += nf_tables_core.o nf_tables_api.o
+nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o
+nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+
+obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
+obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
+obj-$(CONFIG_NFT_META)		+= nft_meta.o
+obj-$(CONFIG_NFT_CT)		+= nft_ct.o
+obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
+#nf_tables-objs			+= nft_meta_target.o
+obj-$(CONFIG_NFT_SET)		+= nft_set.o
+obj-$(CONFIG_NFT_HASH)		+= nft_hash.o
+obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o
+obj-$(CONFIG_NFT_LOG)		+= nft_log.o
+
 # generic X tables 
 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
new file mode 100644
index 000000000000..7d59c89c6c75
--- /dev/null
+++ b/net/netfilter/nf_tables_api.c
@@ -0,0 +1,1760 @@
+/*
+ * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/sock.h>
+
+static LIST_HEAD(nf_tables_afinfo);
+static LIST_HEAD(nf_tables_expressions);
+
+/**
+ *	nft_register_afinfo - register nf_tables address family info
+ *
+ *	@afi: address family info to register
+ *
+ *	Register the address family for use with nf_tables. Returns zero on
+ *	success or a negative errno code otherwise.
+ */
+int nft_register_afinfo(struct nft_af_info *afi)
+{
+	INIT_LIST_HEAD(&afi->tables);
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_tail(&afi->list, &nf_tables_afinfo);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_afinfo);
+
+/**
+ *	nft_unregister_afinfo - unregister nf_tables address family info
+ *
+ *	@afi: address family info to unregister
+ *
+ *	Unregister the address family for use with nf_tables.
+ */
+void nft_unregister_afinfo(struct nft_af_info *afi)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del(&afi->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_afinfo);
+
+static struct nft_af_info *nft_afinfo_lookup(int family)
+{
+	struct nft_af_info *afi;
+
+	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+		if (afi->family == family)
+			return afi;
+	}
+	return NULL;
+}
+
+static struct nft_af_info *nf_tables_afinfo_lookup(int family, bool autoload)
+{
+	struct nft_af_info *afi;
+
+	afi = nft_afinfo_lookup(family);
+	if (afi != NULL)
+		return afi;
+#ifdef CONFIG_MODULES
+	if (autoload) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-afinfo-%u", family);
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		afi = nft_afinfo_lookup(family);
+		if (afi != NULL)
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-EAFNOSUPPORT);
+}
+
+/*
+ * Tables
+ */
+
+static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
+					  const struct nlattr *nla)
+{
+	struct nft_table *table;
+
+	list_for_each_entry(table, &afi->tables, list) {
+		if (!nla_strcmp(nla, table->name))
+			return table;
+	}
+	return NULL;
+}
+
+static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
+						const struct nlattr *nla,
+						bool autoload)
+{
+	struct nft_table *table;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	table = nft_table_lookup(afi, nla);
+	if (table != NULL)
+		return table;
+
+#ifdef CONFIG_MODULES
+	if (autoload) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-table-%u-%*.s", afi->family,
+			       nla_len(nla)-1, (const char *)nla_data(nla));
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (nft_table_lookup(afi, nla))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-ENOENT);
+}
+
+static inline u64 nf_tables_alloc_handle(struct nft_table *table)
+{
+	return ++table->hgenerator;
+}
+
+static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
+	[NFTA_TABLE_NAME]	= { .type = NLA_STRING },
+};
+
+static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq,
+				     int event, u32 flags, int family,
+				     const struct nft_table *table)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= 0;
+
+	if (nla_put_string(skb, NFTA_TABLE_NAME, table->name))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+static int nf_tables_table_notify(const struct sk_buff *oskb,
+				  const struct nlmsghdr *nlh,
+				  const struct nft_table *table,
+				  int event, int family)
+{
+	struct sk_buff *skb;
+	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+	u32 seq = nlh ? nlh->nlmsg_seq : 0;
+	struct net *net = oskb ? sock_net(oskb->sk) : &init_net;
+	bool report;
+	int err;
+
+	report = nlh ? nlmsg_report(nlh) : false;
+	if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+		return 0;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_table_info(skb, portid, seq, event, 0,
+					family, table);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report,
+			     GFP_KERNEL);
+err:
+	if (err < 0)
+		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+	return err;
+}
+
+static int nf_tables_dump_tables(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	int family = nfmsg->nfgen_family;
+
+	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry(table, &afi->tables, list) {
+			if (idx < s_idx)
+				goto cont;
+			if (idx > s_idx)
+				memset(&cb->args[1], 0,
+				       sizeof(cb->args) - sizeof(cb->args[0]));
+			if (nf_tables_fill_table_info(skb,
+						      NETLINK_CB(cb->skb).portid,
+						      cb->nlh->nlmsg_seq,
+						      NFT_MSG_NEWTABLE,
+						      NLM_F_MULTI,
+						      afi->family, table) < 0)
+				goto done;
+cont:
+			idx++;
+		}
+	}
+done:
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	struct sk_buff *skb2;
+	int family = nfmsg->nfgen_family;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_tables,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid,
+					nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
+					family, table);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nlattr *name;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	int family = nfmsg->nfgen_family;
+
+	afi = nf_tables_afinfo_lookup(family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	name = nla[NFTA_TABLE_NAME];
+	table = nf_tables_table_lookup(afi, name, false);
+	if (IS_ERR(table)) {
+		if (PTR_ERR(table) != -ENOENT)
+			return PTR_ERR(table);
+		table = NULL;
+	}
+
+	if (table != NULL) {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EOPNOTSUPP;
+		return 0;
+	}
+
+	table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL);
+	if (table == NULL)
+		return -ENOMEM;
+
+	nla_strlcpy(table->name, name, nla_len(name));
+	INIT_LIST_HEAD(&table->chains);
+
+	list_add_tail(&table->list, &afi->tables);
+	nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family);
+	return 0;
+}
+
+static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	int family = nfmsg->nfgen_family;
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	if (table->flags & NFT_TABLE_BUILTIN)
+		return -EOPNOTSUPP;
+
+	if (table->use)
+		return -EBUSY;
+
+	list_del(&table->list);
+	nf_tables_table_notify(skb, nlh, table, NFT_MSG_DELTABLE, family);
+	kfree(table);
+	return 0;
+}
+
+static struct nft_table *__nf_tables_table_lookup(const struct nft_af_info *afi,
+						  const char *name)
+{
+	struct nft_table *table;
+
+	list_for_each_entry(table, &afi->tables, list) {
+		if (!strcmp(name, table->name))
+			return table;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_chain_notify(const struct sk_buff *oskb,
+				  const struct nlmsghdr *nlh,
+				  const struct nft_table *table,
+				  const struct nft_chain *chain,
+				  int event, int family);
+
+/**
+ *	nft_register_table - register a built-in table
+ *
+ *	@table: the table to register
+ *	@family: protocol family to register table with
+ *
+ *	Register a built-in table for use with nf_tables. Returns zero on
+ *	success or a negative errno code otherwise.
+ */
+int nft_register_table(struct nft_table *table, int family)
+{
+	struct nft_af_info *afi;
+	struct nft_table *t;
+	struct nft_chain *chain;
+	int err;
+
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+again:
+	afi = nf_tables_afinfo_lookup(family, true);
+	if (IS_ERR(afi)) {
+		err = PTR_ERR(afi);
+		if (err == -EAGAIN)
+			goto again;
+		goto err;
+	}
+
+	t = __nf_tables_table_lookup(afi, table->name);
+	if (IS_ERR(t)) {
+		err = PTR_ERR(t);
+		if (err != -ENOENT)
+			goto err;
+		t = NULL;
+	}
+
+	if (t != NULL) {
+		err = -EEXIST;
+		goto err;
+	}
+
+	table->flags |= NFT_TABLE_BUILTIN;
+	list_add_tail(&table->list, &afi->tables);
+	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_NEWTABLE, family);
+	list_for_each_entry(chain, &table->chains, list)
+		nf_tables_chain_notify(NULL, NULL, table, chain,
+				       NFT_MSG_NEWCHAIN, family);
+	err = 0;
+err:
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nft_register_table);
+
+/**
+ *	nft_unregister_table - unregister a built-in table
+ *
+ *	@table: the table to unregister
+ *	@family: protocol family to unregister table with
+ *
+ *	Unregister a built-in table for use with nf_tables.
+ */
+void nft_unregister_table(struct nft_table *table, int family)
+{
+	struct nft_chain *chain;
+
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del(&table->list);
+	list_for_each_entry(chain, &table->chains, list)
+		nf_tables_chain_notify(NULL, NULL, table, chain,
+				       NFT_MSG_DELCHAIN, family);
+	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_DELTABLE, family);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_table);
+
+/*
+ * Chains
+ */
+
+static struct nft_chain *
+nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle)
+{
+	struct nft_chain *chain;
+
+	list_for_each_entry(chain, &table->chains, list) {
+		if (chain->handle == handle)
+			return chain;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
+						const struct nlattr *nla)
+{
+	struct nft_chain *chain;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	list_for_each_entry(chain, &table->chains, list) {
+		if (!nla_strcmp(nla, chain->name))
+			return chain;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
+	[NFTA_CHAIN_TABLE]	= { .type = NLA_STRING },
+	[NFTA_CHAIN_HANDLE]	= { .type = NLA_U64 },
+	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING,
+				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
+	[NFTA_CHAIN_HOOK]	= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
+	[NFTA_HOOK_HOOKNUM]	= { .type = NLA_U32 },
+	[NFTA_HOOK_PRIORITY]	= { .type = NLA_U32 },
+};
+
+static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
+				     int event, u32 flags, int family,
+				     const struct nft_table *table,
+				     const struct nft_chain *chain)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= 0;
+
+	if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle)))
+		goto nla_put_failure;
+	if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
+		goto nla_put_failure;
+
+	if (chain->flags & NFT_BASE_CHAIN) {
+		const struct nf_hook_ops *ops = &nft_base_chain(chain)->ops;
+		struct nlattr *nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
+		if (nest == NULL)
+			goto nla_put_failure;
+		if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
+			goto nla_put_failure;
+		if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
+			goto nla_put_failure;
+		nla_nest_end(skb, nest);
+	}
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+static int nf_tables_chain_notify(const struct sk_buff *oskb,
+				  const struct nlmsghdr *nlh,
+				  const struct nft_table *table,
+				  const struct nft_chain *chain,
+				  int event, int family)
+{
+	struct sk_buff *skb;
+	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+	struct net *net = oskb ? sock_net(oskb->sk) : &init_net;
+	u32 seq = nlh ? nlh->nlmsg_seq : 0;
+	bool report;
+	int err;
+
+	report = nlh ? nlmsg_report(nlh) : false;
+	if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+		return 0;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_chain_info(skb, portid, seq, event, 0, family,
+					table, chain);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report,
+			     GFP_KERNEL);
+err:
+	if (err < 0)
+		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+	return err;
+}
+
+static int nf_tables_dump_chains(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	int family = nfmsg->nfgen_family;
+
+	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry(table, &afi->tables, list) {
+			list_for_each_entry(chain, &table->chains, list) {
+				if (idx < s_idx)
+					goto cont;
+				if (idx > s_idx)
+					memset(&cb->args[1], 0,
+					       sizeof(cb->args) - sizeof(cb->args[0]));
+				if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid,
+							      cb->nlh->nlmsg_seq,
+							      NFT_MSG_NEWCHAIN,
+							      NLM_F_MULTI,
+							      afi->family, table, chain) < 0)
+					goto done;
+cont:
+				idx++;
+			}
+		}
+	}
+done:
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+
+static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	struct sk_buff *skb2;
+	int family = nfmsg->nfgen_family;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_chains,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid,
+					nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
+					family, table, chain);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nlattr * uninitialized_var(name);
+	const struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_chain *chain;
+	struct nft_base_chain *basechain;
+	struct nlattr *ha[NFTA_HOOK_MAX + 1];
+	int family = nfmsg->nfgen_family;
+	u64 handle = 0;
+	int err;
+	bool create;
+
+	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+	afi = nf_tables_afinfo_lookup(family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], create);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	if (table->use == UINT_MAX)
+		return -EOVERFLOW;
+
+	chain = NULL;
+	name = nla[NFTA_CHAIN_NAME];
+
+	if (nla[NFTA_CHAIN_HANDLE]) {
+		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
+		chain = nf_tables_chain_lookup_byhandle(table, handle);
+		if (IS_ERR(chain))
+			return PTR_ERR(chain);
+	} else {
+		chain = nf_tables_chain_lookup(table, name);
+		if (IS_ERR(chain)) {
+			if (PTR_ERR(chain) != -ENOENT)
+				return PTR_ERR(chain);
+			chain = NULL;
+		}
+	}
+
+	if (chain != NULL) {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EOPNOTSUPP;
+
+		if (nla[NFTA_CHAIN_HANDLE] && name &&
+		    !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
+			return -EEXIST;
+
+		if (nla[NFTA_CHAIN_HANDLE] && name)
+			nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+
+		goto notify;
+	}
+
+	if (nla[NFTA_CHAIN_HOOK]) {
+		struct nf_hook_ops *ops;
+
+		err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
+				       nft_hook_policy);
+		if (err < 0)
+			return err;
+		if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
+		    ha[NFTA_HOOK_PRIORITY] == NULL)
+			return -EINVAL;
+		if (ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])) >= afi->nhooks)
+			return -EINVAL;
+
+		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+		if (basechain == NULL)
+			return -ENOMEM;
+		chain = &basechain->chain;
+
+		ops = &basechain->ops;
+		ops->pf		= family;
+		ops->owner	= afi->owner;
+		ops->hooknum	= ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+		ops->priority	= ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+		ops->priv	= chain;
+		ops->hook	= nft_do_chain;
+		if (afi->hooks[ops->hooknum])
+			ops->hook = afi->hooks[ops->hooknum];
+
+		chain->flags |= NFT_BASE_CHAIN;
+	} else {
+		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+		if (chain == NULL)
+			return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&chain->rules);
+	chain->handle = nf_tables_alloc_handle(table);
+	nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+
+	list_add_tail(&chain->list, &table->chains);
+	table->use++;
+notify:
+	nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_NEWCHAIN,
+			       family);
+	return 0;
+}
+
+static void nf_tables_rcu_chain_destroy(struct rcu_head *head)
+{
+	struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head);
+
+	BUG_ON(chain->use > 0);
+
+	if (chain->flags & NFT_BASE_CHAIN)
+		kfree(nft_base_chain(chain));
+	else
+		kfree(chain);
+}
+
+static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_chain *chain;
+	int family = nfmsg->nfgen_family;
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	if (chain->flags & NFT_CHAIN_BUILTIN)
+		return -EOPNOTSUPP;
+
+	if (!list_empty(&chain->rules))
+		return -EBUSY;
+
+	list_del(&chain->list);
+	table->use--;
+
+	if (chain->flags & NFT_BASE_CHAIN)
+		nf_unregister_hook(&nft_base_chain(chain)->ops);
+
+	nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_DELCHAIN,
+			       family);
+
+	/* Make sure all rule references are gone before this is released */
+	call_rcu(&chain->rcu_head, nf_tables_rcu_chain_destroy);
+	return 0;
+}
+
+static void nft_ctx_init(struct nft_ctx *ctx,
+			 const struct nft_af_info *afi,
+			 const struct nft_table *table,
+			 const struct nft_chain *chain)
+{
+	ctx->afi   = afi;
+	ctx->table = table;
+	ctx->chain = chain;
+}
+
+/*
+ * Expressions
+ */
+
+/**
+ *	nft_register_expr - register nf_tables expr operations
+ *	@ops: expr operations
+ *
+ *	Registers the expr operations for use with nf_tables. Returns zero on
+ *	success or a negative errno code otherwise.
+ */
+int nft_register_expr(struct nft_expr_ops *ops)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_tail(&ops->list, &nf_tables_expressions);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_expr);
+
+/**
+ *	nft_unregister_expr - unregister nf_tables expr operations
+ *	@ops: expr operations
+ *
+ * 	Unregisters the expr operations for use with nf_tables.
+ */
+void nft_unregister_expr(struct nft_expr_ops *ops)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del(&ops->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_expr);
+
+static const struct nft_expr_ops *__nft_expr_ops_get(struct nlattr *nla)
+{
+	const struct nft_expr_ops *ops;
+
+	list_for_each_entry(ops, &nf_tables_expressions, list) {
+		if (!nla_strcmp(nla, ops->name))
+			return ops;
+	}
+	return NULL;
+}
+
+static const struct nft_expr_ops *nft_expr_ops_get(struct nlattr *nla)
+{
+	const struct nft_expr_ops *ops;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	ops = __nft_expr_ops_get(nla);
+	if (ops != NULL && try_module_get(ops->owner))
+		return ops;
+
+#ifdef CONFIG_MODULES
+	if (ops == NULL) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-expr-%.*s",
+			       nla_len(nla), (char *)nla_data(nla));
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (__nft_expr_ops_get(nla))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
+	[NFTA_EXPR_NAME]	= { .type = NLA_STRING },
+	[NFTA_EXPR_DATA]	= { .type = NLA_NESTED },
+};
+
+static int nf_tables_fill_expr_info(struct sk_buff *skb,
+				    const struct nft_expr *expr)
+{
+	if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->name))
+		goto nla_put_failure;
+
+	if (expr->ops->dump) {
+		struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA);
+		if (data == NULL)
+			goto nla_put_failure;
+		if (expr->ops->dump(skb, expr) < 0)
+			goto nla_put_failure;
+		nla_nest_end(skb, data);
+	}
+
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+};
+
+struct nft_expr_info {
+	const struct nft_expr_ops	*ops;
+	struct nlattr			*tb[NFTA_EXPR_MAX + 1];
+};
+
+static int nf_tables_expr_parse(const struct nlattr *nla,
+				struct nft_expr_info *info)
+{
+	const struct nft_expr_ops *ops;
+	int err;
+
+	err = nla_parse_nested(info->tb, NFTA_EXPR_MAX, nla, nft_expr_policy);
+	if (err < 0)
+		return err;
+
+	ops = nft_expr_ops_get(info->tb[NFTA_EXPR_NAME]);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+	info->ops = ops;
+	return 0;
+}
+
+static int nf_tables_newexpr(const struct nft_ctx *ctx,
+			     struct nft_expr_info *info,
+			     struct nft_expr *expr)
+{
+	const struct nft_expr_ops *ops = info->ops;
+	int err;
+
+	expr->ops = ops;
+	if (ops->init) {
+		struct nlattr *ma[ops->maxattr + 1];
+
+		if (info->tb[NFTA_EXPR_DATA]) {
+			err = nla_parse_nested(ma, ops->maxattr,
+					       info->tb[NFTA_EXPR_DATA],
+					       ops->policy);
+			if (err < 0)
+				goto err1;
+		} else
+			memset(ma, 0, sizeof(ma[0]) * (ops->maxattr + 1));
+
+		err = ops->init(ctx, expr, (const struct nlattr **)ma);
+		if (err < 0)
+			goto err1;
+	}
+
+	info->ops = NULL;
+	return 0;
+
+err1:
+	expr->ops = NULL;
+	return err;
+}
+
+static void nf_tables_expr_destroy(struct nft_expr *expr)
+{
+	if (expr->ops->destroy)
+		expr->ops->destroy(expr);
+	module_put(expr->ops->owner);
+}
+
+/*
+ * Rules
+ */
+
+static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
+						u64 handle)
+{
+	struct nft_rule *rule;
+
+	// FIXME: this sucks
+	list_for_each_entry(rule, &chain->rules, list) {
+		if (handle == rule->handle)
+			return rule;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
+					      const struct nlattr *nla)
+{
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+}
+
+static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
+	[NFTA_RULE_TABLE]	= { .type = NLA_STRING },
+	[NFTA_RULE_CHAIN]	= { .type = NLA_STRING,
+				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
+	[NFTA_RULE_HANDLE]	= { .type = NLA_U64 },
+	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED },
+};
+
+static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
+				    int event, u32 flags, int family,
+				    const struct nft_table *table,
+				    const struct nft_chain *chain,
+				    const struct nft_rule *rule)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	const struct nft_expr *expr, *next;
+	struct nlattr *list;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+			flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= 0;
+
+	if (nla_put_string(skb, NFTA_RULE_TABLE, table->name))
+		goto nla_put_failure;
+	if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle)))
+		goto nla_put_failure;
+
+	list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS);
+	if (list == NULL)
+		goto nla_put_failure;
+	nft_rule_for_each_expr(expr, next, rule) {
+		struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM);
+		if (elem == NULL)
+			goto nla_put_failure;
+		if (nf_tables_fill_expr_info(skb, expr) < 0)
+			goto nla_put_failure;
+		nla_nest_end(skb, elem);
+	}
+	nla_nest_end(skb, list);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+static int nf_tables_rule_notify(const struct sk_buff *oskb,
+				 const struct nlmsghdr *nlh,
+				 const struct nft_table *table,
+				 const struct nft_chain *chain,
+				 const struct nft_rule *rule,
+				 int event, u32 flags, int family)
+{
+	struct sk_buff *skb;
+	u32 portid = NETLINK_CB(oskb).portid;
+	struct net *net = oskb ? sock_net(oskb->sk) : &init_net;
+	u32 seq = nlh->nlmsg_seq;
+	bool report;
+	int err;
+
+	report = nlmsg_report(nlh);
+	if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+		return 0;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_rule_info(skb, portid, seq, event, flags,
+				       family, table, chain, rule);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report,
+			     GFP_KERNEL);
+err:
+	if (err < 0)
+		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+	return err;
+}
+
+static int nf_tables_dump_rules(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	const struct nft_rule *rule;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	int family = nfmsg->nfgen_family;
+
+	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry(table, &afi->tables, list) {
+			list_for_each_entry(chain, &table->chains, list) {
+				list_for_each_entry(rule, &chain->rules, list) {
+					if (idx < s_idx)
+						goto cont;
+					if (idx > s_idx)
+						memset(&cb->args[1], 0,
+						       sizeof(cb->args) - sizeof(cb->args[0]));
+					if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid,
+								      cb->nlh->nlmsg_seq,
+								      NFT_MSG_NEWRULE,
+								      NLM_F_MULTI | NLM_F_APPEND,
+								      afi->family, table, chain, rule) < 0)
+						goto done;
+cont:
+					idx++;
+				}
+			}
+		}
+	}
+done:
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	const struct nft_rule *rule;
+	struct sk_buff *skb2;
+	int family = nfmsg->nfgen_family;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_rules,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+	if (IS_ERR(rule))
+		return PTR_ERR(rule);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid,
+				       nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+				       family, table, chain, rule);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+static void nf_tables_rcu_rule_destroy(struct rcu_head *head)
+{
+	struct nft_rule *rule = container_of(head, struct nft_rule, rcu_head);
+	struct nft_expr *expr;
+
+	/*
+	 * Careful: some expressions might not be initialized in case this
+	 * is called on error from nf_tables_newrule().
+	 */
+	expr = nft_expr_first(rule);
+	while (expr->ops && expr != nft_expr_last(rule)) {
+		nf_tables_expr_destroy(expr);
+		expr = nft_expr_next(expr);
+	}
+	kfree(rule);
+}
+
+static void nf_tables_rule_destroy(struct nft_rule *rule)
+{
+	call_rcu(&rule->rcu_head, nf_tables_rcu_rule_destroy);
+}
+
+#define NFT_RULE_MAXEXPRS	128
+
+static struct nft_expr_info *info;
+
+static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_chain *chain;
+	struct nft_rule *rule, *old_rule = NULL;
+	struct nft_expr *expr;
+	struct nft_ctx ctx;
+	struct nlattr *tmp;
+	unsigned int size, i, n;
+	int err, rem;
+	bool create;
+	u64 handle;
+
+	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, create);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], create);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	if (nla[NFTA_RULE_HANDLE]) {
+		handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
+		rule = __nf_tables_rule_lookup(chain, handle);
+		if (IS_ERR(rule))
+			return PTR_ERR(rule);
+
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			old_rule = rule;
+		else
+			return -EOPNOTSUPP;
+	} else {
+		if (!create || nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EINVAL;
+		handle = nf_tables_alloc_handle(table);
+	}
+
+	n = 0;
+	size = 0;
+	if (nla[NFTA_RULE_EXPRESSIONS]) {
+		nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
+			err = -EINVAL;
+			if (nla_type(tmp) != NFTA_LIST_ELEM)
+				goto err1;
+			if (n == NFT_RULE_MAXEXPRS)
+				goto err1;
+			err = nf_tables_expr_parse(tmp, &info[n]);
+			if (err < 0)
+				goto err1;
+			size += info[n].ops->size;
+			n++;
+		}
+	}
+
+	err = -ENOMEM;
+	rule = kzalloc(sizeof(*rule) + size, GFP_KERNEL);
+	if (rule == NULL)
+		goto err1;
+
+	rule->handle = handle;
+	rule->dlen   = size;
+
+	nft_ctx_init(&ctx, afi, table, chain);
+	expr = nft_expr_first(rule);
+	for (i = 0; i < n; i++) {
+		err = nf_tables_newexpr(&ctx, &info[i], expr);
+		if (err < 0)
+			goto err2;
+		expr = nft_expr_next(expr);
+	}
+
+	/* Register hook when first rule is inserted into a base chain */
+	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN) {
+		err = nf_register_hook(&nft_base_chain(chain)->ops);
+		if (err < 0)
+			goto err2;
+	}
+
+	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+		list_replace_rcu(&old_rule->list, &rule->list);
+		nf_tables_rule_destroy(old_rule);
+	} else if (nlh->nlmsg_flags & NLM_F_APPEND)
+		list_add_tail_rcu(&rule->list, &chain->rules);
+	else
+		list_add_rcu(&rule->list, &chain->rules);
+
+	nf_tables_rule_notify(skb, nlh, table, chain, rule, NFT_MSG_NEWRULE,
+			      nlh->nlmsg_flags & (NLM_F_APPEND | NLM_F_REPLACE),
+			      nfmsg->nfgen_family);
+	return 0;
+
+err2:
+	nf_tables_rule_destroy(rule);
+err1:
+	for (i = 0; i < n; i++) {
+		if (info[i].ops != NULL)
+			module_put(info[i].ops->owner);
+	}
+	return err;
+}
+
+static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	struct nft_chain *chain;
+	struct nft_rule *rule, *tmp;
+	int family = nfmsg->nfgen_family;
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	if (nla[NFTA_RULE_HANDLE]) {
+		rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+		if (IS_ERR(rule))
+			return PTR_ERR(rule);
+
+		/* List removal must be visible before destroying expressions */
+		list_del_rcu(&rule->list);
+
+		nf_tables_rule_notify(skb, nlh, table, chain, rule,
+				      NFT_MSG_DELRULE, 0, family);
+		nf_tables_rule_destroy(rule);
+	} else {
+		/* Remove all rules in this chain */
+		list_for_each_entry_safe(rule, tmp, &chain->rules, list) {
+			list_del_rcu(&rule->list);
+
+			nf_tables_rule_notify(skb, nlh, table, chain, rule,
+					      NFT_MSG_DELRULE, 0, family);
+			nf_tables_rule_destroy(rule);
+		}
+	}
+
+	/* Unregister hook when last rule from base chain is deleted */
+	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN)
+		nf_unregister_hook(&nft_base_chain(chain)->ops);
+
+	return 0;
+}
+
+static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
+	[NFT_MSG_NEWTABLE] = {
+		.call		= nf_tables_newtable,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
+	[NFT_MSG_GETTABLE] = {
+		.call		= nf_tables_gettable,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
+	[NFT_MSG_DELTABLE] = {
+		.call		= nf_tables_deltable,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
+	[NFT_MSG_NEWCHAIN] = {
+		.call		= nf_tables_newchain,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
+	[NFT_MSG_GETCHAIN] = {
+		.call		= nf_tables_getchain,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
+	[NFT_MSG_DELCHAIN] = {
+		.call		= nf_tables_delchain,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
+	[NFT_MSG_NEWRULE] = {
+		.call		= nf_tables_newrule,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
+	[NFT_MSG_GETRULE] = {
+		.call		= nf_tables_getrule,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
+	[NFT_MSG_DELRULE] = {
+		.call		= nf_tables_delrule,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
+};
+
+static const struct nfnetlink_subsystem nf_tables_subsys = {
+	.name		= "nf_tables",
+	.subsys_id	= NFNL_SUBSYS_NFTABLES,
+	.cb_count	= NFT_MSG_MAX,
+	.cb		= nf_tables_cb,
+};
+
+/**
+ *	nft_validate_input_register - validate an expressions' input register
+ *
+ *	@reg: the register number
+ *
+ * 	Validate that the input register is one of the general purpose
+ * 	registers.
+ */
+int nft_validate_input_register(enum nft_registers reg)
+{
+	if (reg <= NFT_REG_VERDICT)
+		return -EINVAL;
+	if (reg > NFT_REG_MAX)
+		return -ERANGE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_input_register);
+
+/**
+ *	nft_validate_output_register - validate an expressions' output register
+ *
+ *	@reg: the register number
+ *
+ * 	Validate that the output register is one of the general purpose
+ * 	registers or the verdict register.
+ */
+int nft_validate_output_register(enum nft_registers reg)
+{
+	if (reg < NFT_REG_VERDICT)
+		return -EINVAL;
+	if (reg > NFT_REG_MAX)
+		return -ERANGE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_output_register);
+
+/**
+ *	nft_validate_data_load - validate an expressions' data load
+ *
+ *	@ctx: context of the expression performing the load
+ * 	@reg: the destination register number
+ * 	@data: the data to load
+ * 	@type: the data type
+ *
+ * 	Validate that a data load uses the appropriate data type for
+ * 	the destination register. A value of NULL for the data means
+ * 	that its runtime gathered data, which is always of type
+ * 	NFT_DATA_VALUE.
+ */
+int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg,
+			   const struct nft_data *data,
+			   enum nft_data_types type)
+{
+	switch (reg) {
+	case NFT_REG_VERDICT:
+		if (data == NULL || type != NFT_DATA_VERDICT)
+			return -EINVAL;
+		// FIXME: do loop detection
+		return 0;
+	default:
+		if (data != NULL && type != NFT_DATA_VALUE)
+			return -EINVAL;
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(nft_validate_data_load);
+
+static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
+	[NFTA_VERDICT_CODE]	= { .type = NLA_U32 },
+	[NFTA_VERDICT_CHAIN]	= { .type = NLA_STRING,
+				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
+};
+
+static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
+			    struct nft_data_desc *desc, const struct nlattr *nla)
+{
+	struct nlattr *tb[NFTA_VERDICT_MAX + 1];
+	struct nft_chain *chain;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_VERDICT_CODE])
+		return -EINVAL;
+	data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
+
+	switch (data->verdict) {
+	case NF_ACCEPT:
+	case NF_DROP:
+	case NF_QUEUE:
+	case NFT_CONTINUE:
+	case NFT_BREAK:
+	case NFT_RETURN:
+		desc->len = sizeof(data->verdict);
+		break;
+	case NFT_JUMP:
+	case NFT_GOTO:
+		if (!tb[NFTA_VERDICT_CHAIN])
+			return -EINVAL;
+		chain = nf_tables_chain_lookup(ctx->table,
+					       tb[NFTA_VERDICT_CHAIN]);
+		if (IS_ERR(chain))
+			return PTR_ERR(chain);
+		if (chain->flags & NFT_BASE_CHAIN)
+			return -EOPNOTSUPP;
+
+		if (ctx->chain->level + 1 > chain->level) {
+			if (ctx->chain->level + 1 == 16)
+				return -EMLINK;
+			chain->level = ctx->chain->level + 1;
+		}
+		chain->use++;
+		data->chain = chain;
+		desc->len = sizeof(data);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	desc->type = NFT_DATA_VERDICT;
+	return 0;
+}
+
+static void nft_verdict_uninit(const struct nft_data *data)
+{
+	switch (data->verdict) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		data->chain->use--;
+		break;
+	}
+}
+
+static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NFTA_DATA_VERDICT);
+	if (!nest)
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict)))
+		goto nla_put_failure;
+
+	switch (data->verdict) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		if (nla_put_string(skb, NFTA_VERDICT_CHAIN, data->chain->name))
+			goto nla_put_failure;
+	}
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
+			  struct nft_data_desc *desc, const struct nlattr *nla)
+{
+	unsigned int len;
+
+	len = nla_len(nla);
+	if (len == 0)
+		return -EINVAL;
+	if (len > sizeof(data->data))
+		return -EOVERFLOW;
+
+	nla_memcpy(data->data, nla, sizeof(data->data));
+	desc->type = NFT_DATA_VALUE;
+	desc->len  = len;
+	return 0;
+}
+
+static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data,
+			  unsigned int len)
+{
+	return nla_put(skb, NFTA_DATA_VALUE, len, data->data);
+}
+
+static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
+	[NFTA_DATA_VALUE]	= { .type = NLA_BINARY,
+				    .len  = FIELD_SIZEOF(struct nft_data, data) },
+	[NFTA_DATA_VERDICT]	= { .type = NLA_NESTED },
+};
+
+/**
+ *	nft_data_init - parse nf_tables data netlink attributes
+ *
+ *	@ctx: context of the expression using the data
+ *	@data: destination struct nft_data
+ *	@desc: data description
+ *	@nla: netlink attribute containing data
+ *
+ *	Parse the netlink data attributes and initialize a struct nft_data.
+ *	The type and length of data are returned in the data description.
+ *
+ *	The caller can indicate that it only wants to accept data of type
+ *	NFT_DATA_VALUE by passing NULL for the ctx argument.
+ */
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
+		  struct nft_data_desc *desc, const struct nlattr *nla)
+{
+	struct nlattr *tb[NFTA_DATA_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_DATA_VALUE])
+		return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+	if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
+		return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(nft_data_init);
+
+/**
+ *	nft_data_uninit - release a nft_data item
+ *
+ *	@data: struct nft_data to release
+ *	@type: type of data
+ *
+ *	Release a nft_data item. NFT_DATA_VALUE types can be silently discarded,
+ *	all others need to be released by calling this function.
+ */
+void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
+{
+	switch (type) {
+	case NFT_DATA_VALUE:
+		return;
+	case NFT_DATA_VERDICT:
+		return nft_verdict_uninit(data);
+	default:
+		WARN_ON(1);
+	}
+}
+EXPORT_SYMBOL_GPL(nft_data_uninit);
+
+int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
+		  enum nft_data_types type, unsigned int len)
+{
+	struct nlattr *nest;
+	int err;
+
+	nest = nla_nest_start(skb, attr);
+	if (nest == NULL)
+		return -1;
+
+	switch (type) {
+	case NFT_DATA_VALUE:
+		err = nft_value_dump(skb, data, len);
+		break;
+	case NFT_DATA_VERDICT:
+		err = nft_verdict_dump(skb, data);
+		break;
+	default:
+		err = -EINVAL;
+		WARN_ON(1);
+	}
+
+	nla_nest_end(skb, nest);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nft_data_dump);
+
+static int __init nf_tables_module_init(void)
+{
+	int err;
+
+	info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS,
+		       GFP_KERNEL);
+	if (info == NULL) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	err = nf_tables_core_module_init();
+	if (err < 0)
+		goto err2;
+
+	err = nfnetlink_subsys_register(&nf_tables_subsys);
+	if (err < 0)
+		goto err3;
+
+	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
+	return 0;
+err3:
+	nf_tables_core_module_exit();
+err2:
+	kfree(info);
+err1:
+	return err;
+}
+
+static void __exit nf_tables_module_exit(void)
+{
+	nfnetlink_subsys_unregister(&nf_tables_subsys);
+	nf_tables_core_module_exit();
+	kfree(info);
+}
+
+module_init(nf_tables_module_init);
+module_exit(nf_tables_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
new file mode 100644
index 000000000000..bc7fb85d4002
--- /dev/null
+++ b/net/netfilter/nf_tables_core.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+#define NFT_JUMP_STACK_SIZE	16
+
+unsigned int nft_do_chain(const struct nf_hook_ops *ops,
+			  struct sk_buff *skb,
+			  const struct net_device *in,
+			  const struct net_device *out,
+			  int (*okfn)(struct sk_buff *))
+{
+	const struct nft_chain *chain = ops->priv;
+	const struct nft_rule *rule;
+	const struct nft_expr *expr, *last;
+	struct nft_data data[NFT_REG_MAX + 1];
+	const struct nft_pktinfo pkt = {
+		.skb		= skb,
+		.in		= in,
+		.out		= out,
+		.hooknum	= ops->hooknum,
+	};
+	unsigned int stackptr = 0;
+	struct {
+		const struct nft_chain	*chain;
+		const struct nft_rule	*rule;
+	} jumpstack[NFT_JUMP_STACK_SIZE];
+
+do_chain:
+	rule = list_entry(&chain->rules, struct nft_rule, list);
+next_rule:
+	data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+	list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
+		nft_rule_for_each_expr(expr, last, rule) {
+			expr->ops->eval(expr, data, &pkt);
+			if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
+				break;
+		}
+
+		switch (data[NFT_REG_VERDICT].verdict) {
+		case NFT_BREAK:
+			data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+			/* fall through */
+		case NFT_CONTINUE:
+			continue;
+		}
+		break;
+	}
+
+	switch (data[NFT_REG_VERDICT].verdict) {
+	case NF_ACCEPT:
+	case NF_DROP:
+	case NF_QUEUE:
+		return data[NFT_REG_VERDICT].verdict;
+	case NFT_JUMP:
+		BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
+		jumpstack[stackptr].chain = chain;
+		jumpstack[stackptr].rule  = rule;
+		stackptr++;
+		/* fall through */
+	case NFT_GOTO:
+		chain = data[NFT_REG_VERDICT].chain;
+		goto do_chain;
+	case NFT_RETURN:
+	case NFT_CONTINUE:
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	if (stackptr > 0) {
+		stackptr--;
+		chain = jumpstack[stackptr].chain;
+		rule  = jumpstack[stackptr].rule;
+		goto next_rule;
+	}
+
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nft_do_chain);
+
+int __init nf_tables_core_module_init(void)
+{
+	int err;
+
+	err = nft_immediate_module_init();
+	if (err < 0)
+		goto err1;
+
+	err = nft_cmp_module_init();
+	if (err < 0)
+		goto err2;
+
+	err = nft_lookup_module_init();
+	if (err < 0)
+		goto err3;
+
+	err = nft_bitwise_module_init();
+	if (err < 0)
+		goto err4;
+
+	err = nft_byteorder_module_init();
+	if (err < 0)
+		goto err5;
+
+	err = nft_payload_module_init();
+	if (err < 0)
+		goto err6;
+
+	return 0;
+
+err6:
+	nft_byteorder_module_exit();
+err5:
+	nft_bitwise_module_exit();
+err4:
+	nft_lookup_module_exit();
+err3:
+	nft_cmp_module_exit();
+err2:
+	nft_immediate_module_exit();
+err1:
+	return err;
+}
+
+void nf_tables_core_module_exit(void)
+{
+	nft_payload_module_exit();
+	nft_byteorder_module_exit();
+	nft_bitwise_module_exit();
+	nft_lookup_module_exit();
+	nft_cmp_module_exit();
+	nft_immediate_module_exit();
+}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
new file mode 100644
index 000000000000..0f7501506367
--- /dev/null
+++ b/net/netfilter/nft_bitwise.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_bitwise {
+	enum nft_registers	sreg:8;
+	enum nft_registers	dreg:8;
+	u8			len;
+	struct nft_data		mask;
+	struct nft_data		xor;
+};
+
+static void nft_bitwise_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+	const struct nft_data *src = &data[priv->sreg];
+	struct nft_data *dst = &data[priv->dreg];
+	unsigned int i;
+
+	for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) {
+		dst->data[i] = (src->data[i] & priv->mask.data[i]) ^
+			       priv->xor.data[i];
+	}
+}
+
+static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
+	[NFTA_BITWISE_SREG]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_DREG]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_LEN]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_MASK]	= { .type = NLA_NESTED },
+	[NFTA_BITWISE_XOR]	= { .type = NLA_NESTED },
+};
+
+static int nft_bitwise_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_bitwise *priv = nft_expr_priv(expr);
+	struct nft_data_desc d1, d2;
+	int err;
+
+	if (tb[NFTA_BITWISE_SREG] == NULL ||
+	    tb[NFTA_BITWISE_DREG] == NULL ||
+	    tb[NFTA_BITWISE_LEN] == NULL ||
+	    tb[NFTA_BITWISE_MASK] == NULL ||
+	    tb[NFTA_BITWISE_XOR] == NULL)
+		return -EINVAL;
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		return err;
+
+	priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+
+	err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]);
+	if (err < 0)
+		return err;
+	if (d1.len != priv->len)
+		return -EINVAL;
+
+	err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]);
+	if (err < 0)
+		return err;
+	if (d2.len != priv->len)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_BITWISE_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BITWISE_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_bitwise_ops __read_mostly = {
+	.name		= "bitwise",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_bitwise_eval,
+	.init		= nft_bitwise_init,
+	.dump		= nft_bitwise_dump,
+	.policy		= nft_bitwise_policy,
+	.maxattr	= NFTA_BITWISE_MAX,
+};
+
+int __init nft_bitwise_module_init(void)
+{
+	return nft_register_expr(&nft_bitwise_ops);
+}
+
+void nft_bitwise_module_exit(void)
+{
+	nft_unregister_expr(&nft_bitwise_ops);
+}
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
new file mode 100644
index 000000000000..8b0657a4d17b
--- /dev/null
+++ b/net/netfilter/nft_byteorder.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_byteorder {
+	enum nft_registers	sreg:8;
+	enum nft_registers	dreg:8;
+	enum nft_byteorder_ops	op:8;
+	u8			len;
+	u8			size;
+};
+
+static void nft_byteorder_eval(const struct nft_expr *expr,
+			       struct nft_data data[NFT_REG_MAX + 1],
+			       const struct nft_pktinfo *pkt)
+{
+	const struct nft_byteorder *priv = nft_expr_priv(expr);
+	struct nft_data *src = &data[priv->sreg], *dst = &data[priv->dreg];
+	union { u32 u32; u16 u16; } *s, *d;
+	unsigned int i;
+
+	s = (void *)src->data;
+	d = (void *)dst->data;
+
+	switch (priv->size) {
+	case 4:
+		switch (priv->op) {
+		case NFT_BYTEORDER_NTOH:
+			for (i = 0; i < priv->len / 4; i++)
+				d[i].u32 = ntohl((__force __be32)s[i].u32);
+			break;
+		case NFT_BYTEORDER_HTON:
+			for (i = 0; i < priv->len / 4; i++)
+				d[i].u32 = (__force __u32)htonl(s[i].u32);
+			break;
+		}
+		break;
+	case 2:
+		switch (priv->op) {
+		case NFT_BYTEORDER_NTOH:
+			for (i = 0; i < priv->len / 2; i++)
+				d[i].u16 = ntohs((__force __be16)s[i].u16);
+			break;
+		case NFT_BYTEORDER_HTON:
+			for (i = 0; i < priv->len / 2; i++)
+				d[i].u16 = (__force __u16)htons(s[i].u16);
+			break;
+		}
+		break;
+	}
+}
+
+static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = {
+	[NFTA_BYTEORDER_SREG]	= { .type = NLA_U32 },
+	[NFTA_BYTEORDER_DREG]	= { .type = NLA_U32 },
+	[NFTA_BYTEORDER_OP]	= { .type = NLA_U32 },
+	[NFTA_BYTEORDER_LEN]	= { .type = NLA_U32 },
+	[NFTA_BYTEORDER_SIZE]	= { .type = NLA_U32 },
+};
+
+static int nft_byteorder_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_byteorder *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_BYTEORDER_SREG] == NULL ||
+	    tb[NFTA_BYTEORDER_DREG] == NULL ||
+	    tb[NFTA_BYTEORDER_LEN] == NULL ||
+	    tb[NFTA_BYTEORDER_SIZE] == NULL ||
+	    tb[NFTA_BYTEORDER_OP] == NULL)
+		return -EINVAL;
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		return err;
+
+	priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP]));
+	switch (priv->op) {
+	case NFT_BYTEORDER_NTOH:
+	case NFT_BYTEORDER_HTON:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN]));
+	if (priv->len == 0 || priv->len > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE]));
+	switch (priv->size) {
+	case 2:
+	case 4:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_byteorder *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_BYTEORDER_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BYTEORDER_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_byteorder_ops __read_mostly = {
+	.name		= "byteorder",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_byteorder_eval,
+	.init		= nft_byteorder_init,
+	.dump		= nft_byteorder_dump,
+	.policy		= nft_byteorder_policy,
+	.maxattr	= NFTA_BYTEORDER_MAX,
+};
+
+int __init nft_byteorder_module_init(void)
+{
+	return nft_register_expr(&nft_byteorder_ops);
+}
+
+void nft_byteorder_module_exit(void)
+{
+	nft_unregister_expr(&nft_byteorder_ops);
+}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
new file mode 100644
index 000000000000..e734d670120a
--- /dev/null
+++ b/net/netfilter/nft_cmp.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_cmp_expr {
+	struct nft_data		data;
+	enum nft_registers	sreg:8;
+	u8			len;
+	enum nft_cmp_ops	op:8;
+};
+
+static void nft_cmp_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+	int d;
+
+	d = nft_data_cmp(&data[priv->sreg], &priv->data, priv->len);
+	switch (priv->op) {
+	case NFT_CMP_EQ:
+		if (d != 0)
+			goto mismatch;
+		break;
+	case NFT_CMP_NEQ:
+		if (d == 0)
+			goto mismatch;
+		break;
+	case NFT_CMP_LT:
+		if (d == 0)
+			goto mismatch;
+	case NFT_CMP_LTE:
+		if (d > 0)
+			goto mismatch;
+		break;
+	case NFT_CMP_GT:
+		if (d == 0)
+			goto mismatch;
+	case NFT_CMP_GTE:
+		if (d < 0)
+			goto mismatch;
+		break;
+	}
+	return;
+
+mismatch:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = {
+	[NFTA_CMP_SREG]		= { .type = NLA_U32 },
+	[NFTA_CMP_OP]		= { .type = NLA_U32 },
+	[NFTA_CMP_DATA]		= { .type = NLA_NESTED },
+};
+
+static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_cmp_expr *priv = nft_expr_priv(expr);
+	struct nft_data_desc desc;
+	int err;
+
+	if (tb[NFTA_CMP_SREG] == NULL ||
+	    tb[NFTA_CMP_OP] == NULL ||
+	    tb[NFTA_CMP_DATA] == NULL)
+		return -EINVAL;
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
+	switch (priv->op) {
+	case NFT_CMP_EQ:
+	case NFT_CMP_NEQ:
+	case NFT_CMP_LT:
+	case NFT_CMP_LTE:
+	case NFT_CMP_GT:
+	case NFT_CMP_GTE:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
+	if (err < 0)
+		return err;
+
+	priv->len = desc.len;
+	return 0;
+}
+
+static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op)))
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_cmp_ops __read_mostly = {
+	.name		= "cmp",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_cmp_eval,
+	.init		= nft_cmp_init,
+	.dump		= nft_cmp_dump,
+	.policy		= nft_cmp_policy,
+	.maxattr	= NFTA_CMP_MAX,
+};
+
+int __init nft_cmp_module_init(void)
+{
+	return nft_register_expr(&nft_cmp_ops);
+}
+
+void nft_cmp_module_exit(void)
+{
+	nft_unregister_expr(&nft_cmp_ops);
+}
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
new file mode 100644
index 000000000000..33c5d36819bb
--- /dev/null
+++ b/net/netfilter/nft_counter.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_counter {
+	seqlock_t	lock;
+	u64		bytes;
+	u64		packets;
+};
+
+static void nft_counter_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+
+	write_seqlock_bh(&priv->lock);
+	priv->bytes += pkt->skb->len;
+	priv->packets++;
+	write_sequnlock_bh(&priv->lock);
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+	unsigned int seq;
+	u64 bytes;
+	u64 packets;
+
+	do {
+		seq = read_seqbegin(&priv->lock);
+		bytes	= priv->bytes;
+		packets	= priv->packets;
+	} while (read_seqretry(&priv->lock, seq));
+
+	if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes)))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 },
+	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 },
+};
+
+static int nft_counter_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+
+	if (tb[NFTA_COUNTER_PACKETS])
+	        priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+	if (tb[NFTA_COUNTER_BYTES])
+		priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+
+	seqlock_init(&priv->lock);
+	return 0;
+}
+
+static struct nft_expr_ops nft_counter_ops __read_mostly = {
+	.name		= "counter",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_counter)),
+	.policy		= nft_counter_policy,
+	.maxattr	= NFTA_COUNTER_MAX,
+	.owner		= THIS_MODULE,
+	.eval		= nft_counter_eval,
+	.init		= nft_counter_init,
+	.dump		= nft_counter_dump,
+};
+
+static int __init nft_counter_module_init(void)
+{
+	return nft_register_expr(&nft_counter_ops);
+}
+
+static void __exit nft_counter_module_exit(void)
+{
+	nft_unregister_expr(&nft_counter_ops);
+}
+
+module_init(nft_counter_module_init);
+module_exit(nft_counter_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("counter");
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
new file mode 100644
index 000000000000..a1756d678226
--- /dev/null
+++ b/net/netfilter/nft_ct.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+
+struct nft_ct {
+	enum nft_ct_keys	key:8;
+	enum ip_conntrack_dir	dir:8;
+	enum nft_registers	dreg:8;
+	uint8_t			family;
+};
+
+static void nft_ct_eval(const struct nft_expr *expr,
+			struct nft_data data[NFT_REG_MAX + 1],
+			const struct nft_pktinfo *pkt)
+{
+	const struct nft_ct *priv = nft_expr_priv(expr);
+	struct nft_data *dest = &data[priv->dreg];
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	const struct nf_conn_help *help;
+	const struct nf_conntrack_tuple *tuple;
+	const struct nf_conntrack_helper *helper;
+	long diff;
+	unsigned int state;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+
+	switch (priv->key) {
+	case NFT_CT_STATE:
+		if (ct == NULL)
+			state = NF_CT_STATE_INVALID_BIT;
+		else if (nf_ct_is_untracked(ct))
+			state = NF_CT_STATE_UNTRACKED_BIT;
+		else
+			state = NF_CT_STATE_BIT(ctinfo);
+		dest->data[0] = state;
+		return;
+	}
+
+	if (ct == NULL)
+		goto err;
+
+	switch (priv->key) {
+	case NFT_CT_DIRECTION:
+		dest->data[0] = CTINFO2DIR(ctinfo);
+		return;
+	case NFT_CT_STATUS:
+		dest->data[0] = ct->status;
+		return;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	case NFT_CT_MARK:
+		dest->data[0] = ct->mark;
+		return;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	case NFT_CT_SECMARK:
+		dest->data[0] = ct->secmark;
+		return;
+#endif
+	case NFT_CT_EXPIRATION:
+		diff = (long)jiffies - (long)ct->timeout.expires;
+		if (diff < 0)
+			diff = 0;
+		dest->data[0] = jiffies_to_msecs(diff);
+		return;
+	case NFT_CT_HELPER:
+		if (ct->master == NULL)
+			goto err;
+		help = nfct_help(ct->master);
+		if (help == NULL)
+			goto err;
+		helper = rcu_dereference(help->helper);
+		if (helper == NULL)
+			goto err;
+		if (strlen(helper->name) >= sizeof(dest->data))
+			goto err;
+		strncpy((char *)dest->data, helper->name, sizeof(dest->data));
+		return;
+	}
+
+	tuple = &ct->tuplehash[priv->dir].tuple;
+	switch (priv->key) {
+	case NFT_CT_L3PROTOCOL:
+		dest->data[0] = nf_ct_l3num(ct);
+		return;
+	case NFT_CT_SRC:
+		memcpy(dest->data, tuple->src.u3.all,
+		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+		return;
+	case NFT_CT_DST:
+		memcpy(dest->data, tuple->dst.u3.all,
+		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+		return;
+	case NFT_CT_PROTOCOL:
+		dest->data[0] = nf_ct_protonum(ct);
+		return;
+	case NFT_CT_PROTO_SRC:
+		dest->data[0] = (__force __u16)tuple->src.u.all;
+		return;
+	case NFT_CT_PROTO_DST:
+		dest->data[0] = (__force __u16)tuple->dst.u.all;
+		return;
+	}
+	return;
+err:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
+	[NFTA_CT_DREG]		= { .type = NLA_U32 },
+	[NFTA_CT_KEY]		= { .type = NLA_U32 },
+	[NFTA_CT_DIRECTION]	= { .type = NLA_U8 },
+};
+
+static int nft_ct_init(const struct nft_ctx *ctx,
+		       const struct nft_expr *expr,
+		       const struct nlattr * const tb[])
+{
+	struct nft_ct *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_CT_DREG] == NULL ||
+	    tb[NFTA_CT_KEY] == NULL)
+		return -EINVAL;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+	if (tb[NFTA_CT_DIRECTION] != NULL) {
+		priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]);
+		switch (priv->dir) {
+		case IP_CT_DIR_ORIGINAL:
+		case IP_CT_DIR_REPLY:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	switch (priv->key) {
+	case NFT_CT_STATE:
+	case NFT_CT_DIRECTION:
+	case NFT_CT_STATUS:
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	case NFT_CT_MARK:
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	case NFT_CT_SECMARK:
+#endif
+	case NFT_CT_EXPIRATION:
+	case NFT_CT_HELPER:
+		if (tb[NFTA_CT_DIRECTION] != NULL)
+			return -EINVAL;
+		break;
+	case NFT_CT_PROTOCOL:
+	case NFT_CT_SRC:
+	case NFT_CT_DST:
+	case NFT_CT_PROTO_SRC:
+	case NFT_CT_PROTO_DST:
+		if (tb[NFTA_CT_DIRECTION] == NULL)
+			return -EINVAL;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	err = nf_ct_l3proto_try_module_get(ctx->afi->family);
+	if (err < 0)
+		return err;
+	priv->family = ctx->afi->family;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		goto err1;
+
+	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		goto err1;
+	return 0;
+
+err1:
+	nf_ct_l3proto_module_put(ctx->afi->family);
+	return err;
+}
+
+static void nft_ct_destroy(const struct nft_expr *expr)
+{
+	struct nft_ct *priv = nft_expr_priv(expr);
+
+	nf_ct_l3proto_module_put(priv->family);
+}
+
+static int nft_ct_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_ct *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_CT_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
+		goto nla_put_failure;
+	if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_ct_ops __read_mostly = {
+	.name		= "ct",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_ct_eval,
+	.init		= nft_ct_init,
+	.destroy	= nft_ct_destroy,
+	.dump		= nft_ct_dump,
+	.policy		= nft_ct_policy,
+	.maxattr	= NFTA_CT_MAX,
+};
+
+static int __init nft_ct_module_init(void)
+{
+	return nft_register_expr(&nft_ct_ops);
+}
+
+static void __exit nft_ct_module_exit(void)
+{
+	nft_unregister_expr(&nft_ct_ops);
+}
+
+module_init(nft_ct_module_init);
+module_exit(nft_ct_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("ct");
diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c
new file mode 100644
index 000000000000..9fc8eb308193
--- /dev/null
+++ b/net/netfilter/nft_expr_template.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_template {
+
+};
+
+static void nft_template_eval(const struct nft_expr *expr,
+			      struct nft_data data[NFT_REG_MAX + 1],
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = {
+	[NFTA_TEMPLATE_ATTR]		= { .type = NLA_U32 },
+};
+
+static int nft_template_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr *tb[])
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+	return 0;
+}
+
+static void nft_template_destroy(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr)
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static int nft_template_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_template *priv = nft_expr_priv(expr);
+
+	NLA_PUT_BE32(skb, NFTA_TEMPLATE_ATTR, priv->field);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops template_ops __read_mostly = {
+	.name		= "template",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_template)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_template_eval,
+	.init		= nft_template_init,
+	.destroy	= nft_template_destroy,
+	.dump		= nft_template_dump,
+	.policy		= nft_template_policy,
+	.maxattr	= NFTA_TEMPLATE_MAX,
+};
+
+static int __init nft_template_module_init(void)
+{
+	return nft_register_expr(&template_ops);
+}
+
+static void __exit nft_template_module_exit(void)
+{
+	nft_unregister_expr(&template_ops);
+}
+
+module_init(nft_template_module_init);
+module_exit(nft_template_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("template");
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
new file mode 100644
index 000000000000..21c6a6b7b662
--- /dev/null
+++ b/net/netfilter/nft_exthdr.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+// FIXME:
+#include <net/ipv6.h>
+
+struct nft_exthdr {
+	u8			type;
+	u8			offset;
+	u8			len;
+	enum nft_registers	dreg:8;
+};
+
+static void nft_exthdr_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	struct nft_data *dest = &data[priv->dreg];
+	unsigned int offset;
+	int err;
+
+	err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
+	if (err < 0)
+		goto err;
+	offset += priv->offset;
+
+	if (skb_copy_bits(pkt->skb, offset, dest->data, priv->len) < 0)
+		goto err;
+	return;
+err:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
+	[NFTA_EXTHDR_DREG]		= { .type = NLA_U32 },
+	[NFTA_EXTHDR_TYPE]		= { .type = NLA_U8 },
+	[NFTA_EXTHDR_OFFSET]		= { .type = NLA_U32 },
+	[NFTA_EXTHDR_LEN]		= { .type = NLA_U32 },
+};
+
+static int nft_exthdr_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_EXTHDR_DREG] == NULL ||
+	    tb[NFTA_EXTHDR_TYPE] == NULL ||
+	    tb[NFTA_EXTHDR_OFFSET] == NULL ||
+	    tb[NFTA_EXTHDR_LEN] == NULL)
+		return -EINVAL;
+
+	priv->type   = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+	priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
+	priv->len    = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+	if (priv->len == 0 ||
+	    priv->len > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_EXTHDR_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+}
+
+static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_EXTHDR_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops exthdr_ops __read_mostly = {
+	.name		= "exthdr",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_exthdr_eval,
+	.init		= nft_exthdr_init,
+	.dump		= nft_exthdr_dump,
+	.policy		= nft_exthdr_policy,
+	.maxattr	= NFTA_EXTHDR_MAX,
+};
+
+static int __init nft_exthdr_module_init(void)
+{
+	return nft_register_expr(&exthdr_ops);
+}
+
+static void __exit nft_exthdr_module_exit(void)
+{
+	nft_unregister_expr(&exthdr_ops);
+}
+
+module_init(nft_exthdr_module_init);
+module_exit(nft_exthdr_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("exthdr");
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
new file mode 100644
index 000000000000..67cc502881f1
--- /dev/null
+++ b/net/netfilter/nft_hash.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_hash {
+	struct hlist_head	*hash;
+	unsigned int		hsize;
+	enum nft_registers	sreg:8;
+	enum nft_registers	dreg:8;
+	u8			klen;
+	u8			dlen;
+	u16			flags;
+};
+
+struct nft_hash_elem {
+	struct hlist_node	hnode;
+	struct nft_data		key;
+	struct nft_data		data[];
+};
+
+static u32 nft_hash_rnd __read_mostly;
+static bool nft_hash_rnd_initted __read_mostly;
+
+static unsigned int nft_hash_data(const struct nft_data *data,
+				  unsigned int hsize, unsigned int len)
+{
+	unsigned int h;
+
+	// FIXME: can we reasonably guarantee the upper bits are fixed?
+	h = jhash2(data->data, len >> 2, nft_hash_rnd);
+	return ((u64)h * hsize) >> 32;
+}
+
+static void nft_hash_eval(const struct nft_expr *expr,
+			  struct nft_data data[NFT_REG_MAX + 1],
+			  const struct nft_pktinfo *pkt)
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+	const struct nft_hash_elem *elem;
+	const struct nft_data *key = &data[priv->sreg];
+	unsigned int h;
+
+	h = nft_hash_data(key, priv->hsize, priv->klen);
+	hlist_for_each_entry(elem, &priv->hash[h], hnode) {
+		if (nft_data_cmp(&elem->key, key, priv->klen))
+			continue;
+		if (priv->flags & NFT_HASH_MAP)
+			nft_data_copy(&data[priv->dreg], elem->data);
+		return;
+	}
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static void nft_hash_elem_destroy(const struct nft_expr *expr,
+				  struct nft_hash_elem *elem)
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+
+	nft_data_uninit(&elem->key, NFT_DATA_VALUE);
+	if (priv->flags & NFT_HASH_MAP)
+		nft_data_uninit(elem->data, nft_dreg_to_type(priv->dreg));
+	kfree(elem);
+}
+
+static const struct nla_policy nft_he_policy[NFTA_HE_MAX + 1] = {
+	[NFTA_HE_KEY]		= { .type = NLA_NESTED },
+	[NFTA_HE_DATA]		= { .type = NLA_NESTED },
+};
+
+static int nft_hash_elem_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr *nla,
+			      struct nft_hash_elem **new)
+{
+	struct nft_hash *priv = nft_expr_priv(expr);
+	struct nlattr *tb[NFTA_HE_MAX + 1];
+	struct nft_hash_elem *elem;
+	struct nft_data_desc d1, d2;
+	unsigned int size;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_HE_MAX, nla, nft_he_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_HE_KEY] == NULL)
+		return -EINVAL;
+	size = sizeof(*elem);
+
+	if (priv->flags & NFT_HASH_MAP) {
+		if (tb[NFTA_HE_DATA] == NULL)
+			return -EINVAL;
+		size += sizeof(elem->data[0]);
+	} else {
+		if (tb[NFTA_HE_DATA] != NULL)
+			return -EINVAL;
+	}
+
+	elem = kzalloc(size, GFP_KERNEL);
+	if (elem == NULL)
+		return -ENOMEM;
+
+	err = nft_data_init(ctx, &elem->key, &d1, tb[NFTA_HE_KEY]);
+	if (err < 0)
+		goto err1;
+	err = -EINVAL;
+	if (d1.type != NFT_DATA_VALUE || d1.len != priv->klen)
+		goto err2;
+
+	if (tb[NFTA_HE_DATA] != NULL) {
+		err = nft_data_init(ctx, elem->data, &d2, tb[NFTA_HE_DATA]);
+		if (err < 0)
+			goto err2;
+		err = nft_validate_data_load(ctx, priv->dreg, elem->data, d2.type);
+		if (err < 0)
+			goto err3;
+	}
+
+	*new = elem;
+	return 0;
+
+err3:
+	nft_data_uninit(elem->data, d2.type);
+err2:
+	nft_data_uninit(&elem->key, d1.type);
+err1:
+	kfree(elem);
+	return err;
+}
+
+static int nft_hash_elem_dump(struct sk_buff *skb, const struct nft_expr *expr,
+			      const struct nft_hash_elem *elem)
+
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_HE_KEY, &elem->key,
+			  NFT_DATA_VALUE, priv->klen) < 0)
+		goto nla_put_failure;
+
+	if (priv->flags & NFT_HASH_MAP) {
+		if (nft_data_dump(skb, NFTA_HE_DATA, elem->data,
+				  NFT_DATA_VALUE, priv->dlen) < 0)
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static void nft_hash_destroy(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr)
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+	const struct hlist_node *next;
+	struct nft_hash_elem *elem;
+	unsigned int i;
+
+	for (i = 0; i < priv->hsize; i++) {
+		hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) {
+			hlist_del(&elem->hnode);
+			nft_hash_elem_destroy(expr, elem);
+		}
+	}
+	kfree(priv->hash);
+}
+
+static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
+	[NFTA_HASH_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_HASH_SREG]	= { .type = NLA_U32 },
+	[NFTA_HASH_DREG]	= { .type = NLA_U32 },
+	[NFTA_HASH_KLEN]	= { .type = NLA_U32 },
+	[NFTA_HASH_ELEMENTS]	= { .type = NLA_NESTED },
+};
+
+static int nft_hash_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			 const struct nlattr * const tb[])
+{
+	struct nft_hash *priv = nft_expr_priv(expr);
+	struct nft_hash_elem *elem, *uninitialized_var(new);
+	const struct nlattr *nla;
+	unsigned int cnt, i;
+	unsigned int h;
+	int err, rem;
+
+	if (unlikely(!nft_hash_rnd_initted)) {
+		get_random_bytes(&nft_hash_rnd, 4);
+		nft_hash_rnd_initted = true;
+	}
+
+	if (tb[NFTA_HASH_SREG] == NULL ||
+	    tb[NFTA_HASH_KLEN] == NULL ||
+	    tb[NFTA_HASH_ELEMENTS] == NULL)
+		return -EINVAL;
+
+	if (tb[NFTA_HASH_FLAGS] != NULL) {
+		priv->flags = ntohl(nla_get_be32(tb[NFTA_HASH_FLAGS]));
+		if (priv->flags & ~NFT_HASH_MAP)
+			return -EINVAL;
+	}
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_HASH_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_HASH_DREG] != NULL) {
+		if (!(priv->flags & NFT_HASH_MAP))
+			return -EINVAL;
+		priv->dreg = ntohl(nla_get_be32(tb[NFTA_HASH_DREG]));
+		err = nft_validate_output_register(priv->dreg);
+		if (err < 0)
+			return err;
+	}
+
+	priv->klen = ntohl(nla_get_be32(tb[NFTA_HASH_KLEN]));
+	if (priv->klen == 0)
+		return -EINVAL;
+
+	cnt = 0;
+	nla_for_each_nested(nla, tb[NFTA_HASH_ELEMENTS], rem) {
+		if (nla_type(nla) != NFTA_LIST_ELEM)
+			return -EINVAL;
+		cnt++;
+	}
+
+	/* Aim for a load factor of 0.75 */
+	cnt = cnt * 4 / 3;
+
+	priv->hash = kcalloc(cnt, sizeof(struct hlist_head), GFP_KERNEL);
+	if (priv->hash == NULL)
+		return -ENOMEM;
+	priv->hsize = cnt;
+
+	for (i = 0; i < cnt; i++)
+		INIT_HLIST_HEAD(&priv->hash[i]);
+
+	err = -ENOMEM;
+	nla_for_each_nested(nla, tb[NFTA_HASH_ELEMENTS], rem) {
+		err = nft_hash_elem_init(ctx, expr, nla, &new);
+		if (err < 0)
+			goto err1;
+
+		h = nft_hash_data(&new->key, priv->hsize, priv->klen);
+		hlist_for_each_entry(elem, &priv->hash[h], hnode) {
+			if (nft_data_cmp(&elem->key, &new->key, priv->klen))
+				continue;
+			nft_hash_elem_destroy(expr, new);
+			err = -EEXIST;
+			goto err1;
+		}
+		hlist_add_head(&new->hnode, &priv->hash[h]);
+	}
+	return 0;
+
+err1:
+	nft_hash_destroy(ctx, expr);
+	return err;
+}
+
+static int nft_hash_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+	const struct nft_hash_elem *elem;
+	struct nlattr *list;
+	unsigned int i;
+
+	if (priv->flags)
+		if (nla_put_be32(skb, NFTA_HASH_FLAGS, htonl(priv->flags)))
+			goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_HASH_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (priv->flags & NFT_HASH_MAP)
+		if (nla_put_be32(skb, NFTA_HASH_DREG, htonl(priv->dreg)))
+			goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_HASH_KLEN, htonl(priv->klen)))
+		goto nla_put_failure;
+
+	list = nla_nest_start(skb, NFTA_HASH_ELEMENTS);
+	if (list == NULL)
+		goto nla_put_failure;
+
+	for (i = 0; i < priv->hsize; i++) {
+		hlist_for_each_entry(elem, &priv->hash[i], hnode) {
+			if (nft_hash_elem_dump(skb, expr, elem) < 0)
+				goto nla_put_failure;
+		}
+	}
+
+	nla_nest_end(skb, list);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_hash_ops __read_mostly = {
+	.name		= "hash",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_hash)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_hash_eval,
+	.init		= nft_hash_init,
+	.destroy	= nft_hash_destroy,
+	.dump		= nft_hash_dump,
+	.policy		= nft_hash_policy,
+	.maxattr	= NFTA_HASH_MAX,
+};
+
+static int __init nft_hash_module_init(void)
+{
+	return nft_register_expr(&nft_hash_ops);
+}
+
+static void __exit nft_hash_module_exit(void)
+{
+	nft_unregister_expr(&nft_hash_ops);
+}
+
+module_init(nft_hash_module_init);
+module_exit(nft_hash_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("hash");
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
new file mode 100644
index 000000000000..3bf42c3cc49a
--- /dev/null
+++ b/net/netfilter/nft_immediate.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_immediate_expr {
+	struct nft_data		data;
+	enum nft_registers	dreg:8;
+	u8			dlen;
+};
+
+static void nft_immediate_eval(const struct nft_expr *expr,
+			       struct nft_data data[NFT_REG_MAX + 1],
+			       const struct nft_pktinfo *pkt)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+	nft_data_copy(&data[priv->dreg], &priv->data);
+}
+
+static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = {
+	[NFTA_IMMEDIATE_DREG]	= { .type = NLA_U32 },
+	[NFTA_IMMEDIATE_DATA]	= { .type = NLA_NESTED },
+};
+
+static int nft_immediate_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_immediate_expr *priv = nft_expr_priv(expr);
+	struct nft_data_desc desc;
+	int err;
+
+	if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
+	    tb[NFTA_IMMEDIATE_DATA] == NULL)
+		return -EINVAL;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_IMMEDIATE_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+
+	err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]);
+	if (err < 0)
+		return err;
+	priv->dlen = desc.len;
+
+	err = nft_validate_data_load(ctx, priv->dreg, &priv->data, desc.type);
+	if (err < 0)
+		goto err1;
+
+	return 0;
+
+err1:
+	nft_data_uninit(&priv->data, desc.type);
+	return err;
+}
+
+static void nft_immediate_destroy(const struct nft_expr *expr)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+	return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg));
+}
+
+static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_IMMEDIATE_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+
+	return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data,
+			     nft_dreg_to_type(priv->dreg), priv->dlen);
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_imm_ops __read_mostly = {
+	.name		= "immediate",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_immediate_eval,
+	.init		= nft_immediate_init,
+	.destroy	= nft_immediate_destroy,
+	.dump		= nft_immediate_dump,
+	.policy		= nft_immediate_policy,
+	.maxattr	= NFTA_IMMEDIATE_MAX,
+};
+
+int __init nft_immediate_module_init(void)
+{
+	return nft_register_expr(&nft_imm_ops);
+}
+
+void nft_immediate_module_exit(void)
+{
+	nft_unregister_expr(&nft_imm_ops);
+}
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
new file mode 100644
index 000000000000..e0e3fc8aebc3
--- /dev/null
+++ b/net/netfilter/nft_limit.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+static DEFINE_SPINLOCK(limit_lock);
+
+struct nft_limit {
+	u64		tokens;
+	u64		rate;
+	u64		unit;
+	unsigned long	stamp;
+};
+
+static void nft_limit_eval(const struct nft_expr *expr,
+			   struct nft_data data[NFT_REG_MAX + 1],
+			   const struct nft_pktinfo *pkt)
+{
+	struct nft_limit *priv = nft_expr_priv(expr);
+
+	spin_lock_bh(&limit_lock);
+	if (time_after_eq(jiffies, priv->stamp)) {
+		priv->tokens = priv->rate;
+		priv->stamp = jiffies + priv->unit * HZ;
+	}
+
+	if (priv->tokens >= 1) {
+		priv->tokens--;
+		spin_unlock_bh(&limit_lock);
+		return;
+	}
+	spin_unlock_bh(&limit_lock);
+
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = {
+	[NFTA_LIMIT_RATE]	= { .type = NLA_U64 },
+	[NFTA_LIMIT_UNIT]	= { .type = NLA_U64 },
+};
+
+static int nft_limit_init(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr,
+			  const struct nlattr * const tb[])
+{
+	struct nft_limit *priv = nft_expr_priv(expr);
+
+	if (tb[NFTA_LIMIT_RATE] == NULL ||
+	    tb[NFTA_LIMIT_UNIT] == NULL)
+		return -EINVAL;
+
+	priv->rate   = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+	priv->unit   = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
+	priv->stamp  = jiffies + priv->unit * HZ;
+	priv->tokens = priv->rate;
+	return 0;
+}
+
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_limit *priv = nft_expr_priv(expr);
+
+	if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate)))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_limit_ops __read_mostly = {
+	.name		= "limit",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_limit)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_limit_eval,
+	.init		= nft_limit_init,
+	.dump		= nft_limit_dump,
+	.policy		= nft_limit_policy,
+	.maxattr	= NFTA_LIMIT_MAX,
+};
+
+static int __init nft_limit_module_init(void)
+{
+	return nft_register_expr(&nft_limit_ops);
+}
+
+static void __exit nft_limit_module_exit(void)
+{
+	nft_unregister_expr(&nft_limit_ops);
+}
+
+module_init(nft_limit_module_init);
+module_exit(nft_limit_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("limit");
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
new file mode 100644
index 000000000000..da495c3b1e7e
--- /dev/null
+++ b/net/netfilter/nft_log.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netdevice.h>
+
+static const char *nft_log_null_prefix = "";
+
+struct nft_log {
+	struct nf_loginfo	loginfo;
+	char			*prefix;
+	int			family;
+};
+
+static void nft_log_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_log *priv = nft_expr_priv(expr);
+	struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
+
+	nf_log_packet(net, priv->family, pkt->hooknum, pkt->skb, pkt->in,
+		      pkt->out, &priv->loginfo, "%s", priv->prefix);
+}
+
+static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
+	[NFTA_LOG_GROUP]	= { .type = NLA_U16 },
+	[NFTA_LOG_PREFIX]	= { .type = NLA_STRING },
+	[NFTA_LOG_SNAPLEN]	= { .type = NLA_U32 },
+	[NFTA_LOG_QTHRESHOLD]	= { .type = NLA_U16 },
+};
+
+static int nft_log_init(const struct nft_ctx *ctx,
+			const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_log *priv = nft_expr_priv(expr);
+	struct nf_loginfo *li = &priv->loginfo;
+	const struct nlattr *nla;
+
+	priv->family = ctx->afi->family;
+
+	nla = tb[NFTA_LOG_PREFIX];
+	if (nla != NULL) {
+		priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
+		if (priv->prefix == NULL)
+			return -ENOMEM;
+		nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1);
+	} else
+		priv->prefix = (char *)nft_log_null_prefix;
+
+	li->type = NF_LOG_TYPE_ULOG;
+	if (tb[NFTA_LOG_GROUP] != NULL)
+		li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP]));
+
+	if (tb[NFTA_LOG_SNAPLEN] != NULL)
+		li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN]));
+	if (tb[NFTA_LOG_QTHRESHOLD] != NULL) {
+		li->u.ulog.qthreshold =
+			ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD]));
+	}
+
+	return 0;
+}
+
+static void nft_log_destroy(const struct nft_expr *expr)
+{
+	struct nft_log *priv = nft_expr_priv(expr);
+
+	if (priv->prefix != nft_log_null_prefix)
+		kfree(priv->prefix);
+}
+
+static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_log *priv = nft_expr_priv(expr);
+	const struct nf_loginfo *li = &priv->loginfo;
+
+	if (priv->prefix != nft_log_null_prefix)
+		if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix))
+			goto nla_put_failure;
+	if (li->u.ulog.group)
+		if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group)))
+			goto nla_put_failure;
+	if (li->u.ulog.copy_len)
+		if (nla_put_be32(skb, NFTA_LOG_SNAPLEN,
+				 htonl(li->u.ulog.copy_len)))
+			goto nla_put_failure;
+	if (li->u.ulog.qthreshold)
+		if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD,
+				 htons(li->u.ulog.qthreshold)))
+			goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_log_ops __read_mostly = {
+	.name		= "log",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_log)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_log_eval,
+	.init		= nft_log_init,
+	.destroy	= nft_log_destroy,
+	.dump		= nft_log_dump,
+	.policy		= nft_log_policy,
+	.maxattr	= NFTA_LOG_MAX,
+};
+
+static int __init nft_log_module_init(void)
+{
+	return nft_register_expr(&nft_log_ops);
+}
+
+static void __exit nft_log_module_exit(void)
+{
+	nft_unregister_expr(&nft_log_ops);
+}
+
+module_init(nft_log_module_init);
+module_exit(nft_log_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("log");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
new file mode 100644
index 000000000000..96735aa2f039
--- /dev/null
+++ b/net/netfilter/nft_meta.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
+#include <net/netfilter/nf_tables.h>
+
+struct nft_meta {
+	enum nft_meta_keys	key:8;
+	enum nft_registers	dreg:8;
+};
+
+static void nft_meta_eval(const struct nft_expr *expr,
+			  struct nft_data data[NFT_REG_MAX + 1],
+			  const struct nft_pktinfo *pkt)
+{
+	const struct nft_meta *priv = nft_expr_priv(expr);
+	const struct sk_buff *skb = pkt->skb;
+	const struct net_device *in = pkt->in, *out = pkt->out;
+	struct nft_data *dest = &data[priv->dreg];
+
+	switch (priv->key) {
+	case NFT_META_LEN:
+		dest->data[0] = skb->len;
+		break;
+	case NFT_META_PROTOCOL:
+		*(__be16 *)dest->data = skb->protocol;
+		break;
+	case NFT_META_PRIORITY:
+		dest->data[0] = skb->priority;
+		break;
+	case NFT_META_MARK:
+		dest->data[0] = skb->mark;
+		break;
+	case NFT_META_IIF:
+		if (in == NULL)
+			goto err;
+		dest->data[0] = in->ifindex;
+		break;
+	case NFT_META_OIF:
+		if (out == NULL)
+			goto err;
+		dest->data[0] = out->ifindex;
+		break;
+	case NFT_META_IIFNAME:
+		if (in == NULL)
+			goto err;
+		strncpy((char *)dest->data, in->name, sizeof(dest->data));
+		break;
+	case NFT_META_OIFNAME:
+		if (out == NULL)
+			goto err;
+		strncpy((char *)dest->data, out->name, sizeof(dest->data));
+		break;
+	case NFT_META_IIFTYPE:
+		if (in == NULL)
+			goto err;
+		*(u16 *)dest->data = in->type;
+		break;
+	case NFT_META_OIFTYPE:
+		if (out == NULL)
+			goto err;
+		*(u16 *)dest->data = out->type;
+		break;
+	case NFT_META_SKUID:
+		if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+			goto err;
+
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket == NULL ||
+		    skb->sk->sk_socket->file == NULL) {
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+			goto err;
+		}
+
+		dest->data[0] =
+			from_kuid_munged(&init_user_ns,
+				skb->sk->sk_socket->file->f_cred->fsuid);
+		read_unlock_bh(&skb->sk->sk_callback_lock);
+		break;
+	case NFT_META_SKGID:
+		if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+			goto err;
+
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket == NULL ||
+		    skb->sk->sk_socket->file == NULL) {
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+			goto err;
+		}
+		dest->data[0] =
+			from_kgid_munged(&init_user_ns,
+				 skb->sk->sk_socket->file->f_cred->fsgid);
+		read_unlock_bh(&skb->sk->sk_callback_lock);
+		break;
+#ifdef CONFIG_NET_CLS_ROUTE
+	case NFT_META_RTCLASSID: {
+		const struct dst_entry *dst = skb_dst(skb);
+
+		if (dst == NULL)
+			goto err;
+		dest->data[0] = dst->tclassid;
+		break;
+	}
+#endif
+#ifdef CONFIG_NETWORK_SECMARK
+	case NFT_META_SECMARK:
+		dest->data[0] = skb->secmark;
+		break;
+#endif
+	default:
+		WARN_ON(1);
+		goto err;
+	}
+	return;
+
+err:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+	[NFTA_META_DREG]	= { .type = NLA_U32 },
+	[NFTA_META_KEY]		= { .type = NLA_U32 },
+};
+
+static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			 const struct nlattr * const tb[])
+{
+	struct nft_meta *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_META_DREG] == NULL ||
+	    tb[NFTA_META_KEY] == NULL)
+		return -EINVAL;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+	switch (priv->key) {
+	case NFT_META_LEN:
+	case NFT_META_PROTOCOL:
+	case NFT_META_PRIORITY:
+	case NFT_META_MARK:
+	case NFT_META_IIF:
+	case NFT_META_OIF:
+	case NFT_META_IIFNAME:
+	case NFT_META_OIFNAME:
+	case NFT_META_IIFTYPE:
+	case NFT_META_OIFTYPE:
+	case NFT_META_SKUID:
+	case NFT_META_SKGID:
+#ifdef CONFIG_NET_CLS_ROUTE
+	case NFT_META_RTCLASSID:
+#endif
+#ifdef CONFIG_NETWORK_SECMARK
+	case NFT_META_SECMARK:
+#endif
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+}
+
+static int nft_meta_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_meta *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_META_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_meta_ops __read_mostly = {
+	.name		= "meta",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_meta_eval,
+	.init		= nft_meta_init,
+	.dump		= nft_meta_dump,
+	.policy		= nft_meta_policy,
+	.maxattr	= NFTA_META_MAX,
+};
+
+static int __init nft_meta_module_init(void)
+{
+	return nft_register_expr(&nft_meta_ops);
+}
+
+static void __exit nft_meta_module_exit(void)
+{
+	nft_unregister_expr(&nft_meta_ops);
+}
+
+module_init(nft_meta_module_init);
+module_exit(nft_meta_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("meta");
diff --git a/net/netfilter/nft_meta_target.c b/net/netfilter/nft_meta_target.c
new file mode 100644
index 000000000000..71177df75ffb
--- /dev/null
+++ b/net/netfilter/nft_meta_target.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_meta {
+	enum nft_meta_keys	key;
+};
+
+static void nft_meta_eval(const struct nft_expr *expr,
+			  struct nft_data *nfres,
+			  struct nft_data *data,
+			  const struct nft_pktinfo *pkt)
+{
+	const struct nft_meta *meta = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	u32 val = data->data[0];
+
+	switch (meta->key) {
+	case NFT_META_MARK:
+		skb->mark = val;
+		break;
+	case NFT_META_PRIORITY:
+		skb->priority = val;
+		break;
+	case NFT_META_NFTRACE:
+		skb->nf_trace = val;
+		break;
+#ifdef CONFIG_NETWORK_SECMARK
+	case NFT_META_SECMARK:
+		skb->secmark = val;
+		break;
+#endif
+	default:
+		WARN_ON(1);
+	}
+}
+
+static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+	[NFTA_META_KEY]		= { .type = NLA_U32 },
+};
+
+static int nft_meta_init(const struct nft_expr *expr, struct nlattr *tb[])
+{
+	struct nft_meta *meta = nft_expr_priv(expr);
+
+	if (tb[NFTA_META_KEY] == NULL)
+		return -EINVAL;
+
+	meta->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+	switch (meta->key) {
+	case NFT_META_MARK:
+	case NFT_META_PRIORITY:
+	case NFT_META_NFTRACE:
+#ifdef CONFIG_NETWORK_SECMARK
+	case NFT_META_SECMARK:
+#endif
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nft_meta_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_meta *meta = nft_expr_priv(expr);
+
+	NLA_PUT_BE32(skb, NFTA_META_KEY, htonl(meta->key));
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops meta_target __read_mostly = {
+	.name		= "meta",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_meta_eval,
+	.init		= nft_meta_init,
+	.dump		= nft_meta_dump,
+	.policy		= nft_meta_policy,
+	.maxattr	= NFTA_META_MAX,
+};
+
+static int __init nft_meta_target_init(void)
+{
+	return nft_register_expr(&meta_target);
+}
+
+static void __exit nft_meta_target_exit(void)
+{
+	nft_unregister_expr(&meta_target);
+}
+
+module_init(nft_meta_target_init);
+module_exit(nft_meta_target_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("meta");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
new file mode 100644
index 000000000000..329f134b3f89
--- /dev/null
+++ b/net/netfilter/nft_payload.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_payload {
+	enum nft_payload_bases	base:8;
+	u8			offset;
+	u8			len;
+	enum nft_registers	dreg:8;
+};
+
+static void nft_payload_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	const struct nft_payload *priv = nft_expr_priv(expr);
+	const struct sk_buff *skb = pkt->skb;
+	struct nft_data *dest = &data[priv->dreg];
+	int offset;
+
+	switch (priv->base) {
+	case NFT_PAYLOAD_LL_HEADER:
+		if (!skb_mac_header_was_set(skb))
+			goto err;
+		offset = skb_mac_header(skb) - skb->data;
+		break;
+	case NFT_PAYLOAD_NETWORK_HEADER:
+		offset = skb_network_offset(skb);
+		break;
+	case NFT_PAYLOAD_TRANSPORT_HEADER:
+		offset = skb_transport_offset(skb);
+		break;
+	default:
+		BUG();
+	}
+	offset += priv->offset;
+
+	if (skb_copy_bits(skb, offset, dest->data, priv->len) < 0)
+		goto err;
+	return;
+err:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
+	[NFTA_PAYLOAD_DREG]	= { .type = NLA_U32 },
+	[NFTA_PAYLOAD_BASE]	= { .type = NLA_U32 },
+	[NFTA_PAYLOAD_OFFSET]	= { .type = NLA_U32 },
+	[NFTA_PAYLOAD_LEN]	= { .type = NLA_U32 },
+};
+
+static int nft_payload_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_payload *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_PAYLOAD_DREG] == NULL ||
+	    tb[NFTA_PAYLOAD_BASE] == NULL ||
+	    tb[NFTA_PAYLOAD_OFFSET] == NULL ||
+	    tb[NFTA_PAYLOAD_LEN] == NULL)
+		return -EINVAL;
+
+	priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+	switch (priv->base) {
+	case NFT_PAYLOAD_LL_HEADER:
+	case NFT_PAYLOAD_NETWORK_HEADER:
+	case NFT_PAYLOAD_TRANSPORT_HEADER:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+	priv->len    = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+	if (priv->len == 0 ||
+	    priv->len > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+}
+
+static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_payload *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_PAYLOAD_DREG, htonl(priv->dreg)) ||
+	    nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) ||
+	    nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) ||
+	    nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_payload_ops __read_mostly = {
+	.name		= "payload",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_payload_eval,
+	.init		= nft_payload_init,
+	.dump		= nft_payload_dump,
+	.policy		= nft_payload_policy,
+	.maxattr	= NFTA_PAYLOAD_MAX,
+};
+
+int __init nft_payload_module_init(void)
+{
+	return nft_register_expr(&nft_payload_ops);
+}
+
+void nft_payload_module_exit(void)
+{
+	nft_unregister_expr(&nft_payload_ops);
+}
diff --git a/net/netfilter/nft_set.c b/net/netfilter/nft_set.c
new file mode 100644
index 000000000000..7b7c8354c327
--- /dev/null
+++ b/net/netfilter/nft_set.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_set {
+	struct rb_root		root;
+	enum nft_registers	sreg:8;
+	enum nft_registers	dreg:8;
+	u8			klen;
+	u8			dlen;
+	u16			flags;
+};
+
+struct nft_set_elem {
+	struct rb_node		node;
+	enum nft_set_elem_flags	flags;
+	struct nft_data		key;
+	struct nft_data		data[];
+};
+
+static void nft_set_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_set *priv = nft_expr_priv(expr);
+	const struct rb_node *parent = priv->root.rb_node;
+	const struct nft_set_elem *elem, *interval = NULL;
+	const struct nft_data *key = &data[priv->sreg];
+	int d;
+
+	while (parent != NULL) {
+		elem = rb_entry(parent, struct nft_set_elem, node);
+
+		d = nft_data_cmp(&elem->key, key, priv->klen);
+		if (d < 0) {
+			parent = parent->rb_left;
+			interval = elem;
+		} else if (d > 0)
+			parent = parent->rb_right;
+		else {
+found:
+			if (elem->flags & NFT_SE_INTERVAL_END)
+				goto out;
+			if (priv->flags & NFT_SET_MAP)
+				nft_data_copy(&data[priv->dreg], elem->data);
+			return;
+		}
+	}
+
+	if (priv->flags & NFT_SET_INTERVAL && interval != NULL) {
+		elem = interval;
+		goto found;
+	}
+out:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static void nft_set_elem_destroy(const struct nft_expr *expr,
+				 struct nft_set_elem *elem)
+{
+	const struct nft_set *priv = nft_expr_priv(expr);
+
+	nft_data_uninit(&elem->key, NFT_DATA_VALUE);
+	if (priv->flags & NFT_SET_MAP)
+		nft_data_uninit(elem->data, nft_dreg_to_type(priv->dreg));
+	kfree(elem);
+}
+
+static const struct nla_policy nft_se_policy[NFTA_SE_MAX + 1] = {
+	[NFTA_SE_KEY]		= { .type = NLA_NESTED },
+	[NFTA_SE_DATA]		= { .type = NLA_NESTED },
+	[NFTA_SE_FLAGS]		= { .type = NLA_U32 },
+};
+
+static int nft_set_elem_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr *nla,
+			     struct nft_set_elem **new)
+{
+	struct nft_set *priv = nft_expr_priv(expr);
+	struct nlattr *tb[NFTA_SE_MAX + 1];
+	struct nft_set_elem *elem;
+	struct nft_data_desc d1, d2;
+	enum nft_set_elem_flags flags = 0;
+	unsigned int size;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_SE_MAX, nla, nft_se_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_SE_KEY] == NULL)
+		return -EINVAL;
+
+	if (tb[NFTA_SE_FLAGS] != NULL) {
+		flags = ntohl(nla_get_be32(tb[NFTA_SE_FLAGS]));
+		if (flags & ~NFT_SE_INTERVAL_END)
+			return -EINVAL;
+	}
+
+	size = sizeof(*elem);
+	if (priv->flags & NFT_SET_MAP) {
+		if (tb[NFTA_SE_DATA] == NULL && !(flags & NFT_SE_INTERVAL_END))
+			return -EINVAL;
+		size += sizeof(elem->data[0]);
+	} else {
+		if (tb[NFTA_SE_DATA] != NULL)
+			return -EINVAL;
+	}
+
+	elem = kzalloc(size, GFP_KERNEL);
+	if (elem == NULL)
+		return -ENOMEM;
+	elem->flags = flags;
+
+	err = nft_data_init(ctx, &elem->key, &d1, tb[NFTA_SE_KEY]);
+	if (err < 0)
+		goto err1;
+	err = -EINVAL;
+	if (d1.type != NFT_DATA_VALUE || d1.len != priv->klen)
+		goto err2;
+
+	if (tb[NFTA_SE_DATA] != NULL) {
+		err = nft_data_init(ctx, elem->data, &d2, tb[NFTA_SE_DATA]);
+		if (err < 0)
+			goto err2;
+		err = -EINVAL;
+		if (priv->dreg != NFT_REG_VERDICT && d2.len != priv->dlen)
+			goto err2;
+		err = nft_validate_data_load(ctx, priv->dreg, elem->data, d2.type);
+		if (err < 0)
+			goto err3;
+	}
+
+	*new = elem;
+	return 0;
+
+err3:
+	nft_data_uninit(elem->data, d2.type);
+err2:
+	nft_data_uninit(&elem->key, d1.type);
+err1:
+	kfree(elem);
+	return err;
+}
+
+static int nft_set_elem_dump(struct sk_buff *skb, const struct nft_expr *expr,
+			     const struct nft_set_elem *elem)
+
+{
+	const struct nft_set *priv = nft_expr_priv(expr);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_SE_KEY, &elem->key,
+			  NFT_DATA_VALUE, priv->klen) < 0)
+		goto nla_put_failure;
+
+	if (priv->flags & NFT_SET_MAP && !(elem->flags & NFT_SE_INTERVAL_END)) {
+		if (nft_data_dump(skb, NFTA_SE_DATA, elem->data,
+				  nft_dreg_to_type(priv->dreg), priv->dlen) < 0)
+			goto nla_put_failure;
+	}
+
+	if (elem->flags){
+		if (nla_put_be32(skb, NFTA_SE_FLAGS, htonl(elem->flags)))
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static void nft_set_destroy(const struct nft_expr *expr)
+{
+	struct nft_set *priv = nft_expr_priv(expr);
+	struct nft_set_elem *elem;
+	struct rb_node *node;
+
+	while ((node = priv->root.rb_node) != NULL) {
+		rb_erase(node, &priv->root);
+		elem = rb_entry(node, struct nft_set_elem, node);
+		nft_set_elem_destroy(expr, elem);
+	}
+}
+
+static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
+	[NFTA_SET_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_SET_SREG]		= { .type = NLA_U32 },
+	[NFTA_SET_DREG]		= { .type = NLA_U32 },
+	[NFTA_SET_KLEN]		= { .type = NLA_U32 },
+	[NFTA_SET_DLEN]		= { .type = NLA_U32 },
+	[NFTA_SET_ELEMENTS]	= { .type = NLA_NESTED },
+};
+
+static int nft_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_set *priv = nft_expr_priv(expr);
+	struct nft_set_elem *elem, *uninitialized_var(new);
+	struct rb_node *parent, **p;
+	const struct nlattr *nla;
+	int err, rem, d;
+
+	if (tb[NFTA_SET_SREG] == NULL ||
+	    tb[NFTA_SET_KLEN] == NULL ||
+	    tb[NFTA_SET_ELEMENTS] == NULL)
+		return -EINVAL;
+
+	priv->root = RB_ROOT;
+
+	if (tb[NFTA_SET_FLAGS] != NULL) {
+		priv->flags = ntohl(nla_get_be32(tb[NFTA_SET_FLAGS]));
+		if (priv->flags & ~(NFT_SET_INTERVAL | NFT_SET_MAP))
+			return -EINVAL;
+	}
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_SET_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_SET_DREG] != NULL) {
+		if (!(priv->flags & NFT_SET_MAP))
+			return -EINVAL;
+		if (tb[NFTA_SET_DLEN] == NULL)
+			return -EINVAL;
+
+		priv->dreg = ntohl(nla_get_be32(tb[NFTA_SET_DREG]));
+		err = nft_validate_output_register(priv->dreg);
+		if (err < 0)
+			return err;
+
+		if (priv->dreg == NFT_REG_VERDICT)
+			priv->dlen = FIELD_SIZEOF(struct nft_data, data);
+		else {
+			priv->dlen = ntohl(nla_get_be32(tb[NFTA_SET_DLEN]));
+			if (priv->dlen == 0 ||
+			    priv->dlen > FIELD_SIZEOF(struct nft_data, data))
+				return -EINVAL;
+		}
+	} else {
+		if (priv->flags & NFT_SET_MAP)
+			return -EINVAL;
+		if (tb[NFTA_SET_DLEN] != NULL)
+			return -EINVAL;
+	}
+
+	priv->klen = ntohl(nla_get_be32(tb[NFTA_SET_KLEN]));
+	if (priv->klen == 0 ||
+	    priv->klen > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	nla_for_each_nested(nla, tb[NFTA_SET_ELEMENTS], rem) {
+		err = -EINVAL;
+		if (nla_type(nla) != NFTA_LIST_ELEM)
+			goto err1;
+
+		err = nft_set_elem_init(ctx, expr, nla, &new);
+		if (err < 0)
+			goto err1;
+
+		parent = NULL;
+		p = &priv->root.rb_node;
+		while (*p != NULL) {
+			parent = *p;
+			elem = rb_entry(parent, struct nft_set_elem, node);
+			d = nft_data_cmp(&elem->key, &new->key, priv->klen);
+			if (d < 0)
+				p = &parent->rb_left;
+			else if (d > 0)
+				p = &parent->rb_right;
+			else {
+				err = -EEXIST;
+				goto err2;
+			}
+		}
+		rb_link_node(&new->node, parent, p);
+		rb_insert_color(&new->node, &priv->root);
+	}
+
+	return 0;
+
+err2:
+	nft_set_elem_destroy(expr, new);
+err1:
+	nft_set_destroy(expr);
+	return err;
+}
+
+static int nft_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_set *priv = nft_expr_priv(expr);
+	const struct nft_set_elem *elem;
+	struct rb_node *node;
+	struct nlattr *list;
+
+	if (priv->flags) {
+		if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(priv->flags)))
+			goto nla_put_failure;
+	}
+
+	if (nla_put_be32(skb, NFTA_SET_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_SET_KLEN, htonl(priv->klen)))
+		goto nla_put_failure;
+
+	if (priv->flags & NFT_SET_MAP) {
+		if (nla_put_be32(skb, NFTA_SET_DREG, htonl(priv->dreg)))
+			goto nla_put_failure;
+		if (nla_put_be32(skb, NFTA_SET_DLEN, htonl(priv->dlen)))
+			goto nla_put_failure;
+	}
+
+	list = nla_nest_start(skb, NFTA_SET_ELEMENTS);
+	if (list == NULL)
+		goto nla_put_failure;
+
+	for (node = rb_first(&priv->root); node; node = rb_next(node)) {
+		elem = rb_entry(node, struct nft_set_elem, node);
+		if (nft_set_elem_dump(skb, expr, elem) < 0)
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, list);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_set_ops __read_mostly = {
+	.name		= "set",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_set)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_set_eval,
+	.init		= nft_set_init,
+	.destroy	= nft_set_destroy,
+	.dump		= nft_set_dump,
+	.policy		= nft_set_policy,
+	.maxattr	= NFTA_SET_MAX,
+};
+
+static int __init nft_set_module_init(void)
+{
+	return nft_register_expr(&nft_set_ops);
+}
+
+static void __exit nft_set_module_exit(void)
+{
+	nft_unregister_expr(&nft_set_ops);
+}
+
+module_init(nft_set_module_init);
+module_exit(nft_set_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("set");
-- 
cgit v1.2.3


From 20a69341f2d00cd042e81c82289fba8a13c05a25 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 11 Oct 2013 12:06:22 +0200
Subject: netfilter: nf_tables: add netlink set API

This patch adds the new netlink API for maintaining nf_tables sets
independently of the ruleset. The API supports the following operations:

- creation of sets
- deletion of sets
- querying of specific sets
- dumping of all sets

- addition of set elements
- removal of set elements
- dumping of all set elements

Sets are identified by name, each table defines an individual namespace.
The name of a set may be allocated automatically, this is mostly useful
in combination with the NFT_SET_ANONYMOUS flag, which destroys a set
automatically once the last reference has been released.

Sets can be marked constant, meaning they're not allowed to change while
linked to a rule. This allows to perform lockless operation for set
types that would otherwise require locking.

Additionally, if the implementation supports it, sets can (as before) be
used as maps, associating a data value with each key (or range), by
specifying the NFT_SET_MAP flag and can be used for interval queries by
specifying the NFT_SET_INTERVAL flag.

Set elements are added and removed incrementally. All element operations
support batching, reducing netlink message and set lookup overhead.

The old "set" and "hash" expressions are replaced by a generic "lookup"
expression, which binds to the specified set. Userspace is not aware
of the actual set implementation used by the kernel anymore, all
configuration options are generic.

Currently the implementation selection logic is largely missing and the
kernel will simply use the first registered implementation supporting the
requested operation. Eventually, the plan is to have userspace supply a
description of the data characteristics and select the implementation
based on expected performance and memory use.

This patch includes the new 'lookup' expression to look up for element
matching in the set.

This patch includes kernel-doc descriptions for this set API and it
also includes the following fixes.

From Patrick McHardy:
* netfilter: nf_tables: fix set element data type in dumps
* netfilter: nf_tables: fix indentation of struct nft_set_elem comments
* netfilter: nf_tables: fix oops in nft_validate_data_load()
* netfilter: nf_tables: fix oops while listing sets of built-in tables
* netfilter: nf_tables: destroy anonymous sets immediately if binding fails
* netfilter: nf_tables: propagate context to set iter callback
* netfilter: nf_tables: add loop detection

From Pablo Neira Ayuso:
* netfilter: nf_tables: allow to dump all existing sets
* netfilter: nf_tables: fix wrong type for flags variable in newelem

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  149 ++++-
 include/uapi/linux/netfilter/nf_tables.h |  191 ++++--
 net/netfilter/Kconfig                    |    6 +-
 net/netfilter/Makefile                   |    2 +-
 net/netfilter/nf_tables_api.c            | 1078 +++++++++++++++++++++++++++++-
 net/netfilter/nf_tables_core.c           |    2 -
 net/netfilter/nft_hash.c                 |  329 +++------
 net/netfilter/nft_immediate.c            |   11 +
 net/netfilter/nft_lookup.c               |  135 ++++
 net/netfilter/nft_rbtree.c               |  247 +++++++
 net/netfilter/nft_set.c                  |  381 -----------
 11 files changed, 1854 insertions(+), 677 deletions(-)
 create mode 100644 net/netfilter/nft_lookup.c
 create mode 100644 net/netfilter/nft_rbtree.c
 delete mode 100644 net/netfilter/nft_set.c

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index d26dfa345f49..677dd79380ed 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -6,6 +6,8 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netlink.h>
 
+#define NFT_JUMP_STACK_SIZE	16
+
 struct nft_pktinfo {
 	struct sk_buff			*skb;
 	const struct net_device		*in;
@@ -48,23 +50,22 @@ static inline void nft_data_debug(const struct nft_data *data)
 }
 
 /**
- *	struct nft_ctx - nf_tables rule context
+ *	struct nft_ctx - nf_tables rule/set context
  *
+ * 	@skb: netlink skb
+ * 	@nlh: netlink message header
  * 	@afi: address family info
  * 	@table: the table the chain is contained in
  * 	@chain: the chain the rule is contained in
  */
 struct nft_ctx {
+	const struct sk_buff		*skb;
+	const struct nlmsghdr		*nlh;
 	const struct nft_af_info	*afi;
 	const struct nft_table		*table;
 	const struct nft_chain		*chain;
 };
 
-enum nft_data_types {
-	NFT_DATA_VALUE,
-	NFT_DATA_VERDICT,
-};
-
 struct nft_data_desc {
 	enum nft_data_types		type;
 	unsigned int			len;
@@ -83,6 +84,11 @@ static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
 	return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE;
 }
 
+static inline enum nft_registers nft_type_to_reg(enum nft_data_types type)
+{
+	return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1;
+}
+
 extern int nft_validate_input_register(enum nft_registers reg);
 extern int nft_validate_output_register(enum nft_registers reg);
 extern int nft_validate_data_load(const struct nft_ctx *ctx,
@@ -90,6 +96,132 @@ extern int nft_validate_data_load(const struct nft_ctx *ctx,
 				  const struct nft_data *data,
 				  enum nft_data_types type);
 
+/**
+ *	struct nft_set_elem - generic representation of set elements
+ *
+ *	@cookie: implementation specific element cookie
+ *	@key: element key
+ *	@data: element data (maps only)
+ *	@flags: element flags (end of interval)
+ *
+ *	The cookie can be used to store a handle to the element for subsequent
+ *	removal.
+ */
+struct nft_set_elem {
+	void			*cookie;
+	struct nft_data		key;
+	struct nft_data		data;
+	u32			flags;
+};
+
+struct nft_set;
+struct nft_set_iter {
+	unsigned int	count;
+	unsigned int	skip;
+	int		err;
+	int		(*fn)(const struct nft_ctx *ctx,
+			      const struct nft_set *set,
+			      const struct nft_set_iter *iter,
+			      const struct nft_set_elem *elem);
+};
+
+/**
+ *	struct nft_set_ops - nf_tables set operations
+ *
+ *	@lookup: look up an element within the set
+ *	@insert: insert new element into set
+ *	@remove: remove element from set
+ *	@walk: iterate over all set elemeennts
+ *	@privsize: function to return size of set private data
+ *	@init: initialize private data of new set instance
+ *	@destroy: destroy private data of set instance
+ *	@list: nf_tables_set_ops list node
+ *	@owner: module reference
+ *	@features: features supported by the implementation
+ */
+struct nft_set_ops {
+	bool				(*lookup)(const struct nft_set *set,
+						  const struct nft_data *key,
+						  struct nft_data *data);
+	int				(*get)(const struct nft_set *set,
+					       struct nft_set_elem *elem);
+	int				(*insert)(const struct nft_set *set,
+						  const struct nft_set_elem *elem);
+	void				(*remove)(const struct nft_set *set,
+						  const struct nft_set_elem *elem);
+	void				(*walk)(const struct nft_ctx *ctx,
+						const struct nft_set *set,
+						struct nft_set_iter *iter);
+
+	unsigned int			(*privsize)(const struct nlattr * const nla[]);
+	int				(*init)(const struct nft_set *set,
+						const struct nlattr * const nla[]);
+	void				(*destroy)(const struct nft_set *set);
+
+	struct list_head		list;
+	struct module			*owner;
+	u32				features;
+};
+
+extern int nft_register_set(struct nft_set_ops *ops);
+extern void nft_unregister_set(struct nft_set_ops *ops);
+
+/**
+ * 	struct nft_set - nf_tables set instance
+ *
+ *	@list: table set list node
+ *	@bindings: list of set bindings
+ * 	@name: name of the set
+ * 	@ktype: key type (numeric type defined by userspace, not used in the kernel)
+ * 	@dtype: data type (verdict or numeric type defined by userspace)
+ * 	@ops: set ops
+ * 	@flags: set flags
+ * 	@klen: key length
+ * 	@dlen: data length
+ * 	@data: private set data
+ */
+struct nft_set {
+	struct list_head		list;
+	struct list_head		bindings;
+	char				name[IFNAMSIZ];
+	u32				ktype;
+	u32				dtype;
+	/* runtime data below here */
+	const struct nft_set_ops	*ops ____cacheline_aligned;
+	u16				flags;
+	u8				klen;
+	u8				dlen;
+	unsigned char			data[]
+		__attribute__((aligned(__alignof__(u64))));
+};
+
+static inline void *nft_set_priv(const struct nft_set *set)
+{
+	return (void *)set->data;
+}
+
+extern struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
+					    const struct nlattr *nla);
+
+/**
+ *	struct nft_set_binding - nf_tables set binding
+ *
+ *	@list: set bindings list node
+ *	@chain: chain containing the rule bound to the set
+ *
+ *	A set binding contains all information necessary for validation
+ *	of new elements added to a bound set.
+ */
+struct nft_set_binding {
+	struct list_head		list;
+	const struct nft_chain		*chain;
+};
+
+extern int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
+			      struct nft_set_binding *binding);
+extern void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+				 struct nft_set_binding *binding);
+
 /**
  *	struct nft_expr_ops - nf_tables expression operations
  *
@@ -115,7 +247,7 @@ struct nft_expr_ops {
 	void				(*destroy)(const struct nft_expr *expr);
 	int				(*dump)(struct sk_buff *skb,
 						const struct nft_expr *expr);
-
+	const struct nft_data *		(*get_verdict)(const struct nft_expr *expr);
 	struct list_head		list;
 	const char			*name;
 	struct module			*owner;
@@ -298,4 +430,7 @@ extern void nft_unregister_expr(struct nft_expr_ops *);
 #define MODULE_ALIAS_NFT_EXPR(name) \
 	MODULE_ALIAS("nft-expr-" name)
 
+#define MODULE_ALIAS_NFT_SET() \
+	MODULE_ALIAS("nft-set")
+
 #endif /* _NET_NF_TABLES_H */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index ec6d84a8ed1e..9e924014efe3 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -44,6 +44,12 @@ enum nft_verdicts {
  * @NFT_MSG_NEWRULE: create a new rule (enum nft_rule_attributes)
  * @NFT_MSG_GETRULE: get a rule (enum nft_rule_attributes)
  * @NFT_MSG_DELRULE: delete a rule (enum nft_rule_attributes)
+ * @NFT_MSG_NEWSET: create a new set (enum nft_set_attributes)
+ * @NFT_MSG_GETSET: get a set (enum nft_set_attributes)
+ * @NFT_MSG_DELSET: delete a set (enum nft_set_attributes)
+ * @NFT_MSG_NEWSETELEM: create a new set element (enum nft_set_elem_attributes)
+ * @NFT_MSG_GETSETELEM: get a set element (enum nft_set_elem_attributes)
+ * @NFT_MSG_DELSETELEM: delete a set element (enum nft_set_elem_attributes)
  */
 enum nf_tables_msg_types {
 	NFT_MSG_NEWTABLE,
@@ -55,9 +61,20 @@ enum nf_tables_msg_types {
 	NFT_MSG_NEWRULE,
 	NFT_MSG_GETRULE,
 	NFT_MSG_DELRULE,
+	NFT_MSG_NEWSET,
+	NFT_MSG_GETSET,
+	NFT_MSG_DELSET,
+	NFT_MSG_NEWSETELEM,
+	NFT_MSG_GETSETELEM,
+	NFT_MSG_DELSETELEM,
 	NFT_MSG_MAX,
 };
 
+/**
+ * enum nft_list_attributes - nf_tables generic list netlink attributes
+ *
+ * @NFTA_LIST_ELEM: list element (NLA_NESTED)
+ */
 enum nft_list_attributes {
 	NFTA_LIST_UNPEC,
 	NFTA_LIST_ELEM,
@@ -127,6 +144,113 @@ enum nft_rule_attributes {
 };
 #define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
 
+/**
+ * enum nft_set_flags - nf_tables set flags
+ *
+ * @NFT_SET_ANONYMOUS: name allocation, automatic cleanup on unlink
+ * @NFT_SET_CONSTANT: set contents may not change while bound
+ * @NFT_SET_INTERVAL: set contains intervals
+ * @NFT_SET_MAP: set is used as a dictionary
+ */
+enum nft_set_flags {
+	NFT_SET_ANONYMOUS		= 0x1,
+	NFT_SET_CONSTANT		= 0x2,
+	NFT_SET_INTERVAL		= 0x4,
+	NFT_SET_MAP			= 0x8,
+};
+
+/**
+ * enum nft_set_attributes - nf_tables set netlink attributes
+ *
+ * @NFTA_SET_TABLE: table name (NLA_STRING)
+ * @NFTA_SET_NAME: set name (NLA_STRING)
+ * @NFTA_SET_FLAGS: bitmask of enum nft_set_flags (NLA_U32)
+ * @NFTA_SET_KEY_TYPE: key data type, informational purpose only (NLA_U32)
+ * @NFTA_SET_KEY_LEN: key data length (NLA_U32)
+ * @NFTA_SET_DATA_TYPE: mapping data type (NLA_U32)
+ * @NFTA_SET_DATA_LEN: mapping data length (NLA_U32)
+ */
+enum nft_set_attributes {
+	NFTA_SET_UNSPEC,
+	NFTA_SET_TABLE,
+	NFTA_SET_NAME,
+	NFTA_SET_FLAGS,
+	NFTA_SET_KEY_TYPE,
+	NFTA_SET_KEY_LEN,
+	NFTA_SET_DATA_TYPE,
+	NFTA_SET_DATA_LEN,
+	__NFTA_SET_MAX
+};
+#define NFTA_SET_MAX		(__NFTA_SET_MAX - 1)
+
+/**
+ * enum nft_set_elem_flags - nf_tables set element flags
+ *
+ * @NFT_SET_ELEM_INTERVAL_END: element ends the previous interval
+ */
+enum nft_set_elem_flags {
+	NFT_SET_ELEM_INTERVAL_END	= 0x1,
+};
+
+/**
+ * enum nft_set_elem_attributes - nf_tables set element netlink attributes
+ *
+ * @NFTA_SET_ELEM_KEY: key value (NLA_NESTED: nft_data)
+ * @NFTA_SET_ELEM_DATA: data value of mapping (NLA_NESTED: nft_data_attributes)
+ * @NFTA_SET_ELEM_FLAGS: bitmask of nft_set_elem_flags (NLA_U32)
+ */
+enum nft_set_elem_attributes {
+	NFTA_SET_ELEM_UNSPEC,
+	NFTA_SET_ELEM_KEY,
+	NFTA_SET_ELEM_DATA,
+	NFTA_SET_ELEM_FLAGS,
+	__NFTA_SET_ELEM_MAX
+};
+#define NFTA_SET_ELEM_MAX	(__NFTA_SET_ELEM_MAX - 1)
+
+/**
+ * enum nft_set_elem_list_attributes - nf_tables set element list netlink attributes
+ *
+ * @NFTA_SET_ELEM_LIST_TABLE: table of the set to be changed (NLA_STRING)
+ * @NFTA_SET_ELEM_LIST_SET: name of the set to be changed (NLA_STRING)
+ * @NFTA_SET_ELEM_LIST_ELEMENTS: list of set elements (NLA_NESTED: nft_set_elem_attributes)
+ */
+enum nft_set_elem_list_attributes {
+	NFTA_SET_ELEM_LIST_UNSPEC,
+	NFTA_SET_ELEM_LIST_TABLE,
+	NFTA_SET_ELEM_LIST_SET,
+	NFTA_SET_ELEM_LIST_ELEMENTS,
+	__NFTA_SET_ELEM_LIST_MAX
+};
+#define NFTA_SET_ELEM_LIST_MAX	(__NFTA_SET_ELEM_LIST_MAX - 1)
+
+/**
+ * enum nft_data_types - nf_tables data types
+ *
+ * @NFT_DATA_VALUE: generic data
+ * @NFT_DATA_VERDICT: netfilter verdict
+ *
+ * The type of data is usually determined by the kernel directly and is not
+ * explicitly specified by userspace. The only difference are sets, where
+ * userspace specifies the key and mapping data types.
+ *
+ * The values 0xffffff00-0xffffffff are reserved for internally used types.
+ * The remaining range can be freely used by userspace to encode types, all
+ * values are equivalent to NFT_DATA_VALUE.
+ */
+enum nft_data_types {
+	NFT_DATA_VALUE,
+	NFT_DATA_VERDICT	= 0xffffff00U,
+};
+
+#define NFT_DATA_RESERVED_MASK	0xffffff00U
+
+/**
+ * enum nft_data_attributes - nf_tables data netlink attributes
+ *
+ * @NFTA_DATA_VALUE: generic data (NLA_BINARY)
+ * @NFTA_DATA_VERDICT: nf_tables verdict (NLA_NESTED: nft_verdict_attributes)
+ */
 enum nft_data_attributes {
 	NFTA_DATA_UNSPEC,
 	NFTA_DATA_VALUE,
@@ -275,58 +399,21 @@ enum nft_cmp_attributes {
 };
 #define NFTA_CMP_MAX		(__NFTA_CMP_MAX - 1)
 
-enum nft_set_elem_flags {
-	NFT_SE_INTERVAL_END	= 0x1,
-};
-
-enum nft_set_elem_attributes {
-	NFTA_SE_UNSPEC,
-	NFTA_SE_KEY,
-	NFTA_SE_DATA,
-	NFTA_SE_FLAGS,
-	__NFTA_SE_MAX
-};
-#define NFTA_SE_MAX		(__NFTA_SE_MAX - 1)
-
-enum nft_set_flags {
-	NFT_SET_INTERVAL	= 0x1,
-	NFT_SET_MAP		= 0x2,
-};
-
-enum nft_set_attributes {
-	NFTA_SET_UNSPEC,
-	NFTA_SET_FLAGS,
-	NFTA_SET_SREG,
-	NFTA_SET_DREG,
-	NFTA_SET_KLEN,
-	NFTA_SET_DLEN,
-	NFTA_SET_ELEMENTS,
-	__NFTA_SET_MAX
-};
-#define NFTA_SET_MAX		(__NFTA_SET_MAX - 1)
-
-enum nft_hash_flags {
-	NFT_HASH_MAP		= 0x1,
-};
-
-enum nft_hash_elem_attributes {
-	NFTA_HE_UNSPEC,
-	NFTA_HE_KEY,
-	NFTA_HE_DATA,
-	__NFTA_HE_MAX
-};
-#define NFTA_HE_MAX		(__NFTA_HE_MAX - 1)
-
-enum nft_hash_attributes {
-	NFTA_HASH_UNSPEC,
-	NFTA_HASH_FLAGS,
-	NFTA_HASH_SREG,
-	NFTA_HASH_DREG,
-	NFTA_HASH_KLEN,
-	NFTA_HASH_ELEMENTS,
-	__NFTA_HASH_MAX
-};
-#define NFTA_HASH_MAX		(__NFTA_HASH_MAX - 1)
+/**
+ * enum nft_lookup_attributes - nf_tables set lookup expression netlink attributes
+ *
+ * @NFTA_LOOKUP_SET: name of the set where to look for (NLA_STRING)
+ * @NFTA_LOOKUP_SREG: source register of the data to look for (NLA_U32: nft_registers)
+ * @NFTA_LOOKUP_DREG: destination register (NLA_U32: nft_registers)
+ */
+enum nft_lookup_attributes {
+	NFTA_LOOKUP_UNSPEC,
+	NFTA_LOOKUP_SET,
+	NFTA_LOOKUP_SREG,
+	NFTA_LOOKUP_DREG,
+	__NFTA_LOOKUP_MAX
+};
+#define NFTA_LOOKUP_MAX		(__NFTA_LOOKUP_MAX - 1)
 
 /**
  * enum nft_payload_bases - nf_tables payload expression offset bases
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index c271e1af93b5..aa184a46bbf3 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -430,13 +430,13 @@ config NFT_CT
 	depends on NF_CONNTRACK
 	tristate "Netfilter nf_tables conntrack module"
 
-config NFT_SET
+config NFT_RBTREE
 	depends on NF_TABLES
-	tristate "Netfilter nf_tables set module"
+	tristate "Netfilter nf_tables rbtree set module"
 
 config NFT_HASH
 	depends on NF_TABLES
-	tristate "Netfilter nf_tables hash module"
+	tristate "Netfilter nf_tables hash set module"
 
 config NFT_COUNTER
 	depends on NF_TABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 1ca3f3932826..b6b78754e4cc 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -75,7 +75,7 @@ obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
 #nf_tables-objs			+= nft_meta_target.o
-obj-$(CONFIG_NFT_SET)		+= nft_set.o
+obj-$(CONFIG_NFT_RBTREE)	+= nft_rbtree.o
 obj-$(CONFIG_NFT_HASH)		+= nft_hash.o
 obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o
 obj-$(CONFIG_NFT_LOG)		+= nft_log.o
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7d59c89c6c75..5092c817c222 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -315,6 +315,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 
 	nla_strlcpy(table->name, name, nla_len(name));
 	INIT_LIST_HEAD(&table->chains);
+	INIT_LIST_HEAD(&table->sets);
 
 	list_add_tail(&table->list, &afi->tables);
 	nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family);
@@ -409,6 +410,7 @@ again:
 	}
 
 	table->flags |= NFT_TABLE_BUILTIN;
+	INIT_LIST_HEAD(&table->sets);
 	list_add_tail(&table->list, &afi->tables);
 	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_NEWTABLE, family);
 	list_for_each_entry(chain, &table->chains, list)
@@ -820,10 +822,14 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 }
 
 static void nft_ctx_init(struct nft_ctx *ctx,
+			 const struct sk_buff *skb,
+			 const struct nlmsghdr *nlh,
 			 const struct nft_af_info *afi,
 			 const struct nft_table *table,
 			 const struct nft_chain *chain)
 {
+	ctx->skb   = skb;
+	ctx->nlh   = nlh;
 	ctx->afi   = afi;
 	ctx->table = table;
 	ctx->chain = chain;
@@ -1301,7 +1307,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	rule->handle = handle;
 	rule->dlen   = size;
 
-	nft_ctx_init(&ctx, afi, table, chain);
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain);
 	expr = nft_expr_first(rule);
 	for (i = 0; i < n; i++) {
 		err = nf_tables_newexpr(&ctx, &info[i], expr);
@@ -1392,6 +1398,939 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 	return 0;
 }
 
+/*
+ * Sets
+ */
+
+static LIST_HEAD(nf_tables_set_ops);
+
+int nft_register_set(struct nft_set_ops *ops)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_tail(&ops->list, &nf_tables_set_ops);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_set);
+
+void nft_unregister_set(struct nft_set_ops *ops)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del(&ops->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_set);
+
+static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const nla[])
+{
+	const struct nft_set_ops *ops;
+	u32 features;
+
+#ifdef CONFIG_MODULES
+	if (list_empty(&nf_tables_set_ops)) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-set");
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (!list_empty(&nf_tables_set_ops))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	features = 0;
+	if (nla[NFTA_SET_FLAGS] != NULL) {
+		features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+		features &= NFT_SET_INTERVAL | NFT_SET_MAP;
+	}
+
+	// FIXME: implement selection properly
+	list_for_each_entry(ops, &nf_tables_set_ops, list) {
+		if ((ops->features & features) != features)
+			continue;
+		if (!try_module_get(ops->owner))
+			continue;
+		return ops;
+	}
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
+	[NFTA_SET_TABLE]		= { .type = NLA_STRING },
+	[NFTA_SET_NAME]			= { .type = NLA_STRING },
+	[NFTA_SET_FLAGS]		= { .type = NLA_U32 },
+	[NFTA_SET_KEY_TYPE]		= { .type = NLA_U32 },
+	[NFTA_SET_KEY_LEN]		= { .type = NLA_U32 },
+	[NFTA_SET_DATA_TYPE]		= { .type = NLA_U32 },
+	[NFTA_SET_DATA_LEN]		= { .type = NLA_U32 },
+};
+
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+				     const struct sk_buff *skb,
+				     const struct nlmsghdr *nlh,
+				     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table = NULL;
+
+	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	if (nla[NFTA_SET_TABLE] != NULL) {
+		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], false);
+		if (IS_ERR(table))
+			return PTR_ERR(table);
+	}
+
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL);
+	return 0;
+}
+
+struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
+				     const struct nlattr *nla)
+{
+	struct nft_set *set;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	list_for_each_entry(set, &table->sets, list) {
+		if (!nla_strcmp(nla, set->name))
+			return set;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
+				    const char *name)
+{
+	const struct nft_set *i;
+	const char *p;
+	unsigned long *inuse;
+	unsigned int n = 0;
+
+	p = strnchr(name, IFNAMSIZ, '%');
+	if (p != NULL) {
+		if (p[1] != 'd' || strchr(p + 2, '%'))
+			return -EINVAL;
+
+		inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+		if (inuse == NULL)
+			return -ENOMEM;
+
+		list_for_each_entry(i, &ctx->table->sets, list) {
+			if (!sscanf(i->name, name, &n))
+				continue;
+			if (n < 0 || n > BITS_PER_LONG * PAGE_SIZE)
+				continue;
+			set_bit(n, inuse);
+		}
+
+		n = find_first_zero_bit(inuse, BITS_PER_LONG * PAGE_SIZE);
+		free_page((unsigned long)inuse);
+	}
+
+	snprintf(set->name, sizeof(set->name), name, n);
+	list_for_each_entry(i, &ctx->table->sets, list) {
+		if (!strcmp(set->name, i->name))
+			return -ENFILE;
+	}
+	return 0;
+}
+
+static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
+			      const struct nft_set *set, u16 event, u16 flags)
+{
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+	u32 portid = NETLINK_CB(ctx->skb).portid;
+	u32 seq = ctx->nlh->nlmsg_seq;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+			flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= ctx->afi->family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= 0;
+
+	if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
+		goto nla_put_failure;
+	if (nla_put_string(skb, NFTA_SET_NAME, set->name))
+		goto nla_put_failure;
+	if (set->flags != 0)
+		if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
+			goto nla_put_failure;
+
+	if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen)))
+		goto nla_put_failure;
+	if (set->flags & NFT_SET_MAP) {
+		if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype)))
+			goto nla_put_failure;
+		if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
+			goto nla_put_failure;
+	}
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+static int nf_tables_set_notify(const struct nft_ctx *ctx,
+				const struct nft_set *set,
+				int event)
+{
+	struct sk_buff *skb;
+	u32 portid = NETLINK_CB(ctx->skb).portid;
+	struct net *net = sock_net(ctx->skb->sk);
+	bool report;
+	int err;
+
+	report = nlmsg_report(ctx->nlh);
+	if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+		return 0;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_set(skb, ctx, set, event, 0);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report,
+			     GFP_KERNEL);
+err:
+	if (err < 0)
+		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+	return err;
+}
+
+static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	const struct nft_set *set;
+	unsigned int idx = 0, s_idx = cb->args[0];
+
+	if (cb->args[1])
+		return skb->len;
+
+	list_for_each_entry(set, &ctx->table->sets, list) {
+		if (idx < s_idx)
+			goto cont;
+		if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET,
+				       NLM_F_MULTI) < 0) {
+			cb->args[0] = idx;
+			goto done;
+		}
+cont:
+		idx++;
+	}
+	cb->args[1] = 1;
+done:
+	return skb->len;
+}
+
+static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	const struct nft_set *set;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
+
+	if (cb->args[1])
+		return skb->len;
+
+	list_for_each_entry(table, &ctx->afi->tables, list) {
+		if (cur_table && cur_table != table)
+			continue;
+
+		ctx->table = table;
+		list_for_each_entry(set, &ctx->table->sets, list) {
+			if (idx < s_idx)
+				goto cont;
+			if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET,
+					       NLM_F_MULTI) < 0) {
+				cb->args[0] = idx;
+				cb->args[2] = (unsigned long) table;
+				goto done;
+			}
+cont:
+			idx++;
+		}
+	}
+	cb->args[1] = 1;
+done:
+	return skb->len;
+}
+
+static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	struct nlattr *nla[NFTA_SET_MAX + 1];
+	struct nft_ctx ctx;
+	int err, ret;
+
+	err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_MAX,
+			  nft_set_policy);
+	if (err < 0)
+		return err;
+
+	err = nft_ctx_init_from_setattr(&ctx, cb->skb, cb->nlh, (void *)nla);
+	if (err < 0)
+		return err;
+
+	if (ctx.table == NULL)
+		ret = nf_tables_dump_sets_all(&ctx, skb, cb);
+	else
+		ret = nf_tables_dump_sets_table(&ctx, skb, cb);
+
+	return ret;
+}
+
+static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
+			    const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	const struct nft_set *set;
+	struct nft_ctx ctx;
+	struct sk_buff *skb2;
+	int err;
+
+	/* Verify existance before starting dump */
+	err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+	if (err < 0)
+		return err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_sets,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		return -ENOMEM;
+
+	err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
+			    const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_set_ops *ops;
+	const struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_set *set;
+	struct nft_ctx ctx;
+	char name[IFNAMSIZ];
+	unsigned int size;
+	bool create;
+	u32 ktype, klen, dlen, dtype, flags;
+	int err;
+
+	if (nla[NFTA_SET_TABLE] == NULL ||
+	    nla[NFTA_SET_NAME] == NULL ||
+	    nla[NFTA_SET_KEY_LEN] == NULL)
+		return -EINVAL;
+
+	ktype = NFT_DATA_VALUE;
+	if (nla[NFTA_SET_KEY_TYPE] != NULL) {
+		ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
+		if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
+			return -EINVAL;
+	}
+
+	klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+	if (klen == 0 || klen > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	flags = 0;
+	if (nla[NFTA_SET_FLAGS] != NULL) {
+		flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+		if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
+			      NFT_SET_INTERVAL | NFT_SET_MAP))
+			return -EINVAL;
+	}
+
+	dtype = 0;
+	dlen  = 0;
+	if (nla[NFTA_SET_DATA_TYPE] != NULL) {
+		if (!(flags & NFT_SET_MAP))
+			return -EINVAL;
+
+		dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
+		if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
+		    dtype != NFT_DATA_VERDICT)
+			return -EINVAL;
+
+		if (dtype != NFT_DATA_VERDICT) {
+			if (nla[NFTA_SET_DATA_LEN] == NULL)
+				return -EINVAL;
+			dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
+			if (dlen == 0 ||
+			    dlen > FIELD_SIZEOF(struct nft_data, data))
+				return -EINVAL;
+		} else
+			dlen = sizeof(struct nft_data);
+	} else if (flags & NFT_SET_MAP)
+		return -EINVAL;
+
+	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, create);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], create);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL);
+
+	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
+	if (IS_ERR(set)) {
+		if (PTR_ERR(set) != -ENOENT)
+			return PTR_ERR(set);
+		set = NULL;
+	}
+
+	if (set != NULL) {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EOPNOTSUPP;
+		return 0;
+	}
+
+	if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+		return -ENOENT;
+
+	ops = nft_select_set_ops(nla);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	size = 0;
+	if (ops->privsize != NULL)
+		size = ops->privsize(nla);
+
+	err = -ENOMEM;
+	set = kzalloc(sizeof(*set) + size, GFP_KERNEL);
+	if (set == NULL)
+		goto err1;
+
+	nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name));
+	err = nf_tables_set_alloc_name(&ctx, set, name);
+	if (err < 0)
+		goto err2;
+
+	INIT_LIST_HEAD(&set->bindings);
+	set->ops   = ops;
+	set->ktype = ktype;
+	set->klen  = klen;
+	set->dtype = dtype;
+	set->dlen  = dlen;
+	set->flags = flags;
+
+	err = ops->init(set, nla);
+	if (err < 0)
+		goto err2;
+
+	list_add_tail(&set->list, &table->sets);
+	nf_tables_set_notify(&ctx, set, NFT_MSG_NEWSET);
+	return 0;
+
+err2:
+	kfree(set);
+err1:
+	module_put(ops->owner);
+	return err;
+}
+
+static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
+{
+	list_del(&set->list);
+	if (!(set->flags & NFT_SET_ANONYMOUS))
+		nf_tables_set_notify(ctx, set, NFT_MSG_DELSET);
+
+	set->ops->destroy(set);
+	module_put(set->ops->owner);
+	kfree(set);
+}
+
+static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
+			    const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	struct nft_set *set;
+	struct nft_ctx ctx;
+	int err;
+
+	if (nla[NFTA_SET_TABLE] == NULL)
+		return -EINVAL;
+
+	err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+	if (err < 0)
+		return err;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+	if (!list_empty(&set->bindings))
+		return -EBUSY;
+
+	nf_tables_set_destroy(&ctx, set);
+	return 0;
+}
+
+static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
+					const struct nft_set *set,
+					const struct nft_set_iter *iter,
+					const struct nft_set_elem *elem)
+{
+	enum nft_registers dreg;
+
+	dreg = nft_type_to_reg(set->dtype);
+	return nft_validate_data_load(ctx, dreg, &elem->data, set->dtype);
+}
+
+int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
+		       struct nft_set_binding *binding)
+{
+	struct nft_set_binding *i;
+	struct nft_set_iter iter;
+
+	if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS)
+		return -EBUSY;
+
+	if (set->flags & NFT_SET_MAP) {
+		/* If the set is already bound to the same chain all
+		 * jumps are already validated for that chain.
+		 */
+		list_for_each_entry(i, &set->bindings, list) {
+			if (i->chain == binding->chain)
+				goto bind;
+		}
+
+		iter.skip 	= 0;
+		iter.count	= 0;
+		iter.err	= 0;
+		iter.fn		= nf_tables_bind_check_setelem;
+
+		set->ops->walk(ctx, set, &iter);
+		if (iter.err < 0) {
+			/* Destroy anonymous sets if binding fails */
+			if (set->flags & NFT_SET_ANONYMOUS)
+				nf_tables_set_destroy(ctx, set);
+
+			return iter.err;
+		}
+	}
+bind:
+	binding->chain = ctx->chain;
+	list_add_tail(&binding->list, &set->bindings);
+	return 0;
+}
+
+void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+			  struct nft_set_binding *binding)
+{
+	list_del(&binding->list);
+
+	if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS)
+		nf_tables_set_destroy(ctx, set);
+}
+
+/*
+ * Set elements
+ */
+
+static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
+	[NFTA_SET_ELEM_KEY]		= { .type = NLA_NESTED },
+	[NFTA_SET_ELEM_DATA]		= { .type = NLA_NESTED },
+	[NFTA_SET_ELEM_FLAGS]		= { .type = NLA_U32 },
+};
+
+static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
+	[NFTA_SET_ELEM_LIST_TABLE]	= { .type = NLA_STRING },
+	[NFTA_SET_ELEM_LIST_SET]	= { .type = NLA_STRING },
+	[NFTA_SET_ELEM_LIST_ELEMENTS]	= { .type = NLA_NESTED },
+};
+
+static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
+				      const struct sk_buff *skb,
+				      const struct nlmsghdr *nlh,
+				      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+
+	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL);
+	return 0;
+}
+
+static int nf_tables_fill_setelem(struct sk_buff *skb,
+				  const struct nft_set *set,
+				  const struct nft_set_elem *elem)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, &elem->key, NFT_DATA_VALUE,
+			  set->klen) < 0)
+		goto nla_put_failure;
+
+	if (set->flags & NFT_SET_MAP &&
+	    !(elem->flags & NFT_SET_ELEM_INTERVAL_END) &&
+	    nft_data_dump(skb, NFTA_SET_ELEM_DATA, &elem->data,
+			  set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
+			  set->dlen) < 0)
+		goto nla_put_failure;
+
+	if (elem->flags != 0)
+		if (nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(elem->flags)))
+			goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -EMSGSIZE;
+}
+
+struct nft_set_dump_args {
+	const struct netlink_callback	*cb;
+	struct nft_set_iter		iter;
+	struct sk_buff			*skb;
+};
+
+static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
+				  const struct nft_set *set,
+				  const struct nft_set_iter *iter,
+				  const struct nft_set_elem *elem)
+{
+	struct nft_set_dump_args *args;
+
+	args = container_of(iter, struct nft_set_dump_args, iter);
+	return nf_tables_fill_setelem(args->skb, set, elem);
+}
+
+static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct nft_set *set;
+	struct nft_set_dump_args args;
+	struct nft_ctx ctx;
+	struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+	struct nlattr *nest;
+	u32 portid, seq;
+	int event, err;
+
+	nfmsg = nlmsg_data(cb->nlh);
+	err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_ELEM_LIST_MAX,
+			  nft_set_elem_list_policy);
+	if (err < 0)
+		return err;
+
+	err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla);
+	if (err < 0)
+		return err;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+
+	event  = NFT_MSG_NEWSETELEM;
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	portid = NETLINK_CB(cb->skb).portid;
+	seq    = cb->nlh->nlmsg_seq;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+			NLM_F_MULTI);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = NFPROTO_UNSPEC;
+	nfmsg->version      = NFNETLINK_V0;
+	nfmsg->res_id       = 0;
+
+	if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name))
+		goto nla_put_failure;
+	if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
+		goto nla_put_failure;
+
+	nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	args.cb		= cb;
+	args.skb	= skb;
+	args.iter.skip	= cb->args[0];
+	args.iter.count	= 0;
+	args.iter.err   = 0;
+	args.iter.fn	= nf_tables_dump_setelem;
+	set->ops->walk(&ctx, set, &args.iter);
+
+	nla_nest_end(skb, nest);
+	nlmsg_end(skb, nlh);
+
+	if (args.iter.err && args.iter.err != -EMSGSIZE)
+		return args.iter.err;
+	if (args.iter.count == cb->args[0])
+		return 0;
+
+	cb->args[0] = args.iter.count;
+	return skb->len;
+
+nla_put_failure:
+	return -ENOSPC;
+}
+
+static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb,
+				const struct nlmsghdr *nlh,
+				const struct nlattr * const nla[])
+{
+	const struct nft_set *set;
+	struct nft_ctx ctx;
+	int err;
+
+	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla);
+	if (err < 0)
+		return err;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_set,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+	return -EOPNOTSUPP;
+}
+
+static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set,
+			    const struct nlattr *attr)
+{
+	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+	struct nft_data_desc d1, d2;
+	struct nft_set_elem elem;
+	struct nft_set_binding *binding;
+	enum nft_registers dreg;
+	int err;
+
+	err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
+			       nft_set_elem_policy);
+	if (err < 0)
+		return err;
+
+	if (nla[NFTA_SET_ELEM_KEY] == NULL)
+		return -EINVAL;
+
+	elem.flags = 0;
+	if (nla[NFTA_SET_ELEM_FLAGS] != NULL) {
+		elem.flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS]));
+		if (elem.flags & ~NFT_SET_ELEM_INTERVAL_END)
+			return -EINVAL;
+	}
+
+	if (set->flags & NFT_SET_MAP) {
+		if (nla[NFTA_SET_ELEM_DATA] == NULL &&
+		    !(elem.flags & NFT_SET_ELEM_INTERVAL_END))
+			return -EINVAL;
+	} else {
+		if (nla[NFTA_SET_ELEM_DATA] != NULL)
+			return -EINVAL;
+	}
+
+	err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]);
+	if (err < 0)
+		goto err1;
+	err = -EINVAL;
+	if (d1.type != NFT_DATA_VALUE || d1.len != set->klen)
+		goto err2;
+
+	err = -EEXIST;
+	if (set->ops->get(set, &elem) == 0)
+		goto err2;
+
+	if (nla[NFTA_SET_ELEM_DATA] != NULL) {
+		err = nft_data_init(ctx, &elem.data, &d2, nla[NFTA_SET_ELEM_DATA]);
+		if (err < 0)
+			goto err2;
+
+		err = -EINVAL;
+		if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen)
+			goto err3;
+
+		dreg = nft_type_to_reg(set->dtype);
+		list_for_each_entry(binding, &set->bindings, list) {
+			struct nft_ctx bind_ctx = {
+				.afi	= ctx->afi,
+				.table	= ctx->table,
+				.chain	= binding->chain,
+			};
+
+			err = nft_validate_data_load(&bind_ctx, dreg,
+						     &elem.data, d2.type);
+			if (err < 0)
+				goto err3;
+		}
+	}
+
+	err = set->ops->insert(set, &elem);
+	if (err < 0)
+		goto err3;
+
+	return 0;
+
+err3:
+	if (nla[NFTA_SET_ELEM_DATA] != NULL)
+		nft_data_uninit(&elem.data, d2.type);
+err2:
+	nft_data_uninit(&elem.key, d1.type);
+err1:
+	return err;
+}
+
+static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
+				const struct nlmsghdr *nlh,
+				const struct nlattr * const nla[])
+{
+	const struct nlattr *attr;
+	struct nft_set *set;
+	struct nft_ctx ctx;
+	int rem, err;
+
+	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla);
+	if (err < 0)
+		return err;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+		return -EBUSY;
+
+	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+		err = nft_add_set_elem(&ctx, set, attr);
+		if (err < 0)
+			return err;
+	}
+	return 0;
+}
+
+static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set,
+			   const struct nlattr *attr)
+{
+	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+	struct nft_data_desc desc;
+	struct nft_set_elem elem;
+	int err;
+
+	err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
+			       nft_set_elem_policy);
+	if (err < 0)
+		goto err1;
+
+	err = -EINVAL;
+	if (nla[NFTA_SET_ELEM_KEY] == NULL)
+		goto err1;
+
+	err = nft_data_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]);
+	if (err < 0)
+		goto err1;
+
+	err = -EINVAL;
+	if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
+		goto err2;
+
+	err = set->ops->get(set, &elem);
+	if (err < 0)
+		goto err2;
+
+	set->ops->remove(set, &elem);
+
+	nft_data_uninit(&elem.key, NFT_DATA_VALUE);
+	if (set->flags & NFT_SET_MAP)
+		nft_data_uninit(&elem.data, set->dtype);
+
+err2:
+	nft_data_uninit(&elem.key, desc.type);
+err1:
+	return err;
+}
+
+static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
+				const struct nlmsghdr *nlh,
+				const struct nlattr * const nla[])
+{
+	const struct nlattr *attr;
+	struct nft_set *set;
+	struct nft_ctx ctx;
+	int rem, err;
+
+	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla);
+	if (err < 0)
+		return err;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+		return -EBUSY;
+
+	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+		err = nft_del_setelem(&ctx, set, attr);
+		if (err < 0)
+			return err;
+	}
+	return 0;
+}
+
 static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 	[NFT_MSG_NEWTABLE] = {
 		.call		= nf_tables_newtable,
@@ -1438,6 +2377,36 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
 	},
+	[NFT_MSG_NEWSET] = {
+		.call		= nf_tables_newset,
+		.attr_count	= NFTA_SET_MAX,
+		.policy		= nft_set_policy,
+	},
+	[NFT_MSG_GETSET] = {
+		.call		= nf_tables_getset,
+		.attr_count	= NFTA_SET_MAX,
+		.policy		= nft_set_policy,
+	},
+	[NFT_MSG_DELSET] = {
+		.call		= nf_tables_delset,
+		.attr_count	= NFTA_SET_MAX,
+		.policy		= nft_set_policy,
+	},
+	[NFT_MSG_NEWSETELEM] = {
+		.call		= nf_tables_newsetelem,
+		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
+		.policy		= nft_set_elem_list_policy,
+	},
+	[NFT_MSG_GETSETELEM] = {
+		.call		= nf_tables_getsetelem,
+		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
+		.policy		= nft_set_elem_list_policy,
+	},
+	[NFT_MSG_DELSETELEM] = {
+		.call		= nf_tables_delsetelem,
+		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
+		.policy		= nft_set_elem_list_policy,
+	},
 };
 
 static const struct nfnetlink_subsystem nf_tables_subsys = {
@@ -1447,6 +2416,90 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.cb		= nf_tables_cb,
 };
 
+/*
+ * Loop detection - walk through the ruleset beginning at the destination chain
+ * of a new jump until either the source chain is reached (loop) or all
+ * reachable chains have been traversed.
+ *
+ * The loop check is performed whenever a new jump verdict is added to an
+ * expression or verdict map or a verdict map is bound to a new chain.
+ */
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+				 const struct nft_chain *chain);
+
+static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
+					const struct nft_set *set,
+					const struct nft_set_iter *iter,
+					const struct nft_set_elem *elem)
+{
+	switch (elem->data.verdict) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		return nf_tables_check_loops(ctx, elem->data.chain);
+	default:
+		return 0;
+	}
+}
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+				 const struct nft_chain *chain)
+{
+	const struct nft_rule *rule;
+	const struct nft_expr *expr, *last;
+	const struct nft_data *data;
+	const struct nft_set *set;
+	struct nft_set_binding *binding;
+	struct nft_set_iter iter;
+	int err;
+
+	if (ctx->chain == chain)
+		return -ELOOP;
+
+	list_for_each_entry(rule, &chain->rules, list) {
+		nft_rule_for_each_expr(expr, last, rule) {
+			if (!expr->ops->get_verdict)
+				continue;
+
+			data = expr->ops->get_verdict(expr);
+			if (data == NULL)
+				break;
+
+			switch (data->verdict) {
+			case NFT_JUMP:
+			case NFT_GOTO:
+				err = nf_tables_check_loops(ctx, data->chain);
+				if (err < 0)
+					return err;
+			default:
+				break;
+			}
+		}
+	}
+
+	list_for_each_entry(set, &ctx->table->sets, list) {
+		if (!(set->flags & NFT_SET_MAP) ||
+		    set->dtype != NFT_DATA_VERDICT)
+			continue;
+
+		list_for_each_entry(binding, &set->bindings, list) {
+			if (binding->chain != chain)
+				continue;
+
+			iter.skip 	= 0;
+			iter.count	= 0;
+			iter.err	= 0;
+			iter.fn		= nf_tables_loop_check_setelem;
+
+			set->ops->walk(ctx, set, &iter);
+			if (iter.err < 0)
+				return iter.err;
+		}
+	}
+
+	return 0;
+}
+
 /**
  *	nft_validate_input_register - validate an expressions' input register
  *
@@ -1500,11 +2553,25 @@ int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg,
 			   const struct nft_data *data,
 			   enum nft_data_types type)
 {
+	int err;
+
 	switch (reg) {
 	case NFT_REG_VERDICT:
 		if (data == NULL || type != NFT_DATA_VERDICT)
 			return -EINVAL;
-		// FIXME: do loop detection
+
+		if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) {
+			err = nf_tables_check_loops(ctx, data->chain);
+			if (err < 0)
+				return err;
+
+			if (ctx->chain->level + 1 > data->chain->level) {
+				if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
+					return -EMLINK;
+				data->chain->level = ctx->chain->level + 1;
+			}
+		}
+
 		return 0;
 	default:
 		if (data != NULL && type != NFT_DATA_VALUE)
@@ -1555,11 +2622,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 		if (chain->flags & NFT_BASE_CHAIN)
 			return -EOPNOTSUPP;
 
-		if (ctx->chain->level + 1 > chain->level) {
-			if (ctx->chain->level + 1 == 16)
-				return -EMLINK;
-			chain->level = ctx->chain->level + 1;
-		}
 		chain->use++;
 		data->chain = chain;
 		desc->len = sizeof(data);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index bc7fb85d4002..fd0ecd3255c1 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -20,8 +20,6 @@
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
 
-#define NFT_JUMP_STACK_SIZE	16
-
 unsigned int nft_do_chain(const struct nf_hook_ops *ops,
 			  struct sk_buff *skb,
 			  const struct net_device *in,
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 67cc502881f1..3d3f8fce10a5 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -21,11 +21,6 @@
 struct nft_hash {
 	struct hlist_head	*hash;
 	unsigned int		hsize;
-	enum nft_registers	sreg:8;
-	enum nft_registers	dreg:8;
-	u8			klen;
-	u8			dlen;
-	u16			flags;
 };
 
 struct nft_hash_elem {
@@ -42,213 +37,140 @@ static unsigned int nft_hash_data(const struct nft_data *data,
 {
 	unsigned int h;
 
-	// FIXME: can we reasonably guarantee the upper bits are fixed?
-	h = jhash2(data->data, len >> 2, nft_hash_rnd);
+	h = jhash(data->data, len, nft_hash_rnd);
 	return ((u64)h * hsize) >> 32;
 }
 
-static void nft_hash_eval(const struct nft_expr *expr,
-			  struct nft_data data[NFT_REG_MAX + 1],
-			  const struct nft_pktinfo *pkt)
+static bool nft_hash_lookup(const struct nft_set *set,
+			    const struct nft_data *key,
+			    struct nft_data *data)
 {
-	const struct nft_hash *priv = nft_expr_priv(expr);
-	const struct nft_hash_elem *elem;
-	const struct nft_data *key = &data[priv->sreg];
+	const struct nft_hash *priv = nft_set_priv(set);
+	const struct nft_hash_elem *he;
 	unsigned int h;
 
-	h = nft_hash_data(key, priv->hsize, priv->klen);
-	hlist_for_each_entry(elem, &priv->hash[h], hnode) {
-		if (nft_data_cmp(&elem->key, key, priv->klen))
+	h = nft_hash_data(key, priv->hsize, set->klen);
+	hlist_for_each_entry(he, &priv->hash[h], hnode) {
+		if (nft_data_cmp(&he->key, key, set->klen))
 			continue;
-		if (priv->flags & NFT_HASH_MAP)
-			nft_data_copy(&data[priv->dreg], elem->data);
-		return;
+		if (set->flags & NFT_SET_MAP)
+			nft_data_copy(data, he->data);
+		return true;
 	}
-	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+	return false;
 }
 
-static void nft_hash_elem_destroy(const struct nft_expr *expr,
-				  struct nft_hash_elem *elem)
+static void nft_hash_elem_destroy(const struct nft_set *set,
+				  struct nft_hash_elem *he)
 {
-	const struct nft_hash *priv = nft_expr_priv(expr);
-
-	nft_data_uninit(&elem->key, NFT_DATA_VALUE);
-	if (priv->flags & NFT_HASH_MAP)
-		nft_data_uninit(elem->data, nft_dreg_to_type(priv->dreg));
-	kfree(elem);
+	nft_data_uninit(&he->key, NFT_DATA_VALUE);
+	if (set->flags & NFT_SET_MAP)
+		nft_data_uninit(he->data, set->dtype);
+	kfree(he);
 }
 
-static const struct nla_policy nft_he_policy[NFTA_HE_MAX + 1] = {
-	[NFTA_HE_KEY]		= { .type = NLA_NESTED },
-	[NFTA_HE_DATA]		= { .type = NLA_NESTED },
-};
-
-static int nft_hash_elem_init(const struct nft_ctx *ctx,
-			      const struct nft_expr *expr,
-			      const struct nlattr *nla,
-			      struct nft_hash_elem **new)
+static int nft_hash_insert(const struct nft_set *set,
+			   const struct nft_set_elem *elem)
 {
-	struct nft_hash *priv = nft_expr_priv(expr);
-	struct nlattr *tb[NFTA_HE_MAX + 1];
-	struct nft_hash_elem *elem;
-	struct nft_data_desc d1, d2;
-	unsigned int size;
-	int err;
+	struct nft_hash *priv = nft_set_priv(set);
+	struct nft_hash_elem *he;
+	unsigned int size, h;
 
-	err = nla_parse_nested(tb, NFTA_HE_MAX, nla, nft_he_policy);
-	if (err < 0)
-		return err;
-
-	if (tb[NFTA_HE_KEY] == NULL)
+	if (elem->flags != 0)
 		return -EINVAL;
-	size = sizeof(*elem);
-
-	if (priv->flags & NFT_HASH_MAP) {
-		if (tb[NFTA_HE_DATA] == NULL)
-			return -EINVAL;
-		size += sizeof(elem->data[0]);
-	} else {
-		if (tb[NFTA_HE_DATA] != NULL)
-			return -EINVAL;
-	}
 
-	elem = kzalloc(size, GFP_KERNEL);
-	if (elem == NULL)
+	size = sizeof(*he);
+	if (set->flags & NFT_SET_MAP)
+		size += sizeof(he->data[0]);
+
+	he = kzalloc(size, GFP_KERNEL);
+	if (he == NULL)
 		return -ENOMEM;
 
-	err = nft_data_init(ctx, &elem->key, &d1, tb[NFTA_HE_KEY]);
-	if (err < 0)
-		goto err1;
-	err = -EINVAL;
-	if (d1.type != NFT_DATA_VALUE || d1.len != priv->klen)
-		goto err2;
-
-	if (tb[NFTA_HE_DATA] != NULL) {
-		err = nft_data_init(ctx, elem->data, &d2, tb[NFTA_HE_DATA]);
-		if (err < 0)
-			goto err2;
-		err = nft_validate_data_load(ctx, priv->dreg, elem->data, d2.type);
-		if (err < 0)
-			goto err3;
-	}
+	nft_data_copy(&he->key, &elem->key);
+	if (set->flags & NFT_SET_MAP)
+		nft_data_copy(he->data, &elem->data);
 
-	*new = elem;
+	h = nft_hash_data(&he->key, priv->hsize, set->klen);
+	hlist_add_head_rcu(&he->hnode, &priv->hash[h]);
 	return 0;
-
-err3:
-	nft_data_uninit(elem->data, d2.type);
-err2:
-	nft_data_uninit(&elem->key, d1.type);
-err1:
-	kfree(elem);
-	return err;
 }
 
-static int nft_hash_elem_dump(struct sk_buff *skb, const struct nft_expr *expr,
-			      const struct nft_hash_elem *elem)
-
+static void nft_hash_remove(const struct nft_set *set,
+			    const struct nft_set_elem *elem)
 {
-	const struct nft_hash *priv = nft_expr_priv(expr);
-	struct nlattr *nest;
+	struct nft_hash_elem *he = elem->cookie;
 
-	nest = nla_nest_start(skb, NFTA_LIST_ELEM);
-	if (nest == NULL)
-		goto nla_put_failure;
-
-	if (nft_data_dump(skb, NFTA_HE_KEY, &elem->key,
-			  NFT_DATA_VALUE, priv->klen) < 0)
-		goto nla_put_failure;
+	hlist_del_rcu(&he->hnode);
+	kfree(he);
+}
 
-	if (priv->flags & NFT_HASH_MAP) {
-		if (nft_data_dump(skb, NFTA_HE_DATA, elem->data,
-				  NFT_DATA_VALUE, priv->dlen) < 0)
-			goto nla_put_failure;
-	}
+static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
+{
+	const struct nft_hash *priv = nft_set_priv(set);
+	struct nft_hash_elem *he;
+	unsigned int h;
 
-	nla_nest_end(skb, nest);
-	return 0;
+	h = nft_hash_data(&elem->key, priv->hsize, set->klen);
+	hlist_for_each_entry(he, &priv->hash[h], hnode) {
+		if (nft_data_cmp(&he->key, &elem->key, set->klen))
+			continue;
 
-nla_put_failure:
-	return -1;
+		elem->cookie = he;
+		elem->flags  = 0;
+		if (set->flags & NFT_SET_MAP)
+			nft_data_copy(&elem->data, he->data);
+		return 0;
+	}
+	return -ENOENT;
 }
 
-static void nft_hash_destroy(const struct nft_ctx *ctx,
-			     const struct nft_expr *expr)
+static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
+			  struct nft_set_iter *iter)
 {
-	const struct nft_hash *priv = nft_expr_priv(expr);
-	const struct hlist_node *next;
-	struct nft_hash_elem *elem;
+	const struct nft_hash *priv = nft_set_priv(set);
+	const struct nft_hash_elem *he;
+	struct nft_set_elem elem;
 	unsigned int i;
 
 	for (i = 0; i < priv->hsize; i++) {
-		hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) {
-			hlist_del(&elem->hnode);
-			nft_hash_elem_destroy(expr, elem);
+		hlist_for_each_entry(he, &priv->hash[i], hnode) {
+			if (iter->count < iter->skip)
+				goto cont;
+
+			memcpy(&elem.key, &he->key, sizeof(elem.key));
+			if (set->flags & NFT_SET_MAP)
+				memcpy(&elem.data, he->data, sizeof(elem.data));
+			elem.flags = 0;
+
+			iter->err = iter->fn(ctx, set, iter, &elem);
+			if (iter->err < 0)
+				return;
+cont:
+			iter->count++;
 		}
 	}
-	kfree(priv->hash);
 }
 
-static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
-	[NFTA_HASH_FLAGS]	= { .type = NLA_U32 },
-	[NFTA_HASH_SREG]	= { .type = NLA_U32 },
-	[NFTA_HASH_DREG]	= { .type = NLA_U32 },
-	[NFTA_HASH_KLEN]	= { .type = NLA_U32 },
-	[NFTA_HASH_ELEMENTS]	= { .type = NLA_NESTED },
-};
+static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
+{
+	return sizeof(struct nft_hash);
+}
 
-static int nft_hash_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+static int nft_hash_init(const struct nft_set *set,
 			 const struct nlattr * const tb[])
 {
-	struct nft_hash *priv = nft_expr_priv(expr);
-	struct nft_hash_elem *elem, *uninitialized_var(new);
-	const struct nlattr *nla;
+	struct nft_hash *priv = nft_set_priv(set);
 	unsigned int cnt, i;
-	unsigned int h;
-	int err, rem;
 
 	if (unlikely(!nft_hash_rnd_initted)) {
 		get_random_bytes(&nft_hash_rnd, 4);
 		nft_hash_rnd_initted = true;
 	}
 
-	if (tb[NFTA_HASH_SREG] == NULL ||
-	    tb[NFTA_HASH_KLEN] == NULL ||
-	    tb[NFTA_HASH_ELEMENTS] == NULL)
-		return -EINVAL;
-
-	if (tb[NFTA_HASH_FLAGS] != NULL) {
-		priv->flags = ntohl(nla_get_be32(tb[NFTA_HASH_FLAGS]));
-		if (priv->flags & ~NFT_HASH_MAP)
-			return -EINVAL;
-	}
-
-	priv->sreg = ntohl(nla_get_be32(tb[NFTA_HASH_SREG]));
-	err = nft_validate_input_register(priv->sreg);
-	if (err < 0)
-		return err;
-
-	if (tb[NFTA_HASH_DREG] != NULL) {
-		if (!(priv->flags & NFT_HASH_MAP))
-			return -EINVAL;
-		priv->dreg = ntohl(nla_get_be32(tb[NFTA_HASH_DREG]));
-		err = nft_validate_output_register(priv->dreg);
-		if (err < 0)
-			return err;
-	}
-
-	priv->klen = ntohl(nla_get_be32(tb[NFTA_HASH_KLEN]));
-	if (priv->klen == 0)
-		return -EINVAL;
-
-	cnt = 0;
-	nla_for_each_nested(nla, tb[NFTA_HASH_ELEMENTS], rem) {
-		if (nla_type(nla) != NFTA_LIST_ELEM)
-			return -EINVAL;
-		cnt++;
-	}
-
 	/* Aim for a load factor of 0.75 */
+	// FIXME: temporarily broken until we have set descriptions
+	cnt = 100;
 	cnt = cnt * 4 / 3;
 
 	priv->hash = kcalloc(cnt, sizeof(struct hlist_head), GFP_KERNEL);
@@ -259,85 +181,46 @@ static int nft_hash_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	for (i = 0; i < cnt; i++)
 		INIT_HLIST_HEAD(&priv->hash[i]);
 
-	err = -ENOMEM;
-	nla_for_each_nested(nla, tb[NFTA_HASH_ELEMENTS], rem) {
-		err = nft_hash_elem_init(ctx, expr, nla, &new);
-		if (err < 0)
-			goto err1;
-
-		h = nft_hash_data(&new->key, priv->hsize, priv->klen);
-		hlist_for_each_entry(elem, &priv->hash[h], hnode) {
-			if (nft_data_cmp(&elem->key, &new->key, priv->klen))
-				continue;
-			nft_hash_elem_destroy(expr, new);
-			err = -EEXIST;
-			goto err1;
-		}
-		hlist_add_head(&new->hnode, &priv->hash[h]);
-	}
 	return 0;
-
-err1:
-	nft_hash_destroy(ctx, expr);
-	return err;
 }
 
-static int nft_hash_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static void nft_hash_destroy(const struct nft_set *set)
 {
-	const struct nft_hash *priv = nft_expr_priv(expr);
-	const struct nft_hash_elem *elem;
-	struct nlattr *list;
+	const struct nft_hash *priv = nft_set_priv(set);
+	const struct hlist_node *next;
+	struct nft_hash_elem *elem;
 	unsigned int i;
 
-	if (priv->flags)
-		if (nla_put_be32(skb, NFTA_HASH_FLAGS, htonl(priv->flags)))
-			goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_HASH_SREG, htonl(priv->sreg)))
-		goto nla_put_failure;
-	if (priv->flags & NFT_HASH_MAP)
-		if (nla_put_be32(skb, NFTA_HASH_DREG, htonl(priv->dreg)))
-			goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_HASH_KLEN, htonl(priv->klen)))
-		goto nla_put_failure;
-
-	list = nla_nest_start(skb, NFTA_HASH_ELEMENTS);
-	if (list == NULL)
-		goto nla_put_failure;
-
 	for (i = 0; i < priv->hsize; i++) {
-		hlist_for_each_entry(elem, &priv->hash[i], hnode) {
-			if (nft_hash_elem_dump(skb, expr, elem) < 0)
-				goto nla_put_failure;
+		hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) {
+			hlist_del(&elem->hnode);
+			nft_hash_elem_destroy(set, elem);
 		}
 	}
-
-	nla_nest_end(skb, list);
-	return 0;
-
-nla_put_failure:
-	return -1;
+	kfree(priv->hash);
 }
 
-static struct nft_expr_ops nft_hash_ops __read_mostly = {
-	.name		= "hash",
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_hash)),
-	.owner		= THIS_MODULE,
-	.eval		= nft_hash_eval,
+static struct nft_set_ops nft_hash_ops __read_mostly = {
+	.privsize       = nft_hash_privsize,
 	.init		= nft_hash_init,
 	.destroy	= nft_hash_destroy,
-	.dump		= nft_hash_dump,
-	.policy		= nft_hash_policy,
-	.maxattr	= NFTA_HASH_MAX,
+	.get		= nft_hash_get,
+	.insert		= nft_hash_insert,
+	.remove		= nft_hash_remove,
+	.lookup		= nft_hash_lookup,
+	.walk		= nft_hash_walk,
+	.features	= NFT_SET_MAP,
+	.owner		= THIS_MODULE,
 };
 
 static int __init nft_hash_module_init(void)
 {
-	return nft_register_expr(&nft_hash_ops);
+	return nft_register_set(&nft_hash_ops);
 }
 
 static void __exit nft_hash_module_exit(void)
 {
-	nft_unregister_expr(&nft_hash_ops);
+	nft_unregister_set(&nft_hash_ops);
 }
 
 module_init(nft_hash_module_init);
@@ -345,4 +228,4 @@ module_exit(nft_hash_module_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("hash");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 3bf42c3cc49a..78334bf37007 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -90,6 +90,16 @@ nla_put_failure:
 	return -1;
 }
 
+static const struct nft_data *nft_immediate_get_verdict(const struct nft_expr *expr)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+	if (priv->dreg == NFT_REG_VERDICT)
+		return &priv->data;
+	else
+		return NULL;
+}
+
 static struct nft_expr_ops nft_imm_ops __read_mostly = {
 	.name		= "immediate",
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -98,6 +108,7 @@ static struct nft_expr_ops nft_imm_ops __read_mostly = {
 	.init		= nft_immediate_init,
 	.destroy	= nft_immediate_destroy,
 	.dump		= nft_immediate_dump,
+	.get_verdict	= nft_immediate_get_verdict,
 	.policy		= nft_immediate_policy,
 	.maxattr	= NFTA_IMMEDIATE_MAX,
 };
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
new file mode 100644
index 000000000000..4962d2173678
--- /dev/null
+++ b/net/netfilter/nft_lookup.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_lookup {
+	struct nft_set			*set;
+	enum nft_registers		sreg:8;
+	enum nft_registers		dreg:8;
+	struct nft_set_binding		binding;
+};
+
+static void nft_lookup_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_lookup *priv = nft_expr_priv(expr);
+	const struct nft_set *set = priv->set;
+
+	if (set->ops->lookup(set, &data[priv->sreg], &data[priv->dreg]))
+		return;
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
+	[NFTA_LOOKUP_SET]	= { .type = NLA_STRING },
+	[NFTA_LOOKUP_SREG]	= { .type = NLA_U32 },
+	[NFTA_LOOKUP_DREG]	= { .type = NLA_U32 },
+};
+
+static int nft_lookup_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_lookup *priv = nft_expr_priv(expr);
+	struct nft_set *set;
+	int err;
+
+	if (tb[NFTA_LOOKUP_SET] == NULL ||
+	    tb[NFTA_LOOKUP_SREG] == NULL)
+		return -EINVAL;
+
+	set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_LOOKUP_DREG] != NULL) {
+		if (!(set->flags & NFT_SET_MAP))
+			return -EINVAL;
+
+		priv->dreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_DREG]));
+		err = nft_validate_output_register(priv->dreg);
+		if (err < 0)
+			return err;
+
+		if (priv->dreg == NFT_REG_VERDICT) {
+			if (set->dtype != NFT_DATA_VERDICT)
+				return -EINVAL;
+		} else if (set->dtype == NFT_DATA_VERDICT)
+			return -EINVAL;
+	} else if (set->flags & NFT_SET_MAP)
+		return -EINVAL;
+
+	err = nf_tables_bind_set(ctx, set, &priv->binding);
+	if (err < 0)
+		return err;
+
+	priv->set = set;
+	return 0;
+}
+
+static void nft_lookup_destroy(const struct nft_expr *expr)
+{
+	struct nft_lookup *priv = nft_expr_priv(expr);
+
+	nf_tables_unbind_set(NULL, priv->set, &priv->binding);
+}
+
+static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_lookup *priv = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_LOOKUP_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (priv->set->flags & NFT_SET_MAP)
+		if (nla_put_be32(skb, NFTA_LOOKUP_DREG, htonl(priv->dreg)))
+			goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_lookup_ops __read_mostly = {
+	.name		= "lookup",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_lookup_eval,
+	.init		= nft_lookup_init,
+	.destroy	= nft_lookup_destroy,
+	.dump		= nft_lookup_dump,
+	.policy		= nft_lookup_policy,
+	.maxattr	= NFTA_LOOKUP_MAX,
+};
+
+int __init nft_lookup_module_init(void)
+{
+	return nft_register_expr(&nft_lookup_ops);
+}
+
+void nft_lookup_module_exit(void)
+{
+	nft_unregister_expr(&nft_lookup_ops);
+}
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
new file mode 100644
index 000000000000..ca0c1b231bfe
--- /dev/null
+++ b/net/netfilter/nft_rbtree.c
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_rbtree {
+	struct rb_root		root;
+};
+
+struct nft_rbtree_elem {
+	struct rb_node		node;
+	u16			flags;
+	struct nft_data		key;
+	struct nft_data		data[];
+};
+
+static bool nft_rbtree_lookup(const struct nft_set *set,
+			      const struct nft_data *key,
+			      struct nft_data *data)
+{
+	const struct nft_rbtree *priv = nft_set_priv(set);
+	const struct nft_rbtree_elem *rbe, *interval = NULL;
+	const struct rb_node *parent = priv->root.rb_node;
+	int d;
+
+	while (parent != NULL) {
+		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+
+		d = nft_data_cmp(&rbe->key, key, set->klen);
+		if (d < 0) {
+			parent = parent->rb_left;
+			interval = rbe;
+		} else if (d > 0)
+			parent = parent->rb_right;
+		else {
+found:
+			if (rbe->flags & NFT_SET_ELEM_INTERVAL_END)
+				goto out;
+			if (set->flags & NFT_SET_MAP)
+				nft_data_copy(data, rbe->data);
+			return true;
+		}
+	}
+
+	if (set->flags & NFT_SET_INTERVAL && interval != NULL) {
+		rbe = interval;
+		goto found;
+	}
+out:
+	return false;
+}
+
+static void nft_rbtree_elem_destroy(const struct nft_set *set,
+				    struct nft_rbtree_elem *rbe)
+{
+	nft_data_uninit(&rbe->key, NFT_DATA_VALUE);
+	if (set->flags & NFT_SET_MAP)
+		nft_data_uninit(rbe->data, set->dtype);
+	kfree(rbe);
+}
+
+static int __nft_rbtree_insert(const struct nft_set *set,
+			       struct nft_rbtree_elem *new)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+	struct nft_rbtree_elem *rbe;
+	struct rb_node *parent, **p;
+	int d;
+
+	parent = NULL;
+	p = &priv->root.rb_node;
+	while (*p != NULL) {
+		parent = *p;
+		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+		d = nft_data_cmp(&rbe->key, &new->key, set->klen);
+		if (d < 0)
+			p = &parent->rb_left;
+		else if (d > 0)
+			p = &parent->rb_right;
+		else
+			return -EEXIST;
+	}
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, &priv->root);
+	return 0;
+}
+
+static int nft_rbtree_insert(const struct nft_set *set,
+			     const struct nft_set_elem *elem)
+{
+	struct nft_rbtree_elem *rbe;
+	unsigned int size;
+	int err;
+
+	size = sizeof(*rbe);
+	if (set->flags & NFT_SET_MAP)
+		size += sizeof(rbe->data[0]);
+
+	rbe = kzalloc(size, GFP_KERNEL);
+	if (rbe == NULL)
+		return -ENOMEM;
+
+	rbe->flags = elem->flags;
+	nft_data_copy(&rbe->key, &elem->key);
+	if (set->flags & NFT_SET_MAP)
+		nft_data_copy(rbe->data, &elem->data);
+
+	err = __nft_rbtree_insert(set, rbe);
+	if (err < 0)
+		kfree(rbe);
+	return err;
+}
+
+static void nft_rbtree_remove(const struct nft_set *set,
+			      const struct nft_set_elem *elem)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+	struct nft_rbtree_elem *rbe = elem->cookie;
+
+	rb_erase(&rbe->node, &priv->root);
+	kfree(rbe);
+}
+
+static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem)
+{
+	const struct nft_rbtree *priv = nft_set_priv(set);
+	const struct rb_node *parent = priv->root.rb_node;
+	struct nft_rbtree_elem *rbe;
+	int d;
+
+	while (parent != NULL) {
+		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+
+		d = nft_data_cmp(&rbe->key, &elem->key, set->klen);
+		if (d < 0)
+			parent = parent->rb_left;
+		else if (d > 0)
+			parent = parent->rb_right;
+		else {
+			elem->cookie = rbe;
+			if (set->flags & NFT_SET_MAP)
+				nft_data_copy(&elem->data, rbe->data);
+			elem->flags = rbe->flags;
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+			    const struct nft_set *set,
+			    struct nft_set_iter *iter)
+{
+	const struct nft_rbtree *priv = nft_set_priv(set);
+	const struct nft_rbtree_elem *rbe;
+	struct nft_set_elem elem;
+	struct rb_node *node;
+
+	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+		if (iter->count < iter->skip)
+			goto cont;
+
+		rbe = rb_entry(node, struct nft_rbtree_elem, node);
+		nft_data_copy(&elem.key, &rbe->key);
+		if (set->flags & NFT_SET_MAP)
+			nft_data_copy(&elem.data, rbe->data);
+		elem.flags = rbe->flags;
+
+		iter->err = iter->fn(ctx, set, iter, &elem);
+		if (iter->err < 0)
+			return;
+cont:
+		iter->count++;
+	}
+}
+
+static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[])
+{
+	return sizeof(struct nft_rbtree);
+}
+
+static int nft_rbtree_init(const struct nft_set *set,
+			   const struct nlattr * const nla[])
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+
+	priv->root = RB_ROOT;
+	return 0;
+}
+
+static void nft_rbtree_destroy(const struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+	struct nft_rbtree_elem *rbe;
+	struct rb_node *node;
+
+	while ((node = priv->root.rb_node) != NULL) {
+		rb_erase(node, &priv->root);
+		rbe = rb_entry(node, struct nft_rbtree_elem, node);
+		nft_rbtree_elem_destroy(set, rbe);
+	}
+}
+
+static struct nft_set_ops nft_rbtree_ops __read_mostly = {
+	.privsize	= nft_rbtree_privsize,
+	.init		= nft_rbtree_init,
+	.destroy	= nft_rbtree_destroy,
+	.insert		= nft_rbtree_insert,
+	.remove		= nft_rbtree_remove,
+	.get		= nft_rbtree_get,
+	.lookup		= nft_rbtree_lookup,
+	.walk		= nft_rbtree_walk,
+	.features	= NFT_SET_INTERVAL | NFT_SET_MAP,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_rbtree_module_init(void)
+{
+	return nft_register_set(&nft_rbtree_ops);
+}
+
+static void __exit nft_rbtree_module_exit(void)
+{
+	nft_unregister_set(&nft_rbtree_ops);
+}
+
+module_init(nft_rbtree_module_init);
+module_exit(nft_rbtree_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set.c b/net/netfilter/nft_set.c
deleted file mode 100644
index 7b7c8354c327..000000000000
--- a/net/netfilter/nft_set.c
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/rbtree.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-
-struct nft_set {
-	struct rb_root		root;
-	enum nft_registers	sreg:8;
-	enum nft_registers	dreg:8;
-	u8			klen;
-	u8			dlen;
-	u16			flags;
-};
-
-struct nft_set_elem {
-	struct rb_node		node;
-	enum nft_set_elem_flags	flags;
-	struct nft_data		key;
-	struct nft_data		data[];
-};
-
-static void nft_set_eval(const struct nft_expr *expr,
-			 struct nft_data data[NFT_REG_MAX + 1],
-			 const struct nft_pktinfo *pkt)
-{
-	const struct nft_set *priv = nft_expr_priv(expr);
-	const struct rb_node *parent = priv->root.rb_node;
-	const struct nft_set_elem *elem, *interval = NULL;
-	const struct nft_data *key = &data[priv->sreg];
-	int d;
-
-	while (parent != NULL) {
-		elem = rb_entry(parent, struct nft_set_elem, node);
-
-		d = nft_data_cmp(&elem->key, key, priv->klen);
-		if (d < 0) {
-			parent = parent->rb_left;
-			interval = elem;
-		} else if (d > 0)
-			parent = parent->rb_right;
-		else {
-found:
-			if (elem->flags & NFT_SE_INTERVAL_END)
-				goto out;
-			if (priv->flags & NFT_SET_MAP)
-				nft_data_copy(&data[priv->dreg], elem->data);
-			return;
-		}
-	}
-
-	if (priv->flags & NFT_SET_INTERVAL && interval != NULL) {
-		elem = interval;
-		goto found;
-	}
-out:
-	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
-}
-
-static void nft_set_elem_destroy(const struct nft_expr *expr,
-				 struct nft_set_elem *elem)
-{
-	const struct nft_set *priv = nft_expr_priv(expr);
-
-	nft_data_uninit(&elem->key, NFT_DATA_VALUE);
-	if (priv->flags & NFT_SET_MAP)
-		nft_data_uninit(elem->data, nft_dreg_to_type(priv->dreg));
-	kfree(elem);
-}
-
-static const struct nla_policy nft_se_policy[NFTA_SE_MAX + 1] = {
-	[NFTA_SE_KEY]		= { .type = NLA_NESTED },
-	[NFTA_SE_DATA]		= { .type = NLA_NESTED },
-	[NFTA_SE_FLAGS]		= { .type = NLA_U32 },
-};
-
-static int nft_set_elem_init(const struct nft_ctx *ctx,
-			     const struct nft_expr *expr,
-			     const struct nlattr *nla,
-			     struct nft_set_elem **new)
-{
-	struct nft_set *priv = nft_expr_priv(expr);
-	struct nlattr *tb[NFTA_SE_MAX + 1];
-	struct nft_set_elem *elem;
-	struct nft_data_desc d1, d2;
-	enum nft_set_elem_flags flags = 0;
-	unsigned int size;
-	int err;
-
-	err = nla_parse_nested(tb, NFTA_SE_MAX, nla, nft_se_policy);
-	if (err < 0)
-		return err;
-
-	if (tb[NFTA_SE_KEY] == NULL)
-		return -EINVAL;
-
-	if (tb[NFTA_SE_FLAGS] != NULL) {
-		flags = ntohl(nla_get_be32(tb[NFTA_SE_FLAGS]));
-		if (flags & ~NFT_SE_INTERVAL_END)
-			return -EINVAL;
-	}
-
-	size = sizeof(*elem);
-	if (priv->flags & NFT_SET_MAP) {
-		if (tb[NFTA_SE_DATA] == NULL && !(flags & NFT_SE_INTERVAL_END))
-			return -EINVAL;
-		size += sizeof(elem->data[0]);
-	} else {
-		if (tb[NFTA_SE_DATA] != NULL)
-			return -EINVAL;
-	}
-
-	elem = kzalloc(size, GFP_KERNEL);
-	if (elem == NULL)
-		return -ENOMEM;
-	elem->flags = flags;
-
-	err = nft_data_init(ctx, &elem->key, &d1, tb[NFTA_SE_KEY]);
-	if (err < 0)
-		goto err1;
-	err = -EINVAL;
-	if (d1.type != NFT_DATA_VALUE || d1.len != priv->klen)
-		goto err2;
-
-	if (tb[NFTA_SE_DATA] != NULL) {
-		err = nft_data_init(ctx, elem->data, &d2, tb[NFTA_SE_DATA]);
-		if (err < 0)
-			goto err2;
-		err = -EINVAL;
-		if (priv->dreg != NFT_REG_VERDICT && d2.len != priv->dlen)
-			goto err2;
-		err = nft_validate_data_load(ctx, priv->dreg, elem->data, d2.type);
-		if (err < 0)
-			goto err3;
-	}
-
-	*new = elem;
-	return 0;
-
-err3:
-	nft_data_uninit(elem->data, d2.type);
-err2:
-	nft_data_uninit(&elem->key, d1.type);
-err1:
-	kfree(elem);
-	return err;
-}
-
-static int nft_set_elem_dump(struct sk_buff *skb, const struct nft_expr *expr,
-			     const struct nft_set_elem *elem)
-
-{
-	const struct nft_set *priv = nft_expr_priv(expr);
-	struct nlattr *nest;
-
-	nest = nla_nest_start(skb, NFTA_LIST_ELEM);
-	if (nest == NULL)
-		goto nla_put_failure;
-
-	if (nft_data_dump(skb, NFTA_SE_KEY, &elem->key,
-			  NFT_DATA_VALUE, priv->klen) < 0)
-		goto nla_put_failure;
-
-	if (priv->flags & NFT_SET_MAP && !(elem->flags & NFT_SE_INTERVAL_END)) {
-		if (nft_data_dump(skb, NFTA_SE_DATA, elem->data,
-				  nft_dreg_to_type(priv->dreg), priv->dlen) < 0)
-			goto nla_put_failure;
-	}
-
-	if (elem->flags){
-		if (nla_put_be32(skb, NFTA_SE_FLAGS, htonl(elem->flags)))
-			goto nla_put_failure;
-	}
-
-	nla_nest_end(skb, nest);
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
-static void nft_set_destroy(const struct nft_expr *expr)
-{
-	struct nft_set *priv = nft_expr_priv(expr);
-	struct nft_set_elem *elem;
-	struct rb_node *node;
-
-	while ((node = priv->root.rb_node) != NULL) {
-		rb_erase(node, &priv->root);
-		elem = rb_entry(node, struct nft_set_elem, node);
-		nft_set_elem_destroy(expr, elem);
-	}
-}
-
-static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
-	[NFTA_SET_FLAGS]	= { .type = NLA_U32 },
-	[NFTA_SET_SREG]		= { .type = NLA_U32 },
-	[NFTA_SET_DREG]		= { .type = NLA_U32 },
-	[NFTA_SET_KLEN]		= { .type = NLA_U32 },
-	[NFTA_SET_DLEN]		= { .type = NLA_U32 },
-	[NFTA_SET_ELEMENTS]	= { .type = NLA_NESTED },
-};
-
-static int nft_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
-			const struct nlattr * const tb[])
-{
-	struct nft_set *priv = nft_expr_priv(expr);
-	struct nft_set_elem *elem, *uninitialized_var(new);
-	struct rb_node *parent, **p;
-	const struct nlattr *nla;
-	int err, rem, d;
-
-	if (tb[NFTA_SET_SREG] == NULL ||
-	    tb[NFTA_SET_KLEN] == NULL ||
-	    tb[NFTA_SET_ELEMENTS] == NULL)
-		return -EINVAL;
-
-	priv->root = RB_ROOT;
-
-	if (tb[NFTA_SET_FLAGS] != NULL) {
-		priv->flags = ntohl(nla_get_be32(tb[NFTA_SET_FLAGS]));
-		if (priv->flags & ~(NFT_SET_INTERVAL | NFT_SET_MAP))
-			return -EINVAL;
-	}
-
-	priv->sreg = ntohl(nla_get_be32(tb[NFTA_SET_SREG]));
-	err = nft_validate_input_register(priv->sreg);
-	if (err < 0)
-		return err;
-
-	if (tb[NFTA_SET_DREG] != NULL) {
-		if (!(priv->flags & NFT_SET_MAP))
-			return -EINVAL;
-		if (tb[NFTA_SET_DLEN] == NULL)
-			return -EINVAL;
-
-		priv->dreg = ntohl(nla_get_be32(tb[NFTA_SET_DREG]));
-		err = nft_validate_output_register(priv->dreg);
-		if (err < 0)
-			return err;
-
-		if (priv->dreg == NFT_REG_VERDICT)
-			priv->dlen = FIELD_SIZEOF(struct nft_data, data);
-		else {
-			priv->dlen = ntohl(nla_get_be32(tb[NFTA_SET_DLEN]));
-			if (priv->dlen == 0 ||
-			    priv->dlen > FIELD_SIZEOF(struct nft_data, data))
-				return -EINVAL;
-		}
-	} else {
-		if (priv->flags & NFT_SET_MAP)
-			return -EINVAL;
-		if (tb[NFTA_SET_DLEN] != NULL)
-			return -EINVAL;
-	}
-
-	priv->klen = ntohl(nla_get_be32(tb[NFTA_SET_KLEN]));
-	if (priv->klen == 0 ||
-	    priv->klen > FIELD_SIZEOF(struct nft_data, data))
-		return -EINVAL;
-
-	nla_for_each_nested(nla, tb[NFTA_SET_ELEMENTS], rem) {
-		err = -EINVAL;
-		if (nla_type(nla) != NFTA_LIST_ELEM)
-			goto err1;
-
-		err = nft_set_elem_init(ctx, expr, nla, &new);
-		if (err < 0)
-			goto err1;
-
-		parent = NULL;
-		p = &priv->root.rb_node;
-		while (*p != NULL) {
-			parent = *p;
-			elem = rb_entry(parent, struct nft_set_elem, node);
-			d = nft_data_cmp(&elem->key, &new->key, priv->klen);
-			if (d < 0)
-				p = &parent->rb_left;
-			else if (d > 0)
-				p = &parent->rb_right;
-			else {
-				err = -EEXIST;
-				goto err2;
-			}
-		}
-		rb_link_node(&new->node, parent, p);
-		rb_insert_color(&new->node, &priv->root);
-	}
-
-	return 0;
-
-err2:
-	nft_set_elem_destroy(expr, new);
-err1:
-	nft_set_destroy(expr);
-	return err;
-}
-
-static int nft_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
-{
-	struct nft_set *priv = nft_expr_priv(expr);
-	const struct nft_set_elem *elem;
-	struct rb_node *node;
-	struct nlattr *list;
-
-	if (priv->flags) {
-		if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(priv->flags)))
-			goto nla_put_failure;
-	}
-
-	if (nla_put_be32(skb, NFTA_SET_SREG, htonl(priv->sreg)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_SET_KLEN, htonl(priv->klen)))
-		goto nla_put_failure;
-
-	if (priv->flags & NFT_SET_MAP) {
-		if (nla_put_be32(skb, NFTA_SET_DREG, htonl(priv->dreg)))
-			goto nla_put_failure;
-		if (nla_put_be32(skb, NFTA_SET_DLEN, htonl(priv->dlen)))
-			goto nla_put_failure;
-	}
-
-	list = nla_nest_start(skb, NFTA_SET_ELEMENTS);
-	if (list == NULL)
-		goto nla_put_failure;
-
-	for (node = rb_first(&priv->root); node; node = rb_next(node)) {
-		elem = rb_entry(node, struct nft_set_elem, node);
-		if (nft_set_elem_dump(skb, expr, elem) < 0)
-			goto nla_put_failure;
-	}
-
-	nla_nest_end(skb, list);
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
-static struct nft_expr_ops nft_set_ops __read_mostly = {
-	.name		= "set",
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_set)),
-	.owner		= THIS_MODULE,
-	.eval		= nft_set_eval,
-	.init		= nft_set_init,
-	.destroy	= nft_set_destroy,
-	.dump		= nft_set_dump,
-	.policy		= nft_set_policy,
-	.maxattr	= NFTA_SET_MAX,
-};
-
-static int __init nft_set_module_init(void)
-{
-	return nft_register_expr(&nft_set_ops);
-}
-
-static void __exit nft_set_module_exit(void)
-{
-	nft_unregister_expr(&nft_set_ops);
-}
-
-module_init(nft_set_module_init);
-module_exit(nft_set_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("set");
-- 
cgit v1.2.3


From 9370761c56b66aa5c65e069a7b010111a025018d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 10 Oct 2013 23:21:26 +0200
Subject: netfilter: nf_tables: convert built-in tables/chains to chain types

This patch converts built-in tables/chains to chain types that
allows you to deploy customized table and chain configurations from
userspace.

After this patch, you have to specify the chain type when
creating a new chain:

 add chain ip filter output { type filter hook input priority 0; }
                              ^^^^ ------

The existing chain types after this patch are: filter, route and
nat. Note that tables are just containers of chains with no specific
semantics, which is a significant change with regards to iptables.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h         |  31 ++-
 include/uapi/linux/netfilter/nf_tables.h  |   2 +
 net/ipv4/netfilter/Kconfig                |   8 +-
 net/ipv4/netfilter/Makefile               |   4 +-
 net/ipv4/netfilter/nf_table_nat_ipv4.c    | 415 ------------------------------
 net/ipv4/netfilter/nf_table_route_ipv4.c  |  97 -------
 net/ipv4/netfilter/nf_tables_ipv4.c       |  21 ++
 net/ipv4/netfilter/nft_chain_nat_ipv4.c   | 353 +++++++++++++++++++++++++
 net/ipv4/netfilter/nft_chain_route_ipv4.c |  86 +++++++
 net/ipv6/netfilter/Kconfig                |   4 +-
 net/ipv6/netfilter/Makefile               |   2 +-
 net/ipv6/netfilter/nf_table_route_ipv6.c  |  93 -------
 net/ipv6/netfilter/nf_tables_ipv6.c       |  22 +-
 net/ipv6/netfilter/nft_chain_route_ipv6.c |  82 ++++++
 net/netfilter/nf_tables_api.c             | 197 +++++++-------
 15 files changed, 682 insertions(+), 735 deletions(-)
 delete mode 100644 net/ipv4/netfilter/nf_table_nat_ipv4.c
 delete mode 100644 net/ipv4/netfilter/nf_table_route_ipv4.c
 create mode 100644 net/ipv4/netfilter/nft_chain_nat_ipv4.c
 create mode 100644 net/ipv4/netfilter/nft_chain_route_ipv4.c
 delete mode 100644 net/ipv6/netfilter/nf_table_route_ipv6.c
 create mode 100644 net/ipv6/netfilter/nft_chain_route_ipv6.c

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 66d0359702c6..8403f7f52e81 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -336,7 +336,6 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
 
 enum nft_chain_flags {
 	NFT_BASE_CHAIN			= 0x1,
-	NFT_CHAIN_BUILTIN		= 0x2,
 };
 
 /**
@@ -362,14 +361,23 @@ struct nft_chain {
 	char				name[NFT_CHAIN_MAXNAMELEN];
 };
 
+enum nft_chain_type {
+	NFT_CHAIN_T_DEFAULT = 0,
+	NFT_CHAIN_T_ROUTE,
+	NFT_CHAIN_T_NAT,
+	NFT_CHAIN_T_MAX
+};
+
 /**
  *	struct nft_base_chain - nf_tables base chain
  *
  *	@ops: netfilter hook ops
+ *	@type: chain type
  *	@chain: the chain
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops;
+	enum nft_chain_type		type;
 	struct nft_chain		chain;
 };
 
@@ -384,10 +392,6 @@ extern unsigned int nft_do_chain(const struct nf_hook_ops *ops,
 				 const struct net_device *out,
 				 int (*okfn)(struct sk_buff *));
 
-enum nft_table_flags {
-	NFT_TABLE_BUILTIN		= 0x1,
-};
-
 /**
  *	struct nft_table - nf_tables table
  *
@@ -431,8 +435,17 @@ struct nft_af_info {
 extern int nft_register_afinfo(struct nft_af_info *);
 extern void nft_unregister_afinfo(struct nft_af_info *);
 
-extern int nft_register_table(struct nft_table *, int family);
-extern void nft_unregister_table(struct nft_table *, int family);
+struct nf_chain_type {
+	unsigned int		hook_mask;
+	const char		*name;
+	enum nft_chain_type	type;
+	nf_hookfn		*fn[NF_MAX_HOOKS];
+	struct module		*me;
+	int			family;
+};
+
+extern int nft_register_chain_type(struct nf_chain_type *);
+extern void nft_unregister_chain_type(struct nf_chain_type *);
 
 extern int nft_register_expr(struct nft_expr_type *);
 extern void nft_unregister_expr(struct nft_expr_type *);
@@ -440,8 +453,8 @@ extern void nft_unregister_expr(struct nft_expr_type *);
 #define MODULE_ALIAS_NFT_FAMILY(family)	\
 	MODULE_ALIAS("nft-afinfo-" __stringify(family))
 
-#define MODULE_ALIAS_NFT_TABLE(family, name) \
-	MODULE_ALIAS("nft-table-" __stringify(family) "-" name)
+#define MODULE_ALIAS_NFT_CHAIN(family, name) \
+	MODULE_ALIAS("nft-chain-" __stringify(family) "-" name)
 
 #define MODULE_ALIAS_NFT_EXPR(name) \
 	MODULE_ALIAS("nft-expr-" name)
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 9e924014efe3..779cf951c8de 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -115,6 +115,7 @@ enum nft_table_attributes {
  * @NFTA_CHAIN_HANDLE: numeric handle of the chain (NLA_U64)
  * @NFTA_CHAIN_NAME: name of the chain (NLA_STRING)
  * @NFTA_CHAIN_HOOK: hook specification for basechains (NLA_NESTED: nft_hook_attributes)
+ * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING)
  */
 enum nft_chain_attributes {
 	NFTA_CHAIN_UNSPEC,
@@ -122,6 +123,7 @@ enum nft_chain_attributes {
 	NFTA_CHAIN_HANDLE,
 	NFTA_CHAIN_NAME,
 	NFTA_CHAIN_HOOK,
+	NFTA_CHAIN_TYPE,
 	__NFTA_CHAIN_MAX
 };
 #define NFTA_CHAIN_MAX		(__NFTA_CHAIN_MAX - 1)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index eb1d56ece361..ae65fe98bfbe 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -44,13 +44,13 @@ config NFT_REJECT_IPV4
 	depends on NF_TABLES_IPV4
 	tristate "nf_tables IPv4 reject support"
 
-config NF_TABLE_ROUTE_IPV4
+config NFT_CHAIN_ROUTE_IPV4
 	depends on NF_TABLES_IPV4
-	tristate "IPv4 nf_tables route table support"
+	tristate "IPv4 nf_tables route chain support"
 
-config NF_TABLE_NAT_IPV4
+config NFT_CHAIN_NAT_IPV4
 	depends on NF_TABLES_IPV4
-	tristate "IPv4 nf_tables nat table support"
+	tristate "IPv4 nf_tables nat chain support"
 
 config IP_NF_IPTABLES
 	tristate "IP tables support (required for filtering/masq/NAT)"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index b2f01cd2cd65..91e0bd71a6d3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -29,8 +29,8 @@ obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
 
 obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
 obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
-obj-$(CONFIG_NF_TABLE_ROUTE_IPV4) += nf_table_route_ipv4.o
-obj-$(CONFIG_NF_TABLE_NAT_IPV4) += nf_table_nat_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/nf_table_nat_ipv4.c b/net/ipv4/netfilter/nf_table_nat_ipv4.c
deleted file mode 100644
index 2ecce39077a3..000000000000
--- a/net/ipv4/netfilter/nf_table_nat_ipv4.c
+++ /dev/null
@@ -1,415 +0,0 @@
-/*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/ip.h>
-
-struct nft_nat {
-	enum nft_registers	sreg_addr_min:8;
-	enum nft_registers	sreg_addr_max:8;
-	enum nft_registers	sreg_proto_min:8;
-	enum nft_registers	sreg_proto_max:8;
-	enum nf_nat_manip_type	type;
-};
-
-static void nft_nat_eval(const struct nft_expr *expr,
-			 struct nft_data data[NFT_REG_MAX + 1],
-			 const struct nft_pktinfo *pkt)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
-	struct nf_nat_range range;
-
-	memset(&range, 0, sizeof(range));
-	if (priv->sreg_addr_min) {
-		range.min_addr.ip = data[priv->sreg_addr_min].data[0];
-		range.max_addr.ip = data[priv->sreg_addr_max].data[0];
-		range.flags |= NF_NAT_RANGE_MAP_IPS;
-	}
-
-	if (priv->sreg_proto_min) {
-		range.min_proto.all = data[priv->sreg_proto_min].data[0];
-		range.max_proto.all = data[priv->sreg_proto_max].data[0];
-		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-	}
-
-	data[NFT_REG_VERDICT].verdict =
-		nf_nat_setup_info(ct, &range, priv->type);
-}
-
-static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
-	[NFTA_NAT_ADDR_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_ADDR_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_TYPE]		= { .type = NLA_U32 },
-};
-
-static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
-			const struct nlattr * const tb[])
-{
-	struct nft_nat *priv = nft_expr_priv(expr);
-	int err;
-
-	if (tb[NFTA_NAT_TYPE] == NULL)
-		return -EINVAL;
-
-	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
-	case NFT_NAT_SNAT:
-		priv->type = NF_NAT_MANIP_SRC;
-		break;
-	case NFT_NAT_DNAT:
-		priv->type = NF_NAT_MANIP_DST;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MIN]) {
-		priv->sreg_addr_min = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MIN]));
-		err = nft_validate_input_register(priv->sreg_addr_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MAX]) {
-		priv->sreg_addr_max = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MAX]));
-		err = nft_validate_input_register(priv->sreg_addr_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_addr_max = priv->sreg_addr_min;
-
-	if (tb[NFTA_NAT_PROTO_MIN]) {
-		priv->sreg_proto_min = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MIN]));
-		err = nft_validate_input_register(priv->sreg_proto_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_PROTO_MAX]) {
-		priv->sreg_proto_max = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MAX]));
-		err = nft_validate_input_register(priv->sreg_proto_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_proto_max = priv->sreg_proto_min;
-
-	return 0;
-}
-
-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-
-	switch (priv->type) {
-	case NF_NAT_MANIP_SRC:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
-			goto nla_put_failure;
-		break;
-	case NF_NAT_MANIP_DST:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
-			goto nla_put_failure;
-		break;
-	}
-
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MIN, htonl(priv->sreg_addr_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MAX, htonl(priv->sreg_addr_max)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MIN, htonl(priv->sreg_proto_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MAX, htonl(priv->sreg_proto_max)))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
-static struct nft_expr_type nft_nat_type;
-static const struct nft_expr_ops nft_nat_ops = {
-	.type		= &nft_nat_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
-	.eval		= nft_nat_eval,
-	.init		= nft_nat_init,
-	.dump		= nft_nat_dump,
-};
-
-static struct nft_expr_type nft_nat_type __read_mostly = {
-	.name		= "nat",
-	.ops		= &nft_nat_ops,
-	.policy		= nft_nat_policy,
-	.maxattr	= NFTA_NAT_MAX,
-	.owner		= THIS_MODULE,
-};
-
-/*
- * NAT table
- */
-
-static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
-			      struct sk_buff *skb,
-			      const struct net_device *in,
-			      const struct net_device *out,
-			      int (*okfn)(struct sk_buff *))
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	struct nf_conn_nat *nat;
-	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
-	unsigned int ret;
-
-	if (ct == NULL || nf_ct_is_untracked(ct))
-		return NF_ACCEPT;
-
-	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
-
-	nat = nfct_nat(ct);
-	if (nat == NULL) {
-		/* Conntrack module was loaded late, can't add extension. */
-		if (nf_ct_is_confirmed(ct))
-			return NF_ACCEPT;
-		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
-		if (nat == NULL)
-			return NF_ACCEPT;
-	}
-
-	switch (ctinfo) {
-	case IP_CT_RELATED:
-	case IP_CT_RELATED + IP_CT_IS_REPLY:
-		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
-			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
-							   ops->hooknum))
-				return NF_DROP;
-			else
-				return NF_ACCEPT;
-		}
-		/* Fall through */
-	case IP_CT_NEW:
-		if (nf_nat_initialized(ct, maniptype))
-			break;
-
-		ret = nft_do_chain(ops, skb, in, out, okfn);
-		if (ret != NF_ACCEPT)
-			return ret;
-		if (!nf_nat_initialized(ct, maniptype)) {
-			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
-			if (ret != NF_ACCEPT)
-				return ret;
-		}
-	default:
-		break;
-	}
-
-	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
-}
-
-static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
-				      struct sk_buff *skb,
-				      const struct net_device *in,
-				      const struct net_device *out,
-				      int (*okfn)(struct sk_buff *))
-{
-	__be32 daddr = ip_hdr(skb)->daddr;
-	unsigned int ret;
-
-	ret = nf_nat_fn(ops, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    ip_hdr(skb)->daddr != daddr) {
-		skb_dst_drop(skb);
-	}
-	return ret;
-}
-
-static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
-				       struct sk_buff *skb,
-				       const struct net_device *in,
-				       const struct net_device *out,
-				       int (*okfn)(struct sk_buff *))
-{
-	enum ip_conntrack_info ctinfo __maybe_unused;
-	const struct nf_conn *ct __maybe_unused;
-	unsigned int ret;
-
-	ret = nf_nat_fn(ops, skb, in, out, okfn);
-#ifdef CONFIG_XFRM
-	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
-		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
-		if (ct->tuplehash[dir].tuple.src.u3.ip !=
-		    ct->tuplehash[!dir].tuple.dst.u3.ip ||
-		    ct->tuplehash[dir].tuple.src.u.all !=
-		    ct->tuplehash[!dir].tuple.dst.u.all)
-			return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
-								ret : NF_DROP;
-	}
-#endif
-	return ret;
-}
-
-static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
-				  struct sk_buff *skb,
-				  const struct net_device *in,
-				  const struct net_device *out,
-				  int (*okfn)(struct sk_buff *))
-{
-	enum ip_conntrack_info ctinfo;
-	const struct nf_conn *ct;
-	unsigned int ret;
-
-	ret = nf_nat_fn(ops, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
-		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
-		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
-		    ct->tuplehash[!dir].tuple.src.u3.ip) {
-			if (ip_route_me_harder(skb, RTN_UNSPEC))
-				ret = NF_DROP;
-		}
-#ifdef CONFIG_XFRM
-		else if (ct->tuplehash[dir].tuple.dst.u.all !=
-			 ct->tuplehash[!dir].tuple.src.u.all)
-			if (nf_xfrm_me_harder(skb, AF_INET))
-				ret = NF_DROP;
-#endif
-	}
-	return ret;
-}
-
-static struct nft_base_chain nf_chain_nat_prerouting __read_mostly = {
-	.chain	= {
-		.name		= "PREROUTING",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_prerouting.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_prerouting,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_PRE_ROUTING,
-		.priority	= NF_IP_PRI_NAT_DST,
-		.priv		= &nf_chain_nat_prerouting.chain,
-	},
-};
-
-static struct nft_base_chain nf_chain_nat_postrouting __read_mostly = {
-	.chain	= {
-		.name		= "POSTROUTING",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_postrouting.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_postrouting,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP_PRI_NAT_SRC,
-		.priv		= &nf_chain_nat_postrouting.chain,
-	},
-};
-
-static struct nft_base_chain nf_chain_nat_output __read_mostly = {
-	.chain	= {
-		.name		= "OUTPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_output.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_output,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP_PRI_NAT_DST,
-		.priv		= &nf_chain_nat_output.chain,
-	},
-};
-
-static struct nft_base_chain nf_chain_nat_input __read_mostly = {
-	.chain	= {
-		.name		= "INPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_input.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_fn,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP_PRI_NAT_SRC,
-		.priv		= &nf_chain_nat_input.chain,
-	},
-};
-
-
-static struct nft_table nf_table_nat_ipv4 __read_mostly = {
-	.name	= "nat",
-	.chains	= LIST_HEAD_INIT(nf_table_nat_ipv4.chains),
-};
-
-static int __init nf_table_nat_init(void)
-{
-	int err;
-
-	list_add_tail(&nf_chain_nat_prerouting.chain.list,
-		      &nf_table_nat_ipv4.chains);
-	list_add_tail(&nf_chain_nat_postrouting.chain.list,
-		      &nf_table_nat_ipv4.chains);
-	list_add_tail(&nf_chain_nat_output.chain.list,
-		      &nf_table_nat_ipv4.chains);
-	list_add_tail(&nf_chain_nat_input.chain.list,
-		      &nf_table_nat_ipv4.chains);
-
-	err = nft_register_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
-	if (err < 0)
-		goto err1;
-
-	err = nft_register_expr(&nft_nat_type);
-	if (err < 0)
-		goto err2;
-
-	return 0;
-
-err2:
-	nft_unregister_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
-err1:
-	return err;
-}
-
-static void __exit nf_table_nat_exit(void)
-{
-	nft_unregister_expr(&nft_nat_type);
-	nft_unregister_table(&nf_table_nat_ipv4, AF_INET);
-}
-
-module_init(nf_table_nat_init);
-module_exit(nf_table_nat_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_TABLE(AF_INET, "nat");
-MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv4/netfilter/nf_table_route_ipv4.c b/net/ipv4/netfilter/nf_table_route_ipv4.c
deleted file mode 100644
index 4f257a1ed661..000000000000
--- a/net/ipv4/netfilter/nf_table_route_ipv4.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
-					struct sk_buff *skb,
-					const struct net_device *in,
-					const struct net_device *out,
-					int (*okfn)(struct sk_buff *))
-{
-	unsigned int ret;
-	u32 mark;
-	__be32 saddr, daddr;
-	u_int8_t tos;
-	const struct iphdr *iph;
-
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr))
-		return NF_ACCEPT;
-
-	mark = skb->mark;
-	iph = ip_hdr(skb);
-	saddr = iph->saddr;
-	daddr = iph->daddr;
-	tos = iph->tos;
-
-	ret = nft_do_chain(ops, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_QUEUE) {
-		iph = ip_hdr(skb);
-
-		if (iph->saddr != saddr ||
-		    iph->daddr != daddr ||
-		    skb->mark != mark ||
-		    iph->tos != tos)
-			if (ip_route_me_harder(skb, RTN_UNSPEC))
-				ret = NF_DROP;
-	}
-	return ret;
-}
-
-static struct nft_base_chain nf_chain_route_output __read_mostly = {
-	.chain	= {
-		.name		= "OUTPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_route_table_hook,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP_PRI_MANGLE,
-		.priv		= &nf_chain_route_output.chain,
-	},
-};
-
-static struct nft_table nf_table_route_ipv4 __read_mostly = {
-	.name	= "route",
-	.chains	= LIST_HEAD_INIT(nf_table_route_ipv4.chains),
-};
-
-static int __init nf_table_route_init(void)
-{
-	list_add_tail(&nf_chain_route_output.chain.list,
-		      &nf_table_route_ipv4.chains);
-	return nft_register_table(&nf_table_route_ipv4, NFPROTO_IPV4);
-}
-
-static void __exit nf_table_route_exit(void)
-{
-	nft_unregister_table(&nf_table_route_ipv4, NFPROTO_IPV4);
-}
-
-module_init(nf_table_route_init);
-module_exit(nf_table_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_TABLE(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 63d0a3bf53d3..23525c4c0192 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -41,14 +42,34 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	},
 };
 
+static struct nf_chain_type filter_ipv4 = {
+	.family		= NFPROTO_IPV4,
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING),
+	.fn		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain,
+		[NF_INET_FORWARD]	= nft_do_chain,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain,
+		[NF_INET_POST_ROUTING]	= nft_do_chain,
+	},
+};
+
 static int __init nf_tables_ipv4_init(void)
 {
+	nft_register_chain_type(&filter_ipv4);
 	return nft_register_afinfo(&nft_af_ipv4);
 }
 
 static void __exit nf_tables_ipv4_exit(void)
 {
 	nft_unregister_afinfo(&nft_af_ipv4);
+	nft_unregister_chain_type(&filter_ipv4);
 }
 
 module_init(nf_tables_ipv4_init);
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
new file mode 100644
index 000000000000..cd286306be85
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+struct nft_nat {
+	enum nft_registers	sreg_addr_min:8;
+	enum nft_registers	sreg_addr_max:8;
+	enum nft_registers	sreg_proto_min:8;
+	enum nft_registers	sreg_proto_max:8;
+	enum nf_nat_manip_type	type;
+};
+
+static void nft_nat_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+	struct nf_nat_range range;
+
+	memset(&range, 0, sizeof(range));
+	if (priv->sreg_addr_min) {
+		range.min_addr.ip = data[priv->sreg_addr_min].data[0];
+		range.max_addr.ip = data[priv->sreg_addr_max].data[0];
+		range.flags |= NF_NAT_RANGE_MAP_IPS;
+	}
+
+	if (priv->sreg_proto_min) {
+		range.min_proto.all = data[priv->sreg_proto_min].data[0];
+		range.max_proto.all = data[priv->sreg_proto_max].data[0];
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+	}
+
+	data[NFT_REG_VERDICT].verdict =
+		nf_nat_setup_info(ct, &range, priv->type);
+}
+
+static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
+	[NFTA_NAT_ADDR_MIN]	= { .type = NLA_U32 },
+	[NFTA_NAT_ADDR_MAX]	= { .type = NLA_U32 },
+	[NFTA_NAT_PROTO_MIN]	= { .type = NLA_U32 },
+	[NFTA_NAT_PROTO_MAX]	= { .type = NLA_U32 },
+	[NFTA_NAT_TYPE]		= { .type = NLA_U32 },
+};
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_nat *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_NAT_TYPE] == NULL)
+		return -EINVAL;
+
+	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
+	case NFT_NAT_SNAT:
+		priv->type = NF_NAT_MANIP_SRC;
+		break;
+	case NFT_NAT_DNAT:
+		priv->type = NF_NAT_MANIP_DST;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (tb[NFTA_NAT_ADDR_MIN]) {
+		priv->sreg_addr_min = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MIN]));
+		err = nft_validate_input_register(priv->sreg_addr_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_ADDR_MAX]) {
+		priv->sreg_addr_max = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MAX]));
+		err = nft_validate_input_register(priv->sreg_addr_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_addr_max = priv->sreg_addr_min;
+
+	if (tb[NFTA_NAT_PROTO_MIN]) {
+		priv->sreg_proto_min = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MIN]));
+		err = nft_validate_input_register(priv->sreg_proto_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_PROTO_MAX]) {
+		priv->sreg_proto_max = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MAX]));
+		err = nft_validate_input_register(priv->sreg_proto_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_proto_max = priv->sreg_proto_min;
+
+	return 0;
+}
+
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+
+	switch (priv->type) {
+	case NF_NAT_MANIP_SRC:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
+			goto nla_put_failure;
+		break;
+	case NF_NAT_MANIP_DST:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
+			goto nla_put_failure;
+		break;
+	}
+
+	if (nla_put_be32(skb, NFTA_NAT_ADDR_MIN, htonl(priv->sreg_addr_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_ADDR_MAX, htonl(priv->sreg_addr_max)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_PROTO_MIN, htonl(priv->sreg_proto_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_PROTO_MAX, htonl(priv->sreg_proto_max)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_nat_type;
+static const struct nft_expr_ops nft_nat_ops = {
+	.type		= &nft_nat_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+	.eval		= nft_nat_eval,
+	.init		= nft_nat_init,
+	.dump		= nft_nat_dump,
+};
+
+static struct nft_expr_type nft_nat_type __read_mostly = {
+	.name		= "nat",
+	.ops		= &nft_nat_ops,
+	.policy		= nft_nat_policy,
+	.maxattr	= NFTA_NAT_MAX,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * NAT chains
+ */
+
+static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
+			      struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_nat *nat;
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	unsigned int ret;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
+
+	nat = nfct_nat(ct);
+	if (nat == NULL) {
+		/* Conntrack module was loaded late, can't add extension. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL)
+			return NF_ACCEPT;
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED + IP_CT_IS_REPLY:
+		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   ops->hooknum))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall through */
+	case IP_CT_NEW:
+		if (nf_nat_initialized(ct, maniptype))
+			break;
+
+		ret = nft_do_chain(ops, skb, in, out, okfn);
+		if (ret != NF_ACCEPT)
+			return ret;
+		if (!nf_nat_initialized(ct, maniptype)) {
+			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+			if (ret != NF_ACCEPT)
+				return ret;
+		}
+	default:
+		break;
+	}
+
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+}
+
+static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	__be32 daddr = ip_hdr(skb)->daddr;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    ip_hdr(skb)->daddr != daddr) {
+		skb_dst_drop(skb);
+	}
+	return ret;
+}
+
+static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo __maybe_unused;
+	const struct nf_conn *ct __maybe_unused;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.src.u3.ip !=
+		    ct->tuplehash[!dir].tuple.dst.u3.ip ||
+		    ct->tuplehash[dir].tuple.src.u.all !=
+		    ct->tuplehash[!dir].tuple.dst.u.all)
+			return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
+								ret : NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+		    ct->tuplehash[!dir].tuple.src.u3.ip) {
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (nf_xfrm_me_harder(skb, AF_INET))
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+struct nf_chain_type nft_chain_nat_ipv4 = {
+	.family		= NFPROTO_IPV4,
+	.name		= "nat",
+	.type		= NFT_CHAIN_T_NAT,
+	.hook_mask	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.fn		= {
+		[NF_INET_PRE_ROUTING]	= nf_nat_prerouting,
+		[NF_INET_POST_ROUTING]	= nf_nat_postrouting,
+		[NF_INET_LOCAL_OUT]	= nf_nat_output,
+		[NF_INET_LOCAL_IN]	= nf_nat_fn,
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init nft_chain_nat_init(void)
+{
+	int err;
+
+	err = nft_register_chain_type(&nft_chain_nat_ipv4);
+	if (err < 0)
+		return err;
+
+	err = nft_register_expr(&nft_nat_type);
+	if (err < 0)
+		goto err;
+
+	return 0;
+
+err:
+	nft_unregister_chain_type(&nft_chain_nat_ipv4);
+	return err;
+}
+
+static void __exit nft_chain_nat_exit(void)
+{
+	nft_unregister_expr(&nft_nat_type);
+	nft_unregister_chain_type(&nft_chain_nat_ipv4);
+}
+
+module_init(nft_chain_nat_init);
+module_exit(nft_chain_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
+MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
new file mode 100644
index 000000000000..6b84e097b8fc
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
+					int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	u32 mark;
+	__be32 saddr, daddr;
+	u_int8_t tos;
+	const struct iphdr *iph;
+
+	/* root is playing with raw sockets. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	mark = skb->mark;
+	iph = ip_hdr(skb);
+	saddr = iph->saddr;
+	daddr = iph->daddr;
+	tos = iph->tos;
+
+	ret = nft_do_chain(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_QUEUE) {
+		iph = ip_hdr(skb);
+
+		if (iph->saddr != saddr ||
+		    iph->daddr != daddr ||
+		    skb->mark != mark ||
+		    iph->tos != tos)
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+	}
+	return ret;
+}
+
+static struct nf_chain_type nft_chain_route_ipv4 = {
+	.family		= NFPROTO_IPV4,
+	.name		= "route",
+	.type		= NFT_CHAIN_T_ROUTE,
+	.hook_mask	= (1 << NF_INET_LOCAL_OUT),
+	.fn		= {
+		[NF_INET_LOCAL_OUT]	= nf_route_table_hook,
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init nft_chain_route_init(void)
+{
+	return nft_register_chain_type(&nft_chain_route_ipv4);
+}
+
+static void __exit nft_chain_route_exit(void)
+{
+	nft_unregister_chain_type(&nft_chain_route_ipv4);
+}
+
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 5677e38eeca3..23833064b7b5 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -29,9 +29,9 @@ config NF_TABLES_IPV6
 	depends on NF_TABLES
 	tristate "IPv6 nf_tables support"
 
-config NF_TABLE_ROUTE_IPV6
+config NFT_CHAIN_ROUTE_IPV6
 	depends on NF_TABLES_IPV6
-	tristate "IPv6 nf_tables route table support"
+	tristate "IPv6 nf_tables route chain support"
 
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 956af4492d10..be4913aa524d 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -25,7 +25,7 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 
 # nf_tables
 obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
-obj-$(CONFIG_NF_TABLE_ROUTE_IPV6) += nf_table_route_ipv6.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
 
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/nf_table_route_ipv6.c b/net/ipv6/netfilter/nf_table_route_ipv6.c
deleted file mode 100644
index 48ac65c7b398..000000000000
--- a/net/ipv6/netfilter/nf_table_route_ipv6.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/route.h>
-
-static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
-					struct sk_buff *skb,
-					const struct net_device *in,
-					const struct net_device *out,
-					int (*okfn)(struct sk_buff *))
-{
-	unsigned int ret;
-	struct in6_addr saddr, daddr;
-	u_int8_t hop_limit;
-	u32 mark, flowlabel;
-
-	/* save source/dest address, mark, hoplimit, flowlabel, priority */
-	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
-	memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
-	mark = skb->mark;
-	hop_limit = ipv6_hdr(skb)->hop_limit;
-
-	/* flowlabel and prio (includes version, which shouldn't change either */
-	flowlabel = *((u32 *)ipv6_hdr(skb));
-
-	ret = nft_do_chain(ops, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_QUEUE &&
-	    (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
-	     memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
-	     skb->mark != mark ||
-	     ipv6_hdr(skb)->hop_limit != hop_limit ||
-	     flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
-		return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP;
-
-	return ret;
-}
-
-static struct nft_base_chain nf_chain_route_output __read_mostly = {
-	.chain	= {
-		.name		= "OUTPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_route_table_hook,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP6_PRI_MANGLE,
-		.priv		= &nf_chain_route_output.chain,
-	},
-};
-
-static struct nft_table nf_table_route_ipv6 __read_mostly = {
-	.name	= "route",
-	.chains	= LIST_HEAD_INIT(nf_table_route_ipv6.chains),
-};
-
-static int __init nf_table_route_init(void)
-{
-	list_add_tail(&nf_chain_route_output.chain.list,
-		      &nf_table_route_ipv6.chains);
-	return nft_register_table(&nf_table_route_ipv6, NFPROTO_IPV6);
-}
-
-static void __exit nf_table_route_exit(void)
-{
-	nft_unregister_table(&nf_table_route_ipv6, NFPROTO_IPV6);
-}
-
-module_init(nf_table_route_init);
-module_exit(nf_table_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_TABLE(AF_INET6, "route");
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index e0717cea4913..3631d6238e6f 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -39,14 +40,33 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	},
 };
 
+static struct nf_chain_type filter_ipv6 = {
+	.family		= NFPROTO_IPV6,
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING),
+	.fn		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain,
+		[NF_INET_FORWARD]	= nft_do_chain,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain,
+		[NF_INET_POST_ROUTING]	= nft_do_chain,
+	},
+};
+
 static int __init nf_tables_ipv6_init(void)
 {
+	nft_register_chain_type(&filter_ipv6);
 	return nft_register_afinfo(&nft_af_ipv6);
 }
-
 static void __exit nf_tables_ipv6_exit(void)
 {
 	nft_unregister_afinfo(&nft_af_ipv6);
+	nft_unregister_chain_type(&filter_ipv6);
 }
 
 module_init(nf_tables_ipv6_init);
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
new file mode 100644
index 000000000000..4cdc992fa067
--- /dev/null
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/route.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
+					int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	struct in6_addr saddr, daddr;
+	u_int8_t hop_limit;
+	u32 mark, flowlabel;
+
+	/* save source/dest address, mark, hoplimit, flowlabel, priority */
+	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
+	memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
+	mark = skb->mark;
+	hop_limit = ipv6_hdr(skb)->hop_limit;
+
+	/* flowlabel and prio (includes version, which shouldn't change either */
+	flowlabel = *((u32 *)ipv6_hdr(skb));
+
+	ret = nft_do_chain(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_QUEUE &&
+	    (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
+	     memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
+	     skb->mark != mark ||
+	     ipv6_hdr(skb)->hop_limit != hop_limit ||
+	     flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
+		return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP;
+
+	return ret;
+}
+
+static struct nf_chain_type nft_chain_route_ipv6 = {
+	.family		= NFPROTO_IPV6,
+	.name		= "route",
+	.type		= NFT_CHAIN_T_ROUTE,
+	.hook_mask	= (1 << NF_INET_LOCAL_OUT),
+	.fn		= {
+                [NF_INET_LOCAL_OUT]	= nf_route_table_hook,
+        },
+        .me		= THIS_MODULE,
+};
+
+static int __init nft_chain_route_init(void)
+{
+	return nft_register_chain_type(&nft_chain_route_ipv6);
+}
+
+static void __exit nft_chain_route_exit(void)
+{
+	nft_unregister_chain_type(&nft_chain_route_ipv6);
+}
+
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6dac9a3c9c40..9c2d8d5af843 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -104,8 +104,7 @@ static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
 }
 
 static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
-						const struct nlattr *nla,
-						bool autoload)
+						const struct nlattr *nla)
 {
 	struct nft_table *table;
 
@@ -116,16 +115,6 @@ static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
 	if (table != NULL)
 		return table;
 
-#ifdef CONFIG_MODULES
-	if (autoload) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-table-%u-%*.s", afi->family,
-			       nla_len(nla)-1, (const char *)nla_data(nla));
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		if (nft_table_lookup(afi, nla))
-			return ERR_PTR(-EAGAIN);
-	}
-#endif
 	return ERR_PTR(-ENOENT);
 }
 
@@ -134,6 +123,39 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table)
 	return ++table->hgenerator;
 }
 
+static struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX];
+
+static int __nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
+{
+	int i;
+
+	for (i=0; i<NFT_CHAIN_T_MAX; i++) {
+		if (chain_type[family][i] != NULL &&
+		    !nla_strcmp(nla, chain_type[family][i]->name))
+			return i;
+	}
+	return -1;
+}
+
+static int nf_tables_chain_type_lookup(const struct nft_af_info *afi,
+				       const struct nlattr *nla,
+				       bool autoload)
+{
+	int type;
+
+	type = __nf_tables_chain_type_lookup(afi->family, nla);
+#ifdef CONFIG_MODULES
+	if (type < 0 && autoload) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-chain-%u-%*.s", afi->family,
+			       nla_len(nla)-1, (const char *)nla_data(nla));
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		type = __nf_tables_chain_type_lookup(afi->family, nla);
+	}
+#endif
+	return type;
+}
+
 static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
 	[NFTA_TABLE_NAME]	= { .type = NLA_STRING },
 };
@@ -258,7 +280,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -294,7 +316,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 		return PTR_ERR(afi);
 
 	name = nla[NFTA_TABLE_NAME];
-	table = nf_tables_table_lookup(afi, name, false);
+	table = nf_tables_table_lookup(afi, name);
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
@@ -335,13 +357,10 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	if (table->flags & NFT_TABLE_BUILTIN)
-		return -EOPNOTSUPP;
-
 	if (table->use)
 		return -EBUSY;
 
@@ -351,99 +370,34 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	return 0;
 }
 
-static struct nft_table *__nf_tables_table_lookup(const struct nft_af_info *afi,
-						  const char *name)
+int nft_register_chain_type(struct nf_chain_type *ctype)
 {
-	struct nft_table *table;
-
-	list_for_each_entry(table, &afi->tables, list) {
-		if (!strcmp(name, table->name))
-			return table;
-	}
-
-	return ERR_PTR(-ENOENT);
-}
-
-static int nf_tables_chain_notify(const struct sk_buff *oskb,
-				  const struct nlmsghdr *nlh,
-				  const struct nft_table *table,
-				  const struct nft_chain *chain,
-				  int event, int family);
-
-/**
- *	nft_register_table - register a built-in table
- *
- *	@table: the table to register
- *	@family: protocol family to register table with
- *
- *	Register a built-in table for use with nf_tables. Returns zero on
- *	success or a negative errno code otherwise.
- */
-int nft_register_table(struct nft_table *table, int family)
-{
-	struct nft_af_info *afi;
-	struct nft_table *t;
-	struct nft_chain *chain;
-	int err;
+	int err = 0;
 
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-again:
-	afi = nf_tables_afinfo_lookup(family, true);
-	if (IS_ERR(afi)) {
-		err = PTR_ERR(afi);
-		if (err == -EAGAIN)
-			goto again;
-		goto err;
-	}
-
-	t = __nf_tables_table_lookup(afi, table->name);
-	if (IS_ERR(t)) {
-		err = PTR_ERR(t);
-		if (err != -ENOENT)
-			goto err;
-		t = NULL;
+	if (chain_type[ctype->family][ctype->type] != NULL) {
+		err = -EBUSY;
+		goto out;
 	}
 
-	if (t != NULL) {
-		err = -EEXIST;
-		goto err;
-	}
+	if (!try_module_get(ctype->me))
+		goto out;
 
-	table->flags |= NFT_TABLE_BUILTIN;
-	INIT_LIST_HEAD(&table->sets);
-	list_add_tail(&table->list, &afi->tables);
-	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_NEWTABLE, family);
-	list_for_each_entry(chain, &table->chains, list)
-		nf_tables_chain_notify(NULL, NULL, table, chain,
-				       NFT_MSG_NEWCHAIN, family);
-	err = 0;
-err:
+	chain_type[ctype->family][ctype->type] = ctype;
+out:
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 	return err;
 }
-EXPORT_SYMBOL_GPL(nft_register_table);
+EXPORT_SYMBOL_GPL(nft_register_chain_type);
 
-/**
- *	nft_unregister_table - unregister a built-in table
- *
- *	@table: the table to unregister
- *	@family: protocol family to unregister table with
- *
- *	Unregister a built-in table for use with nf_tables.
- */
-void nft_unregister_table(struct nft_table *table, int family)
+void nft_unregister_chain_type(struct nf_chain_type *ctype)
 {
-	struct nft_chain *chain;
-
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_del(&table->list);
-	list_for_each_entry(chain, &table->chains, list)
-		nf_tables_chain_notify(NULL, NULL, table, chain,
-				       NFT_MSG_DELCHAIN, family);
-	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_DELTABLE, family);
+	chain_type[ctype->family][ctype->type] = NULL;
+	module_put(ctype->me);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 }
-EXPORT_SYMBOL_GPL(nft_unregister_table);
+EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
 
 /*
  * Chains
@@ -484,6 +438,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
 	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING,
 				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
 	[NFTA_CHAIN_HOOK]	= { .type = NLA_NESTED },
+	[NFTA_CHAIN_TYPE]	= { .type = NLA_NUL_STRING },
 };
 
 static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -526,6 +481,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 		if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
 			goto nla_put_failure;
 		nla_nest_end(skb, nest);
+
+		if (nla_put_string(skb, NFTA_CHAIN_TYPE,
+			chain_type[ops->pf][nft_base_chain(chain)->type]->name))
+				goto nla_put_failure;
 	}
 
 	return nlmsg_end(skb, nlh);
@@ -633,7 +592,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -680,7 +639,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], create);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -722,6 +681,17 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	if (nla[NFTA_CHAIN_HOOK]) {
 		struct nf_hook_ops *ops;
+		nf_hookfn *hookfn;
+		u32 hooknum;
+		int type = NFT_CHAIN_T_DEFAULT;
+
+		if (nla[NFTA_CHAIN_TYPE]) {
+			type = nf_tables_chain_type_lookup(afi,
+							   nla[NFTA_CHAIN_TYPE],
+							   create);
+			if (type < 0)
+				return -ENOENT;
+		}
 
 		err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
 				       nft_hook_policy);
@@ -730,12 +700,20 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
 		    ha[NFTA_HOOK_PRIORITY] == NULL)
 			return -EINVAL;
-		if (ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])) >= afi->nhooks)
+
+		hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+		if (hooknum >= afi->nhooks)
 			return -EINVAL;
 
+		hookfn = chain_type[family][type]->fn[hooknum];
+		if (hookfn == NULL)
+			return -EOPNOTSUPP;
+
 		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
 		if (basechain == NULL)
 			return -ENOMEM;
+
+		basechain->type = type;
 		chain = &basechain->chain;
 
 		ops = &basechain->ops;
@@ -744,7 +722,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		ops->hooknum	= ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
 		ops->priority	= ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
 		ops->priv	= chain;
-		ops->hook	= nft_do_chain;
+		ops->hook       = hookfn;
 		if (afi->hooks[ops->hooknum])
 			ops->hook = afi->hooks[ops->hooknum];
 
@@ -793,7 +771,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -801,9 +779,6 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
-	if (chain->flags & NFT_CHAIN_BUILTIN)
-		return -EOPNOTSUPP;
-
 	if (!list_empty(&chain->rules))
 		return -EBUSY;
 
@@ -1190,7 +1165,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1268,7 +1243,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], create);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1374,7 +1349,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1490,7 +1465,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
 		return PTR_ERR(afi);
 
 	if (nla[NFTA_SET_TABLE] != NULL) {
-		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], false);
+		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
 		if (IS_ERR(table))
 			return PTR_ERR(table);
 	}
@@ -1820,7 +1795,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], create);
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -2008,7 +1983,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-- 
cgit v1.2.3


From 0ca743a5599199152a31a7146b83213c786c2eb2 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 14 Oct 2013 00:06:06 +0200
Subject: netfilter: nf_tables: add compatibility layer for x_tables

This patch adds the x_tables compatibility layer. This allows you
to use existing x_tables matches and targets from nf_tables.

This compatibility later allows us to use existing matches/targets
for features that are still missing in nf_tables. We can progressively
replace them with native nf_tables extensions. It also provides the
userspace compatibility software that allows you to express the
rule-set using the iptables syntax but using the nf_tables kernel
components.

In order to get this compatibility layer working, I've done the
following things:

* add NFNL_SUBSYS_NFT_COMPAT: this new nfnetlink subsystem is used
to query the x_tables match/target revision, so we don't need to
use the native x_table getsockopt interface.

* emulate xt structures: this required extending the struct nft_pktinfo
to include the fragment offset, which is already obtained from
ip[6]_tables and that is used by some matches/targets.

* add support for default policy to base chains, required to emulate
  x_tables.

* add NFTA_CHAIN_USE attribute to obtain the number of references to
  chains, required by x_tables emulation.

* add chain packet/byte counters using per-cpu.

* support 32-64 bits compat.

For historical reasons, this patch includes the following patches
that were posted in the netfilter-devel mailing list.

From Pablo Neira Ayuso:
* nf_tables: add default policy to base chains
* netfilter: nf_tables: add NFTA_CHAIN_USE attribute
* nf_tables: nft_compat: private data of target and matches in contiguous area
* nf_tables: validate hooks for compat match/target
* nf_tables: nft_compat: release cached matches/targets
* nf_tables: x_tables support as a compile time option
* nf_tables: fix alias for xtables over nftables module
* nf_tables: add packet and byte counters per chain
* nf_tables: fix per-chain counter stats if no counters are passed
* nf_tables: don't bump chain stats
* nf_tables: add protocol and flags for xtables over nf_tables
* nf_tables: add ip[6]t_entry emulation
* nf_tables: move specific layer 3 compat code to nf_tables_ipv[4|6]
* nf_tables: support 32bits-64bits x_tables compat
* nf_tables: fix compilation if CONFIG_COMPAT is disabled

From Patrick McHardy:
* nf_tables: move policy to struct nft_base_chain
* nf_tables: send notifications for base chain policy changes

From Alexander Primak:
* nf_tables: remove the duplicate NF_INET_LOCAL_OUT

From Nicolas Dichtel:
* nf_tables: fix compilation when nf-netlink is a module

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h               |  44 +-
 include/net/netfilter/nf_tables_ipv4.h          |  23 +
 include/net/netfilter/nf_tables_ipv6.h          |  30 +
 include/uapi/linux/netfilter/Kbuild             |   1 +
 include/uapi/linux/netfilter/nf_tables.h        |  32 +
 include/uapi/linux/netfilter/nf_tables_compat.h |  38 ++
 include/uapi/linux/netfilter/nfnetlink.h        |   3 +-
 net/ipv4/netfilter/nf_tables_ipv4.c             |  32 +-
 net/ipv4/netfilter/nft_chain_nat_ipv4.c         |   6 +-
 net/ipv4/netfilter/nft_chain_route_ipv4.c       |   6 +-
 net/ipv6/netfilter/nf_tables_ipv6.c             |  33 +-
 net/ipv6/netfilter/nft_chain_route_ipv6.c       |   8 +-
 net/netfilter/Kconfig                           |   9 +
 net/netfilter/Makefile                          |   1 +
 net/netfilter/nf_tables_api.c                   | 220 ++++++-
 net/netfilter/nf_tables_core.c                  |  46 +-
 net/netfilter/nft_cmp.c                         |   3 +-
 net/netfilter/nft_compat.c                      | 768 ++++++++++++++++++++++++
 net/netfilter/nft_immediate.c                   |  12 +-
 net/netfilter/nft_payload.c                     |   4 +-
 20 files changed, 1241 insertions(+), 78 deletions(-)
 create mode 100644 include/net/netfilter/nf_tables_ipv4.h
 create mode 100644 include/net/netfilter/nf_tables_ipv6.h
 create mode 100644 include/uapi/linux/netfilter/nf_tables_compat.h
 create mode 100644 net/netfilter/nft_compat.c

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 8403f7f52e81..a68f45f0fe2e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -3,6 +3,7 @@
 
 #include <linux/list.h>
 #include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netlink.h>
 
@@ -15,8 +16,23 @@ struct nft_pktinfo {
 	u8				hooknum;
 	u8				nhoff;
 	u8				thoff;
+	/* for x_tables compatibility */
+	struct xt_action_param		xt;
 };
 
+static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
+				   const struct nf_hook_ops *ops,
+				   struct sk_buff *skb,
+				   const struct net_device *in,
+				   const struct net_device *out)
+{
+	pkt->skb = skb;
+	pkt->in = pkt->xt.in = in;
+	pkt->out = pkt->xt.out = out;
+	pkt->hooknum = pkt->xt.hooknum = ops->hooknum;
+	pkt->xt.family = ops->pf;
+}
+
 struct nft_data {
 	union {
 		u32				data[4];
@@ -57,6 +73,7 @@ static inline void nft_data_debug(const struct nft_data *data)
  * 	@afi: address family info
  * 	@table: the table the chain is contained in
  * 	@chain: the chain the rule is contained in
+ *	@nla: netlink attributes
  */
 struct nft_ctx {
 	const struct sk_buff		*skb;
@@ -64,6 +81,7 @@ struct nft_ctx {
 	const struct nft_af_info	*afi;
 	const struct nft_table		*table;
 	const struct nft_chain		*chain;
+	const struct nlattr * const 	*nla;
 };
 
 struct nft_data_desc {
@@ -235,7 +253,8 @@ extern void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
  *	@maxattr: highest netlink attribute number
  */
 struct nft_expr_type {
-	const struct nft_expr_ops	*(*select_ops)(const struct nlattr * const tb[]);
+	const struct nft_expr_ops	*(*select_ops)(const struct nft_ctx *,
+						       const struct nlattr * const tb[]);
 	const struct nft_expr_ops	*ops;
 	struct list_head		list;
 	const char			*name;
@@ -253,6 +272,8 @@ struct nft_expr_type {
  *	@destroy: destruction function
  *	@dump: function to dump parameters
  *	@type: expression type
+ *	@validate: validate expression, called during loop detection
+ *	@data: extra data to attach to this expression operation
  */
 struct nft_expr;
 struct nft_expr_ops {
@@ -267,8 +288,11 @@ struct nft_expr_ops {
 	void				(*destroy)(const struct nft_expr *expr);
 	int				(*dump)(struct sk_buff *skb,
 						const struct nft_expr *expr);
-	const struct nft_data *		(*get_verdict)(const struct nft_expr *expr);
+	int				(*validate)(const struct nft_ctx *ctx,
+						    const struct nft_expr *expr,
+						    const struct nft_data **data);
 	const struct nft_expr_type	*type;
+	void				*data;
 };
 
 #define NFT_EXPR_MAXATTR		16
@@ -368,16 +392,25 @@ enum nft_chain_type {
 	NFT_CHAIN_T_MAX
 };
 
+struct nft_stats {
+	u64 bytes;
+	u64 pkts;
+};
+
 /**
  *	struct nft_base_chain - nf_tables base chain
  *
  *	@ops: netfilter hook ops
  *	@type: chain type
+ *	@policy: default policy
+ *	@stats: per-cpu chain stats
  *	@chain: the chain
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops;
 	enum nft_chain_type		type;
+	u8				policy;
+	struct nft_stats __percpu	*stats;
 	struct nft_chain		chain;
 };
 
@@ -386,11 +419,8 @@ static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chai
 	return container_of(chain, struct nft_base_chain, chain);
 }
 
-extern unsigned int nft_do_chain(const struct nf_hook_ops *ops,
-				 struct sk_buff *skb,
-				 const struct net_device *in,
-				 const struct net_device *out,
-				 int (*okfn)(struct sk_buff *));
+extern unsigned int nft_do_chain_pktinfo(struct nft_pktinfo *pkt,
+					 const struct nf_hook_ops *ops);
 
 /**
  *	struct nft_table - nf_tables table
diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
new file mode 100644
index 000000000000..1be1c2c197ee
--- /dev/null
+++ b/include/net/netfilter/nf_tables_ipv4.h
@@ -0,0 +1,23 @@
+#ifndef _NF_TABLES_IPV4_H_
+#define _NF_TABLES_IPV4_H_
+
+#include <net/netfilter/nf_tables.h>
+#include <net/ip.h>
+
+static inline void
+nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
+		     const struct nf_hook_ops *ops,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out)
+{
+	struct iphdr *ip;
+
+	nft_set_pktinfo(pkt, ops, skb, in, out);
+
+	pkt->xt.thoff = ip_hdrlen(pkt->skb);
+	ip = ip_hdr(pkt->skb);
+	pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+}
+
+#endif
diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
new file mode 100644
index 000000000000..4a9b88a65963
--- /dev/null
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -0,0 +1,30 @@
+#ifndef _NF_TABLES_IPV6_H_
+#define _NF_TABLES_IPV6_H_
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/ipv6.h>
+
+static inline int
+nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+		     const struct nf_hook_ops *ops,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out)
+{
+	int protohdr, thoff = 0;
+	unsigned short frag_off;
+
+	nft_set_pktinfo(pkt, ops, skb, in, out);
+
+	protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
+	/* If malformed, drop it */
+	if (protohdr < 0)
+		return -1;
+
+	pkt->xt.thoff = thoff;
+	pkt->xt.fragoff = frag_off;
+
+	return 0;
+}
+
+#endif
diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index 6ce0b7f566a7..17c3af2c4bb9 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -6,6 +6,7 @@ header-y += nf_conntrack_sctp.h
 header-y += nf_conntrack_tcp.h
 header-y += nf_conntrack_tuple_common.h
 header-y += nf_tables.h
+header-y += nf_tables_compat.h
 header-y += nf_nat.h
 header-y += nfnetlink.h
 header-y += nfnetlink_acct.h
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 779cf951c8de..1563875e6942 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -115,7 +115,10 @@ enum nft_table_attributes {
  * @NFTA_CHAIN_HANDLE: numeric handle of the chain (NLA_U64)
  * @NFTA_CHAIN_NAME: name of the chain (NLA_STRING)
  * @NFTA_CHAIN_HOOK: hook specification for basechains (NLA_NESTED: nft_hook_attributes)
+ * @NFTA_CHAIN_POLICY: numeric policy of the chain (NLA_U32)
+ * @NFTA_CHAIN_USE: number of references to this chain (NLA_U32)
  * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING)
+ * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes)
  */
 enum nft_chain_attributes {
 	NFTA_CHAIN_UNSPEC,
@@ -123,7 +126,10 @@ enum nft_chain_attributes {
 	NFTA_CHAIN_HANDLE,
 	NFTA_CHAIN_NAME,
 	NFTA_CHAIN_HOOK,
+	NFTA_CHAIN_POLICY,
+	NFTA_CHAIN_USE,
 	NFTA_CHAIN_TYPE,
+	NFTA_CHAIN_COUNTERS,
 	__NFTA_CHAIN_MAX
 };
 #define NFTA_CHAIN_MAX		(__NFTA_CHAIN_MAX - 1)
@@ -135,6 +141,7 @@ enum nft_chain_attributes {
  * @NFTA_RULE_CHAIN: name of the chain containing the rule (NLA_STRING)
  * @NFTA_RULE_HANDLE: numeric handle of the rule (NLA_U64)
  * @NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes)
+ * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes)
  */
 enum nft_rule_attributes {
 	NFTA_RULE_UNSPEC,
@@ -142,10 +149,35 @@ enum nft_rule_attributes {
 	NFTA_RULE_CHAIN,
 	NFTA_RULE_HANDLE,
 	NFTA_RULE_EXPRESSIONS,
+	NFTA_RULE_COMPAT,
 	__NFTA_RULE_MAX
 };
 #define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
 
+/**
+ * enum nft_rule_compat_flags - nf_tables rule compat flags
+ *
+ * @NFT_RULE_COMPAT_F_INV: invert the check result
+ */
+enum nft_rule_compat_flags {
+	NFT_RULE_COMPAT_F_INV	= (1 << 1),
+	NFT_RULE_COMPAT_F_MASK	= NFT_RULE_COMPAT_F_INV,
+};
+
+/**
+ * enum nft_rule_compat_attributes - nf_tables rule compat attributes
+ *
+ * @NFTA_RULE_COMPAT_PROTO: numerice value of handled protocol (NLA_U32)
+ * @NFTA_RULE_COMPAT_FLAGS: bitmask of enum nft_rule_compat_flags (NLA_U32)
+ */
+enum nft_rule_compat_attributes {
+	NFTA_RULE_COMPAT_UNSPEC,
+	NFTA_RULE_COMPAT_PROTO,
+	NFTA_RULE_COMPAT_FLAGS,
+	__NFTA_RULE_COMPAT_MAX
+};
+#define NFTA_RULE_COMPAT_MAX	(__NFTA_RULE_COMPAT_MAX - 1)
+
 /**
  * enum nft_set_flags - nf_tables set flags
  *
diff --git a/include/uapi/linux/netfilter/nf_tables_compat.h b/include/uapi/linux/netfilter/nf_tables_compat.h
new file mode 100644
index 000000000000..8310f5f76551
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_tables_compat.h
@@ -0,0 +1,38 @@
+#ifndef _NFT_COMPAT_NFNETLINK_H_
+#define _NFT_COMPAT_NFNETLINK_H_
+
+enum nft_target_attributes {
+	NFTA_TARGET_UNSPEC,
+	NFTA_TARGET_NAME,
+	NFTA_TARGET_REV,
+	NFTA_TARGET_INFO,
+	__NFTA_TARGET_MAX
+};
+#define NFTA_TARGET_MAX		(__NFTA_TARGET_MAX - 1)
+
+enum nft_match_attributes {
+	NFTA_MATCH_UNSPEC,
+	NFTA_MATCH_NAME,
+	NFTA_MATCH_REV,
+	NFTA_MATCH_INFO,
+	__NFTA_MATCH_MAX
+};
+#define NFTA_MATCH_MAX		(__NFTA_MATCH_MAX - 1)
+
+#define NFT_COMPAT_NAME_MAX	32
+
+enum {
+	NFNL_MSG_COMPAT_GET,
+	NFNL_MSG_COMPAT_MAX
+};
+
+enum {
+	NFTA_COMPAT_UNSPEC = 0,
+	NFTA_COMPAT_NAME,
+	NFTA_COMPAT_REV,
+	NFTA_COMPAT_TYPE,
+	__NFTA_COMPAT_MAX,
+};
+#define NFTA_COMPAT_MAX (__NFTA_COMPAT_MAX - 1)
+
+#endif
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index d276c3bd55b8..288959404d54 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -54,6 +54,7 @@ struct nfgenmsg {
 #define NFNL_SUBSYS_CTNETLINK_TIMEOUT	8
 #define NFNL_SUBSYS_CTHELPER		9
 #define NFNL_SUBSYS_NFTABLES		10
-#define NFNL_SUBSYS_COUNT		11
+#define NFNL_SUBSYS_NFT_COMPAT		11
+#define NFNL_SUBSYS_COUNT		12
 
 #endif /* _UAPI_NFNETLINK_H */
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 23525c4c0192..c61cffb9b760 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -15,6 +15,8 @@
 #include <linux/netfilter_ipv4.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/netfilter/nf_tables_ipv4.h>
 
 static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
 				    struct sk_buff *skb,
@@ -22,6 +24,8 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
 				    const struct net_device *out,
 				    int (*okfn)(struct sk_buff *))
 {
+	struct nft_pktinfo pkt;
+
 	if (unlikely(skb->len < sizeof(struct iphdr) ||
 		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
 		if (net_ratelimit())
@@ -29,8 +33,9 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
 				"packet\n");
 		return NF_ACCEPT;
 	}
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
 
-	return nft_do_chain(ops, skb, in, out, okfn);
+	return nft_do_chain_pktinfo(&pkt, ops);
 }
 
 static struct nft_af_info nft_af_ipv4 __read_mostly = {
@@ -42,6 +47,21 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	},
 };
 
+
+static unsigned int
+nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+		  struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  int (*okfn)(struct sk_buff *))
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+	return nft_do_chain_pktinfo(&pkt, ops);
+}
+
 static struct nf_chain_type filter_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "filter",
@@ -52,11 +72,11 @@ static struct nf_chain_type filter_ipv4 = {
 			  (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING),
 	.fn		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain,
-		[NF_INET_LOCAL_OUT]	= nft_do_chain,
-		[NF_INET_FORWARD]	= nft_do_chain,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain,
-		[NF_INET_POST_ROUTING]	= nft_do_chain,
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
+		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv4,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
 	},
 };
 
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index cd286306be85..e09c201adf84 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -23,6 +23,7 @@
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/ip.h>
 
@@ -181,6 +182,7 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	struct nf_conn_nat *nat;
 	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	struct nft_pktinfo pkt;
 	unsigned int ret;
 
 	if (ct == NULL || nf_ct_is_untracked(ct))
@@ -213,7 +215,9 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
 		if (nf_nat_initialized(ct, maniptype))
 			break;
 
-		ret = nft_do_chain(ops, skb, in, out, okfn);
+		nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+		ret = nft_do_chain_pktinfo(&pkt, ops);
 		if (ret != NF_ACCEPT)
 			return ret;
 		if (!nf_nat_initialized(ct, maniptype)) {
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 6b84e097b8fc..4e6bf9a3d7aa 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -17,6 +17,7 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
 #include <net/route.h>
 #include <net/ip.h>
 
@@ -27,6 +28,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 					int (*okfn)(struct sk_buff *))
 {
 	unsigned int ret;
+	struct nft_pktinfo pkt;
 	u32 mark;
 	__be32 saddr, daddr;
 	u_int8_t tos;
@@ -37,13 +39,15 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
 	mark = skb->mark;
 	iph = ip_hdr(skb);
 	saddr = iph->saddr;
 	daddr = iph->daddr;
 	tos = iph->tos;
 
-	ret = nft_do_chain(ops, skb, in, out, okfn);
+	ret = nft_do_chain_pktinfo(&pkt, ops);
 	if (ret != NF_DROP && ret != NF_QUEUE) {
 		iph = ip_hdr(skb);
 
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 3631d6238e6f..42f905a808a3 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -14,6 +14,7 @@
 #include <linux/ipv6.h>
 #include <linux/netfilter_ipv6.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
 
 static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
 				    struct sk_buff *skb,
@@ -21,14 +22,18 @@ static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
 				    const struct net_device *out,
 				    int (*okfn)(struct sk_buff *))
 {
+	struct nft_pktinfo pkt;
+
 	if (unlikely(skb->len < sizeof(struct ipv6hdr))) {
 		if (net_ratelimit())
 			pr_info("nf_tables_ipv6: ignoring short SOCK_RAW "
 				"packet\n");
 		return NF_ACCEPT;
 	}
+	if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0)
+		return NF_DROP;
 
-	return nft_do_chain(ops, skb, in, out, okfn);
+	return nft_do_chain_pktinfo(&pkt, ops);
 }
 
 static struct nft_af_info nft_af_ipv6 __read_mostly = {
@@ -40,6 +45,22 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	},
 };
 
+static unsigned int
+nft_do_chain_ipv6(const struct nf_hook_ops *ops,
+		  struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  int (*okfn)(struct sk_buff *))
+{
+	struct nft_pktinfo pkt;
+
+	/* malformed packet, drop it */
+	if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0)
+		return NF_DROP;
+
+	return nft_do_chain_pktinfo(&pkt, ops);
+}
+
 static struct nf_chain_type filter_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.name		= "filter",
@@ -50,11 +71,11 @@ static struct nf_chain_type filter_ipv6 = {
 			  (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING),
 	.fn		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain,
-		[NF_INET_LOCAL_OUT]	= nft_do_chain,
-		[NF_INET_FORWARD]	= nft_do_chain,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain,
-		[NF_INET_POST_ROUTING]	= nft_do_chain,
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv6,
+		[NF_INET_LOCAL_OUT]	= nft_ipv6_output,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv6,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv6,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv6,
 	},
 };
 
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 4cdc992fa067..3fe40f0456ad 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -19,6 +19,7 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
 #include <net/route.h>
 
 static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
@@ -28,10 +29,15 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 					int (*okfn)(struct sk_buff *))
 {
 	unsigned int ret;
+	struct nft_pktinfo pkt;
 	struct in6_addr saddr, daddr;
 	u_int8_t hop_limit;
 	u32 mark, flowlabel;
 
+	/* malformed packet, drop it */
+	if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0)
+		return NF_DROP;
+
 	/* save source/dest address, mark, hoplimit, flowlabel, priority */
 	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
 	memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
@@ -41,7 +47,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	/* flowlabel and prio (includes version, which shouldn't change either */
 	flowlabel = *((u32 *)ipv6_hdr(skb));
 
-	ret = nft_do_chain(ops, skb, in, out, okfn);
+	ret = nft_do_chain_pktinfo(&pkt, ops);
 	if (ret != NF_DROP && ret != NF_QUEUE &&
 	    (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
 	     memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index aa184a46bbf3..49e362707379 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -450,6 +450,15 @@ config NFT_LIMIT
 	depends on NF_TABLES
 	tristate "Netfilter nf_tables limit module"
 
+config NFT_COMPAT
+	depends on NF_TABLES
+	depends on NETFILTER_XTABLES
+	tristate "Netfilter x_tables over nf_tables module"
+	help
+	  This is required if you intend to use any of existing
+	  x_tables match/target extensions over the nf_tables
+	  framework.
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index b6b78754e4cc..a6781450b6fb 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -70,6 +70,7 @@ nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o
 nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
+obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
 obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9c2d8d5af843..61e017b349cb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -438,7 +438,9 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
 	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING,
 				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
 	[NFTA_CHAIN_HOOK]	= { .type = NLA_NESTED },
+	[NFTA_CHAIN_POLICY]	= { .type = NLA_U32 },
 	[NFTA_CHAIN_TYPE]	= { .type = NLA_NUL_STRING },
+	[NFTA_CHAIN_COUNTERS]	= { .type = NLA_NESTED },
 };
 
 static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -446,6 +448,33 @@ static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
 	[NFTA_HOOK_PRIORITY]	= { .type = NLA_U32 },
 };
 
+static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
+{
+	struct nft_stats *cpu_stats, total;
+	struct nlattr *nest;
+	int cpu;
+
+	memset(&total, 0, sizeof(total));
+	for_each_possible_cpu(cpu) {
+		cpu_stats = per_cpu_ptr(stats, cpu);
+		total.pkts += cpu_stats->pkts;
+		total.bytes += cpu_stats->bytes;
+	}
+	nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) ||
+	    nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -ENOSPC;
+}
+
 static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 				     int event, u32 flags, int family,
 				     const struct nft_table *table,
@@ -472,8 +501,11 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 		goto nla_put_failure;
 
 	if (chain->flags & NFT_BASE_CHAIN) {
-		const struct nf_hook_ops *ops = &nft_base_chain(chain)->ops;
-		struct nlattr *nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
+		const struct nft_base_chain *basechain = nft_base_chain(chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+		struct nlattr *nest;
+
+		nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
 		if (nest == NULL)
 			goto nla_put_failure;
 		if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
@@ -482,11 +514,21 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 			goto nla_put_failure;
 		nla_nest_end(skb, nest);
 
+		if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
+				 htonl(basechain->policy)))
+			goto nla_put_failure;
+
 		if (nla_put_string(skb, NFTA_CHAIN_TYPE,
 			chain_type[ops->pf][nft_base_chain(chain)->type]->name))
 				goto nla_put_failure;
+
+		if (nft_dump_stats(skb, nft_base_chain(chain)->stats))
+			goto nla_put_failure;
 	}
 
+	if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
+		goto nla_put_failure;
+
 	return nlmsg_end(skb, nlh);
 
 nla_put_failure:
@@ -617,6 +659,67 @@ err:
 	return err;
 }
 
+static int
+nf_tables_chain_policy(struct nft_base_chain *chain, const struct nlattr *attr)
+{
+	switch (ntohl(nla_get_be32(attr))) {
+	case NF_DROP:
+		chain->policy = NF_DROP;
+		break;
+	case NF_ACCEPT:
+		chain->policy = NF_ACCEPT;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 },
+	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 },
+};
+
+static int
+nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr)
+{
+	struct nlattr *tb[NFTA_COUNTER_MAX+1];
+	struct nft_stats __percpu *newstats;
+	struct nft_stats *stats;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS])
+		return -EINVAL;
+
+	newstats = alloc_percpu(struct nft_stats);
+	if (newstats == NULL)
+		return -ENOMEM;
+
+	/* Restore old counters on this cpu, no problem. Per-cpu statistics
+	 * are not exposed to userspace.
+	 */
+	stats = this_cpu_ptr(newstats);
+	stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+	stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+
+	if (chain->stats) {
+		/* nfnl_lock is held, add some nfnl function for this, later */
+		struct nft_stats __percpu *oldstats =
+			rcu_dereference_protected(chain->stats, 1);
+
+		rcu_assign_pointer(chain->stats, newstats);
+		synchronize_rcu();
+		free_percpu(oldstats);
+	} else
+		rcu_assign_pointer(chain->stats, newstats);
+
+	return 0;
+}
+
 static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 			      const struct nlmsghdr *nlh,
 			      const struct nlattr * const nla[])
@@ -626,7 +729,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain;
-	struct nft_base_chain *basechain;
+	struct nft_base_chain *basechain = NULL;
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
 	int family = nfmsg->nfgen_family;
 	u64 handle = 0;
@@ -673,6 +776,26 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		    !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
 			return -EEXIST;
 
+		if (nla[NFTA_CHAIN_POLICY]) {
+			if (!(chain->flags & NFT_BASE_CHAIN))
+				return -EOPNOTSUPP;
+
+			err = nf_tables_chain_policy(nft_base_chain(chain),
+						     nla[NFTA_CHAIN_POLICY]);
+			if (err < 0)
+				return err;
+		}
+
+		if (nla[NFTA_CHAIN_COUNTERS]) {
+			if (!(chain->flags & NFT_BASE_CHAIN))
+				return -EOPNOTSUPP;
+
+			err = nf_tables_counters(nft_base_chain(chain),
+						 nla[NFTA_CHAIN_COUNTERS]);
+			if (err < 0)
+				return err;
+		}
+
 		if (nla[NFTA_CHAIN_HANDLE] && name)
 			nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
 
@@ -727,6 +850,36 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 			ops->hook = afi->hooks[ops->hooknum];
 
 		chain->flags |= NFT_BASE_CHAIN;
+
+		if (nla[NFTA_CHAIN_POLICY]) {
+			err = nf_tables_chain_policy(basechain,
+						     nla[NFTA_CHAIN_POLICY]);
+			if (err < 0) {
+				free_percpu(basechain->stats);
+				kfree(basechain);
+				return err;
+			}
+		} else
+			basechain->policy = NF_ACCEPT;
+
+		if (nla[NFTA_CHAIN_COUNTERS]) {
+			err = nf_tables_counters(basechain,
+						 nla[NFTA_CHAIN_COUNTERS]);
+			if (err < 0) {
+				free_percpu(basechain->stats);
+				kfree(basechain);
+				return err;
+			}
+		} else {
+			struct nft_stats __percpu *newstats;
+
+			newstats = alloc_percpu(struct nft_stats);
+			if (newstats == NULL)
+				return -ENOMEM;
+
+			rcu_assign_pointer(nft_base_chain(chain)->stats,
+					   newstats);
+		}
 	} else {
 		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
 		if (chain == NULL)
@@ -739,6 +892,15 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	list_add_tail(&chain->list, &table->chains);
 	table->use++;
+
+	if (chain->flags & NFT_BASE_CHAIN) {
+		err = nf_register_hook(&nft_base_chain(chain)->ops);
+		if (err < 0) {
+			free_percpu(basechain->stats);
+			kfree(basechain);
+			return err;
+		}
+	}
 notify:
 	nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_NEWCHAIN,
 			       family);
@@ -751,9 +913,10 @@ static void nf_tables_rcu_chain_destroy(struct rcu_head *head)
 
 	BUG_ON(chain->use > 0);
 
-	if (chain->flags & NFT_BASE_CHAIN)
+	if (chain->flags & NFT_BASE_CHAIN) {
+		free_percpu(nft_base_chain(chain)->stats);
 		kfree(nft_base_chain(chain));
-	else
+	} else
 		kfree(chain);
 }
 
@@ -801,13 +964,15 @@ static void nft_ctx_init(struct nft_ctx *ctx,
 			 const struct nlmsghdr *nlh,
 			 const struct nft_af_info *afi,
 			 const struct nft_table *table,
-			 const struct nft_chain *chain)
+			 const struct nft_chain *chain,
+			 const struct nlattr * const *nla)
 {
 	ctx->skb   = skb;
 	ctx->nlh   = nlh;
 	ctx->afi   = afi;
 	ctx->table = table;
 	ctx->chain = chain;
+	ctx->nla   = nla;
 }
 
 /*
@@ -910,7 +1075,8 @@ struct nft_expr_info {
 	struct nlattr			*tb[NFT_EXPR_MAXATTR + 1];
 };
 
-static int nf_tables_expr_parse(const struct nlattr *nla,
+static int nf_tables_expr_parse(const struct nft_ctx *ctx,
+				const struct nlattr *nla,
 				struct nft_expr_info *info)
 {
 	const struct nft_expr_type *type;
@@ -935,7 +1101,8 @@ static int nf_tables_expr_parse(const struct nlattr *nla,
 		memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
 
 	if (type->select_ops != NULL) {
-		ops = type->select_ops((const struct nlattr * const *)info->tb);
+		ops = type->select_ops(ctx,
+				       (const struct nlattr * const *)info->tb);
 		if (IS_ERR(ops)) {
 			err = PTR_ERR(ops);
 			goto err1;
@@ -1012,6 +1179,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
 				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
 	[NFTA_RULE_HANDLE]	= { .type = NLA_U64 },
 	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED },
+	[NFTA_RULE_COMPAT]	= { .type = NLA_NESTED },
 };
 
 static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
@@ -1269,6 +1437,8 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		handle = nf_tables_alloc_handle(table);
 	}
 
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
 	n = 0;
 	size = 0;
 	if (nla[NFTA_RULE_EXPRESSIONS]) {
@@ -1278,7 +1448,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 				goto err1;
 			if (n == NFT_RULE_MAXEXPRS)
 				goto err1;
-			err = nf_tables_expr_parse(tmp, &info[n]);
+			err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
 			if (err < 0)
 				goto err1;
 			size += info[n].ops->size;
@@ -1294,7 +1464,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	rule->handle = handle;
 	rule->dlen   = size;
 
-	nft_ctx_init(&ctx, skb, nlh, afi, table, chain);
 	expr = nft_expr_first(rule);
 	for (i = 0; i < n; i++) {
 		err = nf_tables_newexpr(&ctx, &info[i], expr);
@@ -1304,13 +1473,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		expr = nft_expr_next(expr);
 	}
 
-	/* Register hook when first rule is inserted into a base chain */
-	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN) {
-		err = nf_register_hook(&nft_base_chain(chain)->ops);
-		if (err < 0)
-			goto err2;
-	}
-
 	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
 		list_replace_rcu(&old_rule->list, &rule->list);
 		nf_tables_rule_destroy(old_rule);
@@ -1379,10 +1541,6 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 		}
 	}
 
-	/* Unregister hook when last rule from base chain is deleted */
-	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN)
-		nf_unregister_hook(&nft_base_chain(chain)->ops);
-
 	return 0;
 }
 
@@ -1470,7 +1628,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
 			return PTR_ERR(table);
 	}
 
-	nft_ctx_init(ctx, skb, nlh, afi, table, NULL);
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
 	return 0;
 }
 
@@ -1799,7 +1957,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL);
+	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
 
 	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
 	if (IS_ERR(set)) {
@@ -1987,7 +2145,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(ctx, skb, nlh, afi, table, NULL);
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
 	return 0;
 }
 
@@ -2435,23 +2593,27 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 {
 	const struct nft_rule *rule;
 	const struct nft_expr *expr, *last;
-	const struct nft_data *data;
 	const struct nft_set *set;
 	struct nft_set_binding *binding;
 	struct nft_set_iter iter;
-	int err;
 
 	if (ctx->chain == chain)
 		return -ELOOP;
 
 	list_for_each_entry(rule, &chain->rules, list) {
 		nft_rule_for_each_expr(expr, last, rule) {
-			if (!expr->ops->get_verdict)
+			const struct nft_data *data = NULL;
+			int err;
+
+			if (!expr->ops->validate)
 				continue;
 
-			data = expr->ops->get_verdict(expr);
+			err = expr->ops->validate(ctx, expr, &data);
+			if (err < 0)
+				return err;
+
 			if (data == NULL)
-				break;
+				continue;
 
 			switch (data->verdict) {
 			case NFT_JUMP:
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 9aede59ed2d7..e51a45c12128 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -60,27 +60,34 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
 	return true;
 }
 
-unsigned int nft_do_chain(const struct nf_hook_ops *ops,
-			  struct sk_buff *skb,
-			  const struct net_device *in,
-			  const struct net_device *out,
-			  int (*okfn)(struct sk_buff *))
+struct nft_jumpstack {
+	const struct nft_chain	*chain;
+	const struct nft_rule	*rule;
+};
+
+static inline void
+nft_chain_stats(const struct nft_chain *this, const struct nft_pktinfo *pkt,
+		struct nft_jumpstack *jumpstack, unsigned int stackptr)
+{
+	struct nft_stats __percpu *stats;
+	const struct nft_chain *chain = stackptr ? jumpstack[0].chain : this;
+
+	rcu_read_lock_bh();
+	stats = rcu_dereference(nft_base_chain(chain)->stats);
+	__this_cpu_inc(stats->pkts);
+	__this_cpu_add(stats->bytes, pkt->skb->len);
+	rcu_read_unlock_bh();
+}
+
+unsigned int
+nft_do_chain_pktinfo(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
 {
 	const struct nft_chain *chain = ops->priv;
 	const struct nft_rule *rule;
 	const struct nft_expr *expr, *last;
 	struct nft_data data[NFT_REG_MAX + 1];
-	const struct nft_pktinfo pkt = {
-		.skb		= skb,
-		.in		= in,
-		.out		= out,
-		.hooknum	= ops->hooknum,
-	};
 	unsigned int stackptr = 0;
-	struct {
-		const struct nft_chain	*chain;
-		const struct nft_rule	*rule;
-	} jumpstack[NFT_JUMP_STACK_SIZE];
+	struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
 
 do_chain:
 	rule = list_entry(&chain->rules, struct nft_rule, list);
@@ -91,8 +98,8 @@ next_rule:
 			if (expr->ops == &nft_cmp_fast_ops)
 				nft_cmp_fast_eval(expr, data);
 			else if (expr->ops != &nft_payload_fast_ops ||
-				 !nft_payload_fast_eval(expr, data, &pkt))
-				expr->ops->eval(expr, data, &pkt);
+				 !nft_payload_fast_eval(expr, data, pkt))
+				expr->ops->eval(expr, data, pkt);
 
 			if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
 				break;
@@ -135,10 +142,11 @@ next_rule:
 		rule  = jumpstack[stackptr].rule;
 		goto next_rule;
 	}
+	nft_chain_stats(chain, pkt, jumpstack, stackptr);
 
-	return NF_ACCEPT;
+	return nft_base_chain(chain)->policy;
 }
-EXPORT_SYMBOL_GPL(nft_do_chain);
+EXPORT_SYMBOL_GPL(nft_do_chain_pktinfo);
 
 int __init nf_tables_core_module_init(void)
 {
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 37134f3e84fb..954925db414d 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -162,7 +162,8 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
 	.dump		= nft_cmp_fast_dump,
 };
 
-static const struct nft_expr_ops *nft_cmp_select_ops(const struct nlattr * const tb[])
+static const struct nft_expr_ops *
+nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 {
 	struct nft_data_desc desc;
 	struct nft_data data;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
new file mode 100644
index 000000000000..4811f762e060
--- /dev/null
+++ b/net/netfilter/nft_compat.c
@@ -0,0 +1,768 @@
+/*
+ * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This software has been sponsored by Sophos Astaro <http://www.sophos.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_tables_compat.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <asm/uaccess.h> /* for set_fs */
+#include <net/netfilter/nf_tables.h>
+
+union nft_entry {
+	struct ipt_entry e4;
+	struct ip6t_entry e6;
+};
+
+static inline void
+nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info)
+{
+	par->target	= xt;
+	par->targinfo	= xt_info;
+	par->hotdrop	= false;
+}
+
+static void nft_target_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_target *target = expr->ops->data;
+	struct sk_buff *skb = pkt->skb;
+	int ret;
+
+	nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+
+	ret = target->target(skb, &pkt->xt);
+
+	if (pkt->xt.hotdrop)
+		ret = NF_DROP;
+
+	switch(ret) {
+	case XT_CONTINUE:
+		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+		break;
+	default:
+		data[NFT_REG_VERDICT].verdict = ret;
+		break;
+	}
+	return;
+}
+
+static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
+	[NFTA_TARGET_NAME]	= { .type = NLA_NUL_STRING },
+	[NFTA_TARGET_REV]	= { .type = NLA_U32 },
+	[NFTA_TARGET_INFO]	= { .type = NLA_BINARY },
+};
+
+static void
+nft_target_set_tgchk_param(struct xt_tgchk_param *par,
+			   const struct nft_ctx *ctx,
+			   struct xt_target *target, void *info,
+			   union nft_entry *entry, u8 proto, bool inv)
+{
+	par->net	= &init_net;
+	par->table	= ctx->table->name;
+	switch (ctx->afi->family) {
+	case AF_INET:
+		entry->e4.ip.proto = proto;
+		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+		break;
+	case AF_INET6:
+		entry->e6.ipv6.proto = proto;
+		entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+		break;
+	}
+	par->entryinfo	= entry;
+	par->target	= target;
+	par->targinfo	= info;
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		par->hook_mask = 1 << ops->hooknum;
+	}
+	par->family	= ctx->afi->family;
+}
+
+static void target_compat_from_user(struct xt_target *t, void *in, void *out)
+{
+#ifdef CONFIG_COMPAT
+	if (t->compat_from_user) {
+		int pad;
+
+		t->compat_from_user(out, in);
+		pad = XT_ALIGN(t->targetsize) - t->targetsize;
+		if (pad > 0)
+			memset(out + t->targetsize, 0, pad);
+	} else
+#endif
+		memcpy(out, in, XT_ALIGN(t->targetsize));
+}
+
+static inline int nft_compat_target_offset(struct xt_target *target)
+{
+#ifdef CONFIG_COMPAT
+	return xt_compat_target_offset(target);
+#else
+	return 0;
+#endif
+}
+
+static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = {
+	[NFTA_RULE_COMPAT_PROTO]	= { .type = NLA_U32 },
+	[NFTA_RULE_COMPAT_FLAGS]	= { .type = NLA_U32 },
+};
+
+static u8 nft_parse_compat(const struct nlattr *attr, bool *inv)
+{
+	struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
+	u32 flags;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr,
+			       nft_rule_compat_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS])
+		return -EINVAL;
+
+	flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS]));
+	if (flags & ~NFT_RULE_COMPAT_F_MASK)
+		return -EINVAL;
+	if (flags & NFT_RULE_COMPAT_F_INV)
+		*inv = true;
+
+	return ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+}
+
+static int
+nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		const struct nlattr * const tb[])
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_target *target = expr->ops->data;
+	struct xt_tgchk_param par;
+	size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
+	u8 proto = 0;
+	bool inv = false;
+	union nft_entry e = {};
+	int ret;
+
+	target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info);
+
+	if (ctx->nla[NFTA_RULE_COMPAT])
+		proto = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &inv);
+
+	nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+
+	ret = xt_check_target(&par, size, proto, inv);
+	if (ret < 0)
+		goto err;
+
+	/* The standard target cannot be used */
+	if (target->target == NULL) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	return 0;
+err:
+	module_put(target->me);
+	return ret;
+}
+
+static void
+nft_target_destroy(const struct nft_expr *expr)
+{
+	struct xt_target *target = expr->ops->data;
+
+	module_put(target->me);
+}
+
+static int
+target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in)
+{
+	int ret;
+
+#ifdef CONFIG_COMPAT
+	if (t->compat_to_user) {
+		mm_segment_t old_fs;
+		void *out;
+
+		out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC);
+		if (out == NULL)
+			return -ENOMEM;
+
+		/* We want to reuse existing compat_to_user */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		t->compat_to_user(out, in);
+		set_fs(old_fs);
+		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out);
+		kfree(out);
+	} else
+#endif
+		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), in);
+
+	return ret;
+}
+
+static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct xt_target *target = expr->ops->data;
+	void *info = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) ||
+	    nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) ||
+	    target_dump_info(skb, target, info))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int nft_target_validate(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nft_data **data)
+{
+	struct xt_target *target = expr->ops->data;
+	unsigned int hook_mask = 0;
+
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		hook_mask = 1 << ops->hooknum;
+		if (hook_mask & target->hooks)
+			return 0;
+
+		/* This target is being called from an invalid chain */
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void nft_match_eval(const struct nft_expr *expr,
+			   struct nft_data data[NFT_REG_MAX + 1],
+			   const struct nft_pktinfo *pkt)
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_match *match = expr->ops->data;
+	struct sk_buff *skb = pkt->skb;
+	bool ret;
+
+	nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info);
+
+	ret = match->match(skb, (struct xt_action_param *)&pkt->xt);
+
+	if (pkt->xt.hotdrop) {
+		data[NFT_REG_VERDICT].verdict = NF_DROP;
+		return;
+	}
+
+	switch(ret) {
+	case true:
+		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+		break;
+	case false:
+		data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+		break;
+	}
+}
+
+static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
+	[NFTA_MATCH_NAME]	= { .type = NLA_NUL_STRING },
+	[NFTA_MATCH_REV]	= { .type = NLA_U32 },
+	[NFTA_MATCH_INFO]	= { .type = NLA_BINARY },
+};
+
+/* struct xt_mtchk_param and xt_tgchk_param look very similar */
+static void
+nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
+			  struct xt_match *match, void *info,
+			  union nft_entry *entry, u8 proto, bool inv)
+{
+	par->net	= &init_net;
+	par->table	= ctx->table->name;
+	switch (ctx->afi->family) {
+	case AF_INET:
+		entry->e4.ip.proto = proto;
+		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+		break;
+	case AF_INET6:
+		entry->e6.ipv6.proto = proto;
+		entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+		break;
+	}
+	par->entryinfo	= entry;
+	par->match	= match;
+	par->matchinfo	= info;
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		par->hook_mask = 1 << ops->hooknum;
+	}
+	par->family	= ctx->afi->family;
+}
+
+static void match_compat_from_user(struct xt_match *m, void *in, void *out)
+{
+#ifdef CONFIG_COMPAT
+	if (m->compat_from_user) {
+		int pad;
+
+		m->compat_from_user(out, in);
+		pad = XT_ALIGN(m->matchsize) - m->matchsize;
+		if (pad > 0)
+			memset(out + m->matchsize, 0, pad);
+	} else
+#endif
+		memcpy(out, in, XT_ALIGN(m->matchsize));
+}
+
+static int
+nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		const struct nlattr * const tb[])
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_match *match = expr->ops->data;
+	struct xt_mtchk_param par;
+	size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
+	u8 proto = 0;
+	bool inv = false;
+	union nft_entry e = {};
+	int ret;
+
+	match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info);
+
+	if (ctx->nla[NFTA_RULE_COMPAT])
+		proto = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &inv);
+
+	nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+
+	ret = xt_check_match(&par, size, proto, inv);
+	if (ret < 0)
+		goto err;
+
+	return 0;
+err:
+	module_put(match->me);
+	return ret;
+}
+
+static void
+nft_match_destroy(const struct nft_expr *expr)
+{
+	struct xt_match *match = expr->ops->data;
+
+	module_put(match->me);
+}
+
+static int
+match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in)
+{
+	int ret;
+
+#ifdef CONFIG_COMPAT
+	if (m->compat_to_user) {
+		mm_segment_t old_fs;
+		void *out;
+
+		out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC);
+		if (out == NULL)
+			return -ENOMEM;
+
+		/* We want to reuse existing compat_to_user */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		m->compat_to_user(out, in);
+		set_fs(old_fs);
+		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out);
+		kfree(out);
+	} else
+#endif
+		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in);
+
+	return ret;
+}
+
+static inline int nft_compat_match_offset(struct xt_match *match)
+{
+#ifdef CONFIG_COMPAT
+	return xt_compat_match_offset(match);
+#else
+	return 0;
+#endif
+}
+
+static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_match *match = expr->ops->data;
+
+	if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) ||
+	    nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) ||
+	    match_dump_info(skb, match, info))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int nft_match_validate(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nft_data **data)
+{
+	struct xt_match *match = expr->ops->data;
+	unsigned int hook_mask = 0;
+
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		hook_mask = 1 << ops->hooknum;
+		if (hook_mask & match->hooks)
+			return 0;
+
+		/* This match is being called from an invalid chain */
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int
+nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+		      int event, u16 family, const char *name,
+		      int rev, int target)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned int flags = portid ? NLM_F_MULTI : 0;
+
+	event |= NFNL_SUBSYS_NFT_COMPAT << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto nlmsg_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = family;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	if (nla_put_string(skb, NFTA_COMPAT_NAME, name) ||
+	    nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) ||
+	    nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+static int
+nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb,
+		const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	int ret = 0, target;
+	struct nfgenmsg *nfmsg;
+	const char *fmt;
+	const char *name;
+	u32 rev;
+	struct sk_buff *skb2;
+
+	if (tb[NFTA_COMPAT_NAME] == NULL ||
+	    tb[NFTA_COMPAT_REV] == NULL ||
+	    tb[NFTA_COMPAT_TYPE] == NULL)
+		return -EINVAL;
+
+	name = nla_data(tb[NFTA_COMPAT_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
+	target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
+
+	nfmsg = nlmsg_data(nlh);
+
+	switch(nfmsg->nfgen_family) {
+	case AF_INET:
+		fmt = "ipt_%s";
+		break;
+	case AF_INET6:
+		fmt = "ip6t_%s";
+		break;
+	default:
+		pr_err("nft_compat: unsupported protocol %d\n",
+			nfmsg->nfgen_family);
+		return -EINVAL;
+	}
+
+	try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
+						 rev, target, &ret),
+						 fmt, name);
+
+	if (ret < 0)
+		return ret;
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		return -ENOMEM;
+
+	/* include the best revision for this extension in the message */
+	if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
+				  nlh->nlmsg_seq,
+				  NFNL_MSG_TYPE(nlh->nlmsg_type),
+				  NFNL_MSG_COMPAT_GET,
+				  nfmsg->nfgen_family,
+				  name, ret, target) <= 0) {
+		kfree_skb(skb2);
+		return -ENOSPC;
+	}
+
+	ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+				MSG_DONTWAIT);
+	if (ret > 0)
+		ret = 0;
+
+	return ret == -EAGAIN ? -ENOBUFS : ret;
+}
+
+static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
+	[NFTA_COMPAT_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = NFT_COMPAT_NAME_MAX-1 },
+	[NFTA_COMPAT_REV]	= { .type = NLA_U32 },
+	[NFTA_COMPAT_TYPE]	= { .type = NLA_U32 },
+};
+
+static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
+	[NFNL_MSG_COMPAT_GET]		= { .call = nfnl_compat_get,
+					    .attr_count = NFTA_COMPAT_MAX,
+					    .policy = nfnl_compat_policy_get },
+};
+
+static const struct nfnetlink_subsystem nfnl_compat_subsys = {
+	.name		= "nft-compat",
+	.subsys_id	= NFNL_SUBSYS_NFT_COMPAT,
+	.cb_count	= NFNL_MSG_COMPAT_MAX,
+	.cb		= nfnl_nft_compat_cb,
+};
+
+static LIST_HEAD(nft_match_list);
+
+struct nft_xt {
+	struct list_head	head;
+	struct nft_expr_ops	ops;
+};
+
+static struct nft_expr_type nft_match_type;
+
+static const struct nft_expr_ops *
+nft_match_select_ops(const struct nft_ctx *ctx,
+		     const struct nlattr * const tb[])
+{
+	struct nft_xt *nft_match;
+	struct xt_match *match;
+	char *mt_name;
+	__u32 rev, family;
+
+	if (tb[NFTA_MATCH_NAME] == NULL ||
+	    tb[NFTA_MATCH_REV] == NULL ||
+	    tb[NFTA_MATCH_INFO] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	mt_name = nla_data(tb[NFTA_MATCH_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
+	family = ctx->afi->family;
+
+	/* Re-use the existing match if it's already loaded. */
+	list_for_each_entry(nft_match, &nft_match_list, head) {
+		struct xt_match *match = nft_match->ops.data;
+
+		if (strcmp(match->name, mt_name) == 0 &&
+		    match->revision == rev && match->family == family)
+			return &nft_match->ops;
+	}
+
+	match = xt_request_find_match(family, mt_name, rev);
+	if (IS_ERR(match))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this match, allocate operations */
+	nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+	if (nft_match == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	nft_match->ops.type = &nft_match_type;
+	nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) +
+					    nft_compat_match_offset(match));
+	nft_match->ops.eval = nft_match_eval;
+	nft_match->ops.init = nft_match_init;
+	nft_match->ops.destroy = nft_match_destroy;
+	nft_match->ops.dump = nft_match_dump;
+	nft_match->ops.validate = nft_match_validate;
+	nft_match->ops.data = match;
+
+	list_add(&nft_match->head, &nft_match_list);
+
+	return &nft_match->ops;
+}
+
+static void nft_match_release(void)
+{
+	struct nft_xt *nft_match;
+
+	list_for_each_entry(nft_match, &nft_match_list, head)
+		kfree(nft_match);
+}
+
+static struct nft_expr_type nft_match_type __read_mostly = {
+	.name		= "match",
+	.select_ops	= nft_match_select_ops,
+	.policy		= nft_match_policy,
+	.maxattr	= NFTA_MATCH_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static LIST_HEAD(nft_target_list);
+
+static struct nft_expr_type nft_target_type;
+
+static const struct nft_expr_ops *
+nft_target_select_ops(const struct nft_ctx *ctx,
+		      const struct nlattr * const tb[])
+{
+	struct nft_xt *nft_target;
+	struct xt_target *target;
+	char *tg_name;
+	__u32 rev, family;
+
+	if (tb[NFTA_TARGET_NAME] == NULL ||
+	    tb[NFTA_TARGET_REV] == NULL ||
+	    tb[NFTA_TARGET_INFO] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	tg_name = nla_data(tb[NFTA_TARGET_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
+	family = ctx->afi->family;
+
+	/* Re-use the existing target if it's already loaded. */
+	list_for_each_entry(nft_target, &nft_match_list, head) {
+		struct xt_target *target = nft_target->ops.data;
+
+		if (strcmp(target->name, tg_name) == 0 &&
+		    target->revision == rev && target->family == family)
+			return &nft_target->ops;
+	}
+
+	target = xt_request_find_target(family, tg_name, rev);
+	if (IS_ERR(target))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this target, allocate operations */
+	nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+	if (nft_target == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	nft_target->ops.type = &nft_target_type;
+	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) +
+					     nft_compat_target_offset(target));
+	nft_target->ops.eval = nft_target_eval;
+	nft_target->ops.init = nft_target_init;
+	nft_target->ops.destroy = nft_target_destroy;
+	nft_target->ops.dump = nft_target_dump;
+	nft_target->ops.validate = nft_target_validate;
+	nft_target->ops.data = target;
+
+	list_add(&nft_target->head, &nft_target_list);
+
+	return &nft_target->ops;
+}
+
+static void nft_target_release(void)
+{
+	struct nft_xt *nft_target;
+
+	list_for_each_entry(nft_target, &nft_target_list, head)
+		kfree(nft_target);
+}
+
+static struct nft_expr_type nft_target_type __read_mostly = {
+	.name		= "target",
+	.select_ops	= nft_target_select_ops,
+	.policy		= nft_target_policy,
+	.maxattr	= NFTA_TARGET_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_compat_module_init(void)
+{
+	int ret;
+
+	ret = nft_register_expr(&nft_match_type);
+	if (ret < 0)
+		return ret;
+
+	ret = nft_register_expr(&nft_target_type);
+	if (ret < 0)
+		goto err_match;
+
+	ret = nfnetlink_subsys_register(&nfnl_compat_subsys);
+	if (ret < 0) {
+		pr_err("nft_compat: cannot register with nfnetlink.\n");
+		goto err_target;
+	}
+
+	pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n");
+
+	return ret;
+
+err_target:
+	nft_unregister_expr(&nft_target_type);
+err_match:
+	nft_unregister_expr(&nft_match_type);
+	return ret;
+}
+
+static void __exit nft_compat_module_exit(void)
+{
+	nfnetlink_subsys_unregister(&nfnl_compat_subsys);
+	nft_unregister_expr(&nft_target_type);
+	nft_unregister_expr(&nft_match_type);
+	nft_match_release();
+	nft_target_release();
+}
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
+
+module_init(nft_compat_module_init);
+module_exit(nft_compat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("match");
+MODULE_ALIAS_NFT_EXPR("target");
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 1bfeeaf865b6..f169501f1ad4 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -90,14 +90,16 @@ nla_put_failure:
 	return -1;
 }
 
-static const struct nft_data *nft_immediate_get_verdict(const struct nft_expr *expr)
+static int nft_immediate_validate(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr,
+				  const struct nft_data **data)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
 
 	if (priv->dreg == NFT_REG_VERDICT)
-		return &priv->data;
-	else
-		return NULL;
+		*data = &priv->data;
+
+	return 0;
 }
 
 static struct nft_expr_type nft_imm_type;
@@ -108,7 +110,7 @@ static const struct nft_expr_ops nft_imm_ops = {
 	.init		= nft_immediate_init,
 	.destroy	= nft_immediate_destroy,
 	.dump		= nft_immediate_dump,
-	.get_verdict	= nft_immediate_get_verdict,
+	.validate	= nft_immediate_validate,
 };
 
 static struct nft_expr_type nft_imm_type __read_mostly = {
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 7cf13f7e1e94..bc8bdb2c1ba7 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -107,7 +107,9 @@ const struct nft_expr_ops nft_payload_fast_ops = {
 	.dump		= nft_payload_dump,
 };
 
-static const struct nft_expr_ops *nft_payload_select_ops(const struct nlattr * const tb[])
+static const struct nft_expr_ops *
+nft_payload_select_ops(const struct nft_ctx *ctx,
+		       const struct nlattr * const tb[])
 {
 	enum nft_payload_bases base;
 	unsigned int offset, len;
-- 
cgit v1.2.3


From 9ddf63235749a9efa1fad2eeb74be2ee9b580f8d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 10 Oct 2013 13:26:33 +0200
Subject: netfilter: nf_tables: add support for dormant tables

This patch allows you to temporarily disable an entire table.
You can change the state of a dormant table via NFT_MSG_NEWTABLE
messages. Using this operation you can wake up a table, so their
chains are registered.

This provides atomicity at chain level. Thus, the rule-set of one
chain is applied at once, avoiding any possible intermediate state
in every chain. Still, the chains that belongs to a table are
registered consecutively. This also allows you to have inactive
tables in the kernel.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 11 ++++
 net/netfilter/nf_tables_api.c            | 97 +++++++++++++++++++++++++++++---
 2 files changed, 101 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 1563875e6942..a9c4bce1988f 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -96,14 +96,25 @@ enum nft_hook_attributes {
 };
 #define NFTA_HOOK_MAX		(__NFTA_HOOK_MAX - 1)
 
+/**
+ * enum nft_table_flags - nf_tables table flags
+ *
+ * @NFT_TABLE_F_DORMANT: this table is not active
+ */
+enum nft_table_flags {
+	NFT_TABLE_F_DORMANT	= 0x1,
+};
+
 /**
  * enum nft_table_attributes - nf_tables table netlink attributes
  *
  * @NFTA_TABLE_NAME: name of the table (NLA_STRING)
+ * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32)
  */
 enum nft_table_attributes {
 	NFTA_TABLE_UNSPEC,
 	NFTA_TABLE_NAME,
+	NFTA_TABLE_FLAGS,
 	__NFTA_TABLE_MAX
 };
 #define NFTA_TABLE_MAX		(__NFTA_TABLE_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 61e017b349cb..a4dd7ce5ec3e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -158,6 +158,7 @@ static int nf_tables_chain_type_lookup(const struct nft_af_info *afi,
 
 static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
 	[NFTA_TABLE_NAME]	= { .type = NLA_STRING },
+	[NFTA_TABLE_FLAGS]	= { .type = NLA_U32 },
 };
 
 static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq,
@@ -177,7 +178,8 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq,
 	nfmsg->version		= NFNETLINK_V0;
 	nfmsg->res_id		= 0;
 
-	if (nla_put_string(skb, NFTA_TABLE_NAME, table->name))
+	if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
+	    nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)))
 		goto nla_put_failure;
 
 	return nlmsg_end(skb, nlh);
@@ -301,6 +303,74 @@ err:
 	return err;
 }
 
+static int nf_tables_table_enable(struct nft_table *table)
+{
+	struct nft_chain *chain;
+	int err, i = 0;
+
+	list_for_each_entry(chain, &table->chains, list) {
+		err = nf_register_hook(&nft_base_chain(chain)->ops);
+		if (err < 0)
+			goto err;
+
+		i++;
+	}
+	return 0;
+err:
+	list_for_each_entry(chain, &table->chains, list) {
+		if (i-- <= 0)
+			break;
+
+		nf_unregister_hook(&nft_base_chain(chain)->ops);
+	}
+	return err;
+}
+
+static int nf_tables_table_disable(struct nft_table *table)
+{
+	struct nft_chain *chain;
+
+	list_for_each_entry(chain, &table->chains, list)
+		nf_unregister_hook(&nft_base_chain(chain)->ops);
+
+	return 0;
+}
+
+static int nf_tables_updtable(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[],
+			      struct nft_af_info *afi, struct nft_table *table)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	int family = nfmsg->nfgen_family, ret = 0;
+
+	if (nla[NFTA_TABLE_FLAGS]) {
+		__be32 flags;
+
+		flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
+		if (flags & ~NFT_TABLE_F_DORMANT)
+			return -EINVAL;
+
+		if ((flags & NFT_TABLE_F_DORMANT) &&
+		    !(table->flags & NFT_TABLE_F_DORMANT)) {
+			ret = nf_tables_table_disable(table);
+			if (ret >= 0)
+				table->flags |= NFT_TABLE_F_DORMANT;
+		} else if (!(flags & NFT_TABLE_F_DORMANT) &&
+			   table->flags & NFT_TABLE_F_DORMANT) {
+			ret = nf_tables_table_enable(table);
+			if (ret >= 0)
+				table->flags &= ~NFT_TABLE_F_DORMANT;
+		}
+		if (ret < 0)
+			goto err;
+	}
+
+	nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family);
+err:
+	return ret;
+}
+
 static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 			      const struct nlmsghdr *nlh,
 			      const struct nlattr * const nla[])
@@ -328,7 +398,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 			return -EEXIST;
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
-		return 0;
+		return nf_tables_updtable(nlsk, skb, nlh, nla, afi, table);
 	}
 
 	table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL);
@@ -339,6 +409,18 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 	INIT_LIST_HEAD(&table->chains);
 	INIT_LIST_HEAD(&table->sets);
 
+	if (nla[NFTA_TABLE_FLAGS]) {
+		__be32 flags;
+
+		flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
+		if (flags & ~NFT_TABLE_F_DORMANT) {
+			kfree(table);
+			return -EINVAL;
+		}
+
+		table->flags |= flags;
+	}
+
 	list_add_tail(&table->list, &afi->tables);
 	nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family);
 	return 0;
@@ -890,10 +972,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 	chain->handle = nf_tables_alloc_handle(table);
 	nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
 
-	list_add_tail(&chain->list, &table->chains);
-	table->use++;
-
-	if (chain->flags & NFT_BASE_CHAIN) {
+	if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+	    chain->flags & NFT_BASE_CHAIN) {
 		err = nf_register_hook(&nft_base_chain(chain)->ops);
 		if (err < 0) {
 			free_percpu(basechain->stats);
@@ -901,6 +981,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 			return err;
 		}
 	}
+	list_add_tail(&chain->list, &table->chains);
+	table->use++;
 notify:
 	nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_NEWCHAIN,
 			       family);
@@ -948,7 +1030,8 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 	list_del(&chain->list);
 	table->use--;
 
-	if (chain->flags & NFT_BASE_CHAIN)
+	if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+	    chain->flags & NFT_BASE_CHAIN)
 		nf_unregister_hook(&nft_base_chain(chain)->ops);
 
 	nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_DELCHAIN,
-- 
cgit v1.2.3


From eb31628e37a0a4e01fffd79dcc7f815d2357f53a Mon Sep 17 00:00:00 2001
From: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Date: Thu, 10 Oct 2013 13:39:19 +0200
Subject: netfilter: nf_tables: Add support for IPv6 NAT

This patch generalizes the NAT expression to support both IPv4 and IPv6
using the existing IPv4/IPv6 NAT infrastructure. This also adds the
NAT chain type for IPv6.

This patch collapses the following patches that were posted to the
netfilter-devel mailing list, from Tomasz:

* nf_tables: Change NFTA_NAT_ attributes to better semantic significance
* nf_tables: Split IPv4 NAT into NAT expression and IPv4 NAT chain
* nf_tables: Add support for IPv6 NAT expression
* nf_tables: Add support for IPv6 NAT chain
* nf_tables: Fix up build issue on IPv6 NAT support

And, from Pablo Neira Ayuso:

* fix missing dependencies in nft_chain_nat

Signed-off-by: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  18 +--
 net/ipv4/netfilter/Kconfig               |   1 +
 net/ipv4/netfilter/nft_chain_nat_ipv4.c  | 156 +---------------------
 net/ipv6/netfilter/Kconfig               |   5 +
 net/ipv6/netfilter/Makefile              |   1 +
 net/ipv6/netfilter/nft_chain_nat_ipv6.c  | 211 +++++++++++++++++++++++++++++
 net/netfilter/Kconfig                    |   6 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_nat.c                  | 220 +++++++++++++++++++++++++++++++
 9 files changed, 457 insertions(+), 162 deletions(-)
 create mode 100644 net/ipv6/netfilter/nft_chain_nat_ipv6.c
 create mode 100644 net/netfilter/nft_nat.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a9c4bce1988f..7d4a1992f89c 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -695,18 +695,20 @@ enum nft_nat_types {
  * enum nft_nat_attributes - nf_tables nat expression netlink attributes
  *
  * @NFTA_NAT_TYPE: NAT type (NLA_U32: nft_nat_types)
- * @NFTA_NAT_ADDR_MIN: source register of address range start (NLA_U32: nft_registers)
- * @NFTA_NAT_ADDR_MAX: source register of address range end (NLA_U32: nft_registers)
- * @NFTA_NAT_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
- * @NFTA_NAT_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
+ * @NFTA_NAT_FAMILY: NAT family (NLA_U32)
+ * @NFTA_NAT_REG_ADDR_MIN: source register of address range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_REG_ADDR_MAX: source register of address range end (NLA_U32: nft_registers)
+ * @NFTA_NAT_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
  */
 enum nft_nat_attributes {
 	NFTA_NAT_UNSPEC,
 	NFTA_NAT_TYPE,
-	NFTA_NAT_ADDR_MIN,
-	NFTA_NAT_ADDR_MAX,
-	NFTA_NAT_PROTO_MIN,
-	NFTA_NAT_PROTO_MAX,
+	NFTA_NAT_FAMILY,
+	NFTA_NAT_REG_ADDR_MIN,
+	NFTA_NAT_REG_ADDR_MAX,
+	NFTA_NAT_REG_PROTO_MIN,
+	NFTA_NAT_REG_PROTO_MAX,
 	__NFTA_NAT_MAX
 };
 #define NFTA_NAT_MAX		(__NFTA_NAT_MAX - 1)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ae65fe98bfbe..1f37ef67f1ac 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -50,6 +50,7 @@ config NFT_CHAIN_ROUTE_IPV4
 
 config NFT_CHAIN_NAT_IPV4
 	depends on NF_TABLES_IPV4
+	depends on NF_NAT_IPV4 && NFT_NAT
 	tristate "IPv4 nf_tables nat chain support"
 
 config IP_NF_IPTABLES
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index e09c201adf84..cf2c792cd971 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,10 +15,8 @@
 #include <linux/list.h>
 #include <linux/skbuff.h>
 #include <linux/ip.h>
-#include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_nat.h>
@@ -27,147 +26,6 @@
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/ip.h>
 
-struct nft_nat {
-	enum nft_registers	sreg_addr_min:8;
-	enum nft_registers	sreg_addr_max:8;
-	enum nft_registers	sreg_proto_min:8;
-	enum nft_registers	sreg_proto_max:8;
-	enum nf_nat_manip_type	type;
-};
-
-static void nft_nat_eval(const struct nft_expr *expr,
-			 struct nft_data data[NFT_REG_MAX + 1],
-			 const struct nft_pktinfo *pkt)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
-	struct nf_nat_range range;
-
-	memset(&range, 0, sizeof(range));
-	if (priv->sreg_addr_min) {
-		range.min_addr.ip = data[priv->sreg_addr_min].data[0];
-		range.max_addr.ip = data[priv->sreg_addr_max].data[0];
-		range.flags |= NF_NAT_RANGE_MAP_IPS;
-	}
-
-	if (priv->sreg_proto_min) {
-		range.min_proto.all = data[priv->sreg_proto_min].data[0];
-		range.max_proto.all = data[priv->sreg_proto_max].data[0];
-		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-	}
-
-	data[NFT_REG_VERDICT].verdict =
-		nf_nat_setup_info(ct, &range, priv->type);
-}
-
-static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
-	[NFTA_NAT_ADDR_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_ADDR_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_TYPE]		= { .type = NLA_U32 },
-};
-
-static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
-			const struct nlattr * const tb[])
-{
-	struct nft_nat *priv = nft_expr_priv(expr);
-	int err;
-
-	if (tb[NFTA_NAT_TYPE] == NULL)
-		return -EINVAL;
-
-	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
-	case NFT_NAT_SNAT:
-		priv->type = NF_NAT_MANIP_SRC;
-		break;
-	case NFT_NAT_DNAT:
-		priv->type = NF_NAT_MANIP_DST;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MIN]) {
-		priv->sreg_addr_min = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MIN]));
-		err = nft_validate_input_register(priv->sreg_addr_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MAX]) {
-		priv->sreg_addr_max = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MAX]));
-		err = nft_validate_input_register(priv->sreg_addr_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_addr_max = priv->sreg_addr_min;
-
-	if (tb[NFTA_NAT_PROTO_MIN]) {
-		priv->sreg_proto_min = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MIN]));
-		err = nft_validate_input_register(priv->sreg_proto_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_PROTO_MAX]) {
-		priv->sreg_proto_max = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MAX]));
-		err = nft_validate_input_register(priv->sreg_proto_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_proto_max = priv->sreg_proto_min;
-
-	return 0;
-}
-
-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-
-	switch (priv->type) {
-	case NF_NAT_MANIP_SRC:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
-			goto nla_put_failure;
-		break;
-	case NF_NAT_MANIP_DST:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
-			goto nla_put_failure;
-		break;
-	}
-
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MIN, htonl(priv->sreg_addr_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MAX, htonl(priv->sreg_addr_max)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MIN, htonl(priv->sreg_proto_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MAX, htonl(priv->sreg_proto_max)))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
-static struct nft_expr_type nft_nat_type;
-static const struct nft_expr_ops nft_nat_ops = {
-	.type		= &nft_nat_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
-	.eval		= nft_nat_eval,
-	.init		= nft_nat_init,
-	.dump		= nft_nat_dump,
-};
-
-static struct nft_expr_type nft_nat_type __read_mostly = {
-	.name		= "nat",
-	.ops		= &nft_nat_ops,
-	.policy		= nft_nat_policy,
-	.maxattr	= NFTA_NAT_MAX,
-	.owner		= THIS_MODULE,
-};
-
 /*
  * NAT chains
  */
@@ -306,7 +164,7 @@ static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-struct nf_chain_type nft_chain_nat_ipv4 = {
+static struct nf_chain_type nft_chain_nat_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "nat",
 	.type		= NFT_CHAIN_T_NAT,
@@ -331,20 +189,11 @@ static int __init nft_chain_nat_init(void)
 	if (err < 0)
 		return err;
 
-	err = nft_register_expr(&nft_nat_type);
-	if (err < 0)
-		goto err;
-
 	return 0;
-
-err:
-	nft_unregister_chain_type(&nft_chain_nat_ipv4);
-	return err;
 }
 
 static void __exit nft_chain_nat_exit(void)
 {
-	nft_unregister_expr(&nft_nat_type);
 	nft_unregister_chain_type(&nft_chain_nat_ipv4);
 }
 
@@ -354,4 +203,3 @@ module_exit(nft_chain_nat_exit);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
-MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 23833064b7b5..7702f9e90a04 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -33,6 +33,11 @@ config NFT_CHAIN_ROUTE_IPV6
 	depends on NF_TABLES_IPV6
 	tristate "IPv6 nf_tables route chain support"
 
+config NFT_CHAIN_NAT_IPV6
+	depends on NF_TABLES_IPV6
+	depends on NF_NAT_IPV6 && NFT_NAT
+	tristate "IPv6 nf_tables nat chain support"
+
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
 	depends on INET && IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index be4913aa524d..d1b4928f34f7 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 # nf_tables
 obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
 obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o
 
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
new file mode 100644
index 000000000000..e86dcd70dc76
--- /dev/null
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ipv6.h>
+
+/*
+ * IPv6 NAT chains
+ */
+
+static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops,
+			      struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_nat *nat;
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	__be16 frag_off;
+	int hdrlen;
+	u8 nexthdr;
+	struct nft_pktinfo pkt;
+	unsigned int ret;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	nat = nfct_nat(ct);
+	if (nat == NULL) {
+		/* Conntrack module was loaded late, can't add extension. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL)
+			return NF_ACCEPT;
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED + IP_CT_IS_REPLY:
+		nexthdr = ipv6_hdr(skb)->nexthdr;
+		hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+					  &nexthdr, &frag_off);
+
+		if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+			if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
+							   ops->hooknum,
+							   hdrlen))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall through */
+	case IP_CT_NEW:
+		if (nf_nat_initialized(ct, maniptype))
+			break;
+
+		nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out);
+
+		ret = nft_do_chain_pktinfo(&pkt, ops);
+		if (ret != NF_ACCEPT)
+			return ret;
+		if (!nf_nat_initialized(ct, maniptype)) {
+			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+			if (ret != NF_ACCEPT)
+				return ret;
+		}
+	default:
+		break;
+	}
+
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+}
+
+static unsigned int nf_nat_ipv6_prerouting(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	struct in6_addr daddr = ipv6_hdr(skb)->daddr;
+	unsigned int ret;
+
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
+		skb_dst_drop(skb);
+
+	return ret;
+}
+
+static unsigned int nf_nat_ipv6_postrouting(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo __maybe_unused;
+	const struct nf_conn *ct __maybe_unused;
+	unsigned int ret;
+
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+				      &ct->tuplehash[!dir].tuple.dst.u3) ||
+		    (ct->tuplehash[dir].tuple.src.u.all !=
+		     ct->tuplehash[!dir].tuple.dst.u.all))
+			if (nf_xfrm_me_harder(skb, AF_INET6) < 0)
+				ret = NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int ret;
+
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
+				      &ct->tuplehash[!dir].tuple.src.u3)) {
+			if (ip6_route_me_harder(skb))
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+			 ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (nf_xfrm_me_harder(skb, AF_INET6))
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+static struct nf_chain_type nft_chain_nat_ipv6 = {
+	.family		= NFPROTO_IPV6,
+	.name		= "nat",
+	.type		= NFT_CHAIN_T_NAT,
+	.hook_mask	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.fn		= {
+		[NF_INET_PRE_ROUTING]	= nf_nat_ipv6_prerouting,
+		[NF_INET_POST_ROUTING]	= nf_nat_ipv6_postrouting,
+		[NF_INET_LOCAL_OUT]	= nf_nat_ipv6_output,
+		[NF_INET_LOCAL_IN]	= nf_nat_ipv6_fn,
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init nft_chain_nat_ipv6_init(void)
+{
+	int err;
+
+	err = nft_register_chain_type(&nft_chain_nat_ipv6);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static void __exit nft_chain_nat_ipv6_exit(void)
+{
+	nft_unregister_chain_type(&nft_chain_nat_ipv6);
+}
+
+module_init(nft_chain_nat_ipv6_init);
+module_exit(nft_chain_nat_ipv6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 49e362707379..48acec17e27a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -450,6 +450,12 @@ config NFT_LIMIT
 	depends on NF_TABLES
 	tristate "Netfilter nf_tables limit module"
 
+config NFT_NAT
+	depends on NF_TABLES
+	depends on NF_CONNTRACK
+	depends on NF_NAT
+	tristate "Netfilter nf_tables nat module"
+
 config NFT_COMPAT
 	depends on NF_TABLES
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index a6781450b6fb..394483b2c193 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
+obj-$(CONFIG_NFT_NAT)		+= nft_nat.o
 #nf_tables-objs			+= nft_meta_target.o
 obj-$(CONFIG_NFT_RBTREE)	+= nft_rbtree.o
 obj-$(CONFIG_NFT_HASH)		+= nft_hash.o
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
new file mode 100644
index 000000000000..b0b87b2d2411
--- /dev/null
+++ b/net/netfilter/nft_nat.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/string.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+struct nft_nat {
+	enum nft_registers      sreg_addr_min:8;
+	enum nft_registers      sreg_addr_max:8;
+	enum nft_registers      sreg_proto_min:8;
+	enum nft_registers      sreg_proto_max:8;
+	int                     family;
+	enum nf_nat_manip_type  type;
+};
+
+static void nft_nat_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+	struct nf_nat_range range;
+
+	memset(&range, 0, sizeof(range));
+	if (priv->sreg_addr_min) {
+		if (priv->family == AF_INET) {
+			range.min_addr.ip = data[priv->sreg_addr_min].data[0];
+			range.max_addr.ip = data[priv->sreg_addr_max].data[0];
+
+		} else {
+			memcpy(range.min_addr.ip6,
+			       data[priv->sreg_addr_min].data,
+			       sizeof(struct nft_data));
+			memcpy(range.max_addr.ip6,
+			       data[priv->sreg_addr_max].data,
+			       sizeof(struct nft_data));
+		}
+		range.flags |= NF_NAT_RANGE_MAP_IPS;
+	}
+
+	if (priv->sreg_proto_min) {
+		range.min_proto.all = data[priv->sreg_proto_min].data[0];
+		range.max_proto.all = data[priv->sreg_proto_max].data[0];
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+	}
+
+	data[NFT_REG_VERDICT].verdict =
+		nf_nat_setup_info(ct, &range, priv->type);
+}
+
+static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
+	[NFTA_NAT_TYPE]		 = { .type = NLA_U32 },
+	[NFTA_NAT_FAMILY]	 = { .type = NLA_U32 },
+	[NFTA_NAT_REG_ADDR_MIN]	 = { .type = NLA_U32 },
+	[NFTA_NAT_REG_ADDR_MAX]	 = { .type = NLA_U32 },
+	[NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 },
+	[NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 },
+};
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_nat *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_NAT_TYPE] == NULL)
+		return -EINVAL;
+
+	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
+	case NFT_NAT_SNAT:
+		priv->type = NF_NAT_MANIP_SRC;
+		break;
+	case NFT_NAT_DNAT:
+		priv->type = NF_NAT_MANIP_DST;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (tb[NFTA_NAT_FAMILY] == NULL)
+		return -EINVAL;
+
+	priv->family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
+	if (priv->family != AF_INET && priv->family != AF_INET6)
+		return -EINVAL;
+
+	if (tb[NFTA_NAT_REG_ADDR_MIN]) {
+		priv->sreg_addr_min = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_ADDR_MIN]));
+		err = nft_validate_input_register(priv->sreg_addr_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_REG_ADDR_MAX]) {
+		priv->sreg_addr_max = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_ADDR_MAX]));
+		err = nft_validate_input_register(priv->sreg_addr_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_addr_max = priv->sreg_addr_min;
+
+	if (tb[NFTA_NAT_REG_PROTO_MIN]) {
+		priv->sreg_proto_min = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_PROTO_MIN]));
+		err = nft_validate_input_register(priv->sreg_proto_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_REG_PROTO_MAX]) {
+		priv->sreg_proto_max = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_PROTO_MAX]));
+		err = nft_validate_input_register(priv->sreg_proto_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_proto_max = priv->sreg_proto_min;
+
+	return 0;
+}
+
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+
+	switch (priv->type) {
+	case NF_NAT_MANIP_SRC:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
+			goto nla_put_failure;
+		break;
+	case NF_NAT_MANIP_DST:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
+			goto nla_put_failure;
+		break;
+	}
+
+	if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_ADDR_MIN, htonl(priv->sreg_addr_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_PROTO_MIN, htonl(priv->sreg_proto_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_PROTO_MAX, htonl(priv->sreg_proto_max)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_nat_type;
+static const struct nft_expr_ops nft_nat_ops = {
+	.type           = &nft_nat_type,
+	.size           = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+	.eval           = nft_nat_eval,
+	.init           = nft_nat_init,
+	.dump           = nft_nat_dump,
+};
+
+static struct nft_expr_type nft_nat_type __read_mostly = {
+	.name           = "nat",
+	.ops            = &nft_nat_ops,
+	.policy         = nft_nat_policy,
+	.maxattr        = NFTA_NAT_MAX,
+	.owner          = THIS_MODULE,
+};
+
+static int __init nft_nat_module_init(void)
+{
+	int err;
+
+	err = nft_register_expr(&nft_nat_type);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static void __exit nft_nat_module_exit(void)
+{
+	nft_unregister_expr(&nft_nat_type);
+}
+
+module_init(nft_nat_module_init);
+module_exit(nft_nat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_EXPR("nat");
-- 
cgit v1.2.3


From 5e94846686d027a4c8ecc5d9d52b18036d3e8f7a Mon Sep 17 00:00:00 2001
From: Eric Leblond <eric@regit.org>
Date: Thu, 10 Oct 2013 13:41:44 +0200
Subject: netfilter: nf_tables: add insert operation

This patch adds a new rule attribute NFTA_RULE_POSITION which is
used to store the position of a rule relatively to the others.
By providing the create command and specifying the position, the
rule is inserted after the rule with the handle equal to the
provided position.

Regarding notification, the position attribute specifies the
handle of the previous rule to make sure we don't point to any
stale rule in notifications coming from the commit path.

This patch includes the following fix from Pablo:

* nf_tables: fix rule deletion event reporting

Signed-off-by: Eric Leblond <eric@regit.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nf_tables_api.c            | 38 +++++++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 7d4a1992f89c..fbfd229a8e99 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -153,6 +153,7 @@ enum nft_chain_attributes {
  * @NFTA_RULE_HANDLE: numeric handle of the rule (NLA_U64)
  * @NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes)
  * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes)
+ * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64)
  */
 enum nft_rule_attributes {
 	NFTA_RULE_UNSPEC,
@@ -161,6 +162,7 @@ enum nft_rule_attributes {
 	NFTA_RULE_HANDLE,
 	NFTA_RULE_EXPRESSIONS,
 	NFTA_RULE_COMPAT,
+	NFTA_RULE_POSITION,
 	__NFTA_RULE_MAX
 };
 #define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e1ee85047ec1..0f140663ec71 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1273,6 +1273,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
 	[NFTA_RULE_HANDLE]	= { .type = NLA_U64 },
 	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED },
 	[NFTA_RULE_COMPAT]	= { .type = NLA_NESTED },
+	[NFTA_RULE_POSITION]	= { .type = NLA_U64 },
 };
 
 static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
@@ -1285,9 +1286,10 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
 	struct nfgenmsg *nfmsg;
 	const struct nft_expr *expr, *next;
 	struct nlattr *list;
+	const struct nft_rule *prule;
+	int type = event | NFNL_SUBSYS_NFTABLES << 8;
 
-	event |= NFNL_SUBSYS_NFTABLES << 8;
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg),
 			flags);
 	if (nlh == NULL)
 		goto nla_put_failure;
@@ -1304,6 +1306,13 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
 	if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle)))
 		goto nla_put_failure;
 
+	if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) {
+		prule = list_entry(rule->list.prev, struct nft_rule, list);
+		if (nla_put_be64(skb, NFTA_RULE_POSITION,
+				 cpu_to_be64(prule->handle)))
+			goto nla_put_failure;
+	}
+
 	list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS);
 	if (list == NULL)
 		goto nla_put_failure;
@@ -1499,7 +1508,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	unsigned int size, i, n;
 	int err, rem;
 	bool create;
-	u64 handle;
+	u64 handle, pos_handle;
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
@@ -1533,6 +1542,16 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		handle = nf_tables_alloc_handle(table);
 	}
 
+	if (nla[NFTA_RULE_POSITION]) {
+		if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+			return -EOPNOTSUPP;
+
+		pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
+		old_rule = __nf_tables_rule_lookup(chain, pos_handle);
+		if (IS_ERR(old_rule))
+			return PTR_ERR(old_rule);
+	}
+
 	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
 
 	n = 0;
@@ -1573,9 +1592,16 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		list_replace_rcu(&old_rule->list, &rule->list);
 		nf_tables_rule_destroy(old_rule);
 	} else if (nlh->nlmsg_flags & NLM_F_APPEND)
-		list_add_tail_rcu(&rule->list, &chain->rules);
-	else
-		list_add_rcu(&rule->list, &chain->rules);
+		if (old_rule)
+			list_add_rcu(&rule->list, &old_rule->list);
+		else
+			list_add_tail_rcu(&rule->list, &chain->rules);
+	else {
+		if (old_rule)
+			list_add_tail_rcu(&rule->list, &old_rule->list);
+		else
+			list_add_rcu(&rule->list, &chain->rules);
+	}
 
 	nf_tables_rule_notify(skb, nlh, table, chain, rule, NFT_MSG_NEWRULE,
 			      nlh->nlmsg_flags & (NLM_F_APPEND | NLM_F_REPLACE),
-- 
cgit v1.2.3


From 0628b123c96d126e617beb3b4fd63b874d0e4f17 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 14 Oct 2013 11:05:33 +0200
Subject: netfilter: nfnetlink: add batch support and use it from nf_tables

This patch adds a batch support to nfnetlink. Basically, it adds
two new control messages:

* NFNL_MSG_BATCH_BEGIN, that indicates the beginning of a batch,
  the nfgenmsg->res_id indicates the nfnetlink subsystem ID.

* NFNL_MSG_BATCH_END, that results in the invocation of the
  ss->commit callback function. If not specified or an error
  ocurred in the batch, the ss->abort function is invoked
  instead.

The end message represents the commit operation in nftables, the
lack of end message results in an abort. This patch also adds the
.call_batch function that is only called from the batch receival
path.

This patch adds atomic rule updates and dumps based on
bitmask generations. This allows to atomically commit a set of
rule-set updates incrementally without altering the internal
state of existing nf_tables expressions/matches/targets.

The idea consists of using a generation cursor of 1 bit and
a bitmask of 2 bits per rule. Assuming the gencursor is 0,
then the genmask (expressed as a bitmask) can be interpreted
as:

00 active in the present, will be active in the next generation.
01 inactive in the present, will be active in the next generation.
10 active in the present, will be deleted in the next generation.
 ^
 gencursor

Once you invoke the transition to the next generation, the global
gencursor is updated:

00 active in the present, will be active in the next generation.
01 active in the present, needs to zero its future, it becomes 00.
10 inactive in the present, delete now.
^
gencursor

If a dump is in progress and nf_tables enters a new generation,
the dump will stop and return -EBUSY to let userspace know that
it has to retry again. In order to invalidate dumps, a global
genctr counter is increased everytime nf_tables enters a new
generation.

This new operation can be used from the user-space utility
that controls the firewall, eg.

nft -f restore

The rule updates contained in `file' will be applied atomically.

cat file
-----
add filter INPUT ip saddr 1.1.1.1 counter accept #1
del filter INPUT ip daddr 2.2.2.2 counter drop   #2
-EOF-

Note that the rule 1 will be inactive until the transition to the
next generation, the rule 2 will be evicted in the next generation.

There is a penalty during the rule update due to the branch
misprediction in the packet matching framework. But that should be
quickly resolved once the iteration over the commit list that
contain rules that require updates is finished.

Event notification happens once the rule-set update has been
committed. So we skip notifications is case the rule-set update
is aborted, which can happen in case that the rule-set is tested
to apply correctly.

This patch squashed the following patches from Pablo:

* nf_tables: atomic rule updates and dumps
* nf_tables: get rid of per rule list_head for commits
* nf_tables: use per netns commit list
* nfnetlink: add batch support and use it from nf_tables
* nf_tables: all rule updates are transactional
* nf_tables: attach replacement rule after stale one
* nf_tables: do not allow deletion/replacement of stale rules
* nf_tables: remove unused NFTA_RULE_FLAGS

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h      |   5 +
 include/net/netfilter/nf_tables.h        |  25 +++-
 include/net/netns/nftables.h             |   3 +
 include/uapi/linux/netfilter/nfnetlink.h |   4 +
 net/netfilter/nf_tables_api.c            | 202 ++++++++++++++++++++++++++++---
 net/netfilter/nf_tables_core.c           |  10 ++
 net/netfilter/nfnetlink.c                | 175 +++++++++++++++++++++++++-
 7 files changed, 401 insertions(+), 23 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 4f68cd7141d2..28c74367e900 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -14,6 +14,9 @@ struct nfnl_callback {
 	int (*call_rcu)(struct sock *nl, struct sk_buff *skb, 
 		    const struct nlmsghdr *nlh,
 		    const struct nlattr * const cda[]);
+	int (*call_batch)(struct sock *nl, struct sk_buff *skb,
+			  const struct nlmsghdr *nlh,
+			  const struct nlattr * const cda[]);
 	const struct nla_policy *policy;	/* netlink attribute policy */
 	const u_int16_t attr_count;		/* number of nlattr's */
 };
@@ -23,6 +26,8 @@ struct nfnetlink_subsystem {
 	__u8 subsys_id;			/* nfnetlink subsystem ID */
 	__u8 cb_count;			/* number of callbacks */
 	const struct nfnl_callback *cb;	/* callback for individual types */
+	int (*commit)(struct sk_buff *skb);
+	int (*abort)(struct sk_buff *skb);
 };
 
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index d3272e943aac..975ad3c573c7 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -323,18 +323,39 @@ static inline void *nft_expr_priv(const struct nft_expr *expr)
  *	@list: used internally
  *	@rcu_head: used internally for rcu
  *	@handle: rule handle
+ *	@genmask: generation mask
  *	@dlen: length of expression data
  *	@data: expression data
  */
 struct nft_rule {
 	struct list_head		list;
 	struct rcu_head			rcu_head;
-	u64				handle:48,
+	u64				handle:46,
+					genmask:2,
 					dlen:16;
 	unsigned char			data[]
 		__attribute__((aligned(__alignof__(struct nft_expr))));
 };
 
+/**
+ *	struct nft_rule_trans - nf_tables rule update in transaction
+ *
+ *	@list: used internally
+ *	@rule: rule that needs to be updated
+ *	@chain: chain that this rule belongs to
+ *	@table: table for which this chain applies
+ *	@nlh: netlink header of the message that contain this update
+ *	@family: family expressesed as AF_*
+ */
+struct nft_rule_trans {
+	struct list_head		list;
+	struct nft_rule			*rule;
+	const struct nft_chain		*chain;
+	const struct nft_table		*table;
+	const struct nlmsghdr		*nlh;
+	u8				family;
+};
+
 static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule)
 {
 	return (struct nft_expr *)&rule->data[0];
@@ -370,6 +391,7 @@ enum nft_chain_flags {
  *	@rules: list of rules in the chain
  *	@list: used internally
  *	@rcu_head: used internally
+ *	@net: net namespace that this chain belongs to
  *	@handle: chain handle
  *	@flags: bitmask of enum nft_chain_flags
  *	@use: number of jump references to this chain
@@ -380,6 +402,7 @@ struct nft_chain {
 	struct list_head		rules;
 	struct list_head		list;
 	struct rcu_head			rcu_head;
+	struct net			*net;
 	u64				handle;
 	u8				flags;
 	u16				use;
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index a98b1c5d9913..08a4248a12b5 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -7,9 +7,12 @@ struct nft_af_info;
 
 struct netns_nftables {
 	struct list_head	af_info;
+	struct list_head	commit_list;
 	struct nft_af_info	*ipv4;
 	struct nft_af_info	*ipv6;
 	struct nft_af_info	*bridge;
+	u8			gencursor;
+	u8			genctr;
 };
 
 #endif
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index 288959404d54..596ddd45253c 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -57,4 +57,8 @@ struct nfgenmsg {
 #define NFNL_SUBSYS_NFT_COMPAT		11
 #define NFNL_SUBSYS_COUNT		12
 
+/* Reserved control nfnetlink messages */
+#define NFNL_MSG_BATCH_BEGIN		NLMSG_MIN_TYPE
+#define NFNL_MSG_BATCH_END		NLMSG_MIN_TYPE+1
+
 #endif /* _UAPI_NFNETLINK_H */
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0f140663ec71..79e1418a6043 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -978,6 +978,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	INIT_LIST_HEAD(&chain->rules);
 	chain->handle = nf_tables_alloc_handle(table);
+	chain->net = net;
 	nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
 
 	if (!(table->flags & NFT_TABLE_F_DORMANT) &&
@@ -1371,6 +1372,41 @@ err:
 	return err;
 }
 
+static inline bool
+nft_rule_is_active(struct net *net, const struct nft_rule *rule)
+{
+	return (rule->genmask & (1 << net->nft.gencursor)) == 0;
+}
+
+static inline int gencursor_next(struct net *net)
+{
+	return net->nft.gencursor+1 == 1 ? 1 : 0;
+}
+
+static inline int
+nft_rule_is_active_next(struct net *net, const struct nft_rule *rule)
+{
+	return (rule->genmask & (1 << gencursor_next(net))) == 0;
+}
+
+static inline void
+nft_rule_activate_next(struct net *net, struct nft_rule *rule)
+{
+	/* Now inactive, will be active in the future */
+	rule->genmask = (1 << net->nft.gencursor);
+}
+
+static inline void
+nft_rule_disactivate_next(struct net *net, struct nft_rule *rule)
+{
+	rule->genmask = (1 << gencursor_next(net));
+}
+
+static inline void nft_rule_clear(struct net *net, struct nft_rule *rule)
+{
+	rule->genmask = 0;
+}
+
 static int nf_tables_dump_rules(struct sk_buff *skb,
 				struct netlink_callback *cb)
 {
@@ -1382,6 +1418,8 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 	unsigned int idx = 0, s_idx = cb->args[0];
 	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
+	u8 genctr = ACCESS_ONCE(net->nft.genctr);
+	u8 gencursor = ACCESS_ONCE(net->nft.gencursor);
 
 	list_for_each_entry(afi, &net->nft.af_info, list) {
 		if (family != NFPROTO_UNSPEC && family != afi->family)
@@ -1390,6 +1428,8 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 		list_for_each_entry(table, &afi->tables, list) {
 			list_for_each_entry(chain, &table->chains, list) {
 				list_for_each_entry(rule, &chain->rules, list) {
+					if (!nft_rule_is_active(net, rule))
+						goto cont;
 					if (idx < s_idx)
 						goto cont;
 					if (idx > s_idx)
@@ -1408,6 +1448,10 @@ cont:
 		}
 	}
 done:
+	/* Invalidate this dump, a transition to the new generation happened */
+	if (gencursor != net->nft.gencursor || genctr != net->nft.genctr)
+		return -EBUSY;
+
 	cb->args[0] = idx;
 	return skb->len;
 }
@@ -1492,6 +1536,25 @@ static void nf_tables_rule_destroy(struct nft_rule *rule)
 
 static struct nft_expr_info *info;
 
+static struct nft_rule_trans *
+nf_tables_trans_add(struct nft_rule *rule, const struct nft_ctx *ctx)
+{
+	struct nft_rule_trans *rupd;
+
+	rupd = kmalloc(sizeof(struct nft_rule_trans), GFP_KERNEL);
+	if (rupd == NULL)
+	       return NULL;
+
+	rupd->chain = ctx->chain;
+	rupd->table = ctx->table;
+	rupd->rule = rule;
+	rupd->family = ctx->afi->family;
+	rupd->nlh = ctx->nlh;
+	list_add_tail(&rupd->list, &ctx->net->nft.commit_list);
+
+	return rupd;
+}
+
 static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 			     const struct nlmsghdr *nlh,
 			     const struct nlattr * const nla[])
@@ -1502,6 +1565,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule, *old_rule = NULL;
+	struct nft_rule_trans *repl = NULL;
 	struct nft_expr *expr;
 	struct nft_ctx ctx;
 	struct nlattr *tmp;
@@ -1576,6 +1640,8 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	if (rule == NULL)
 		goto err1;
 
+	nft_rule_activate_next(net, rule);
+
 	rule->handle = handle;
 	rule->dlen   = size;
 
@@ -1589,8 +1655,18 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	}
 
 	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
-		list_replace_rcu(&old_rule->list, &rule->list);
-		nf_tables_rule_destroy(old_rule);
+		if (nft_rule_is_active_next(net, old_rule)) {
+			repl = nf_tables_trans_add(old_rule, &ctx);
+			if (repl == NULL) {
+				err = -ENOMEM;
+				goto err2;
+			}
+			nft_rule_disactivate_next(net, old_rule);
+			list_add_tail(&rule->list, &old_rule->list);
+		} else {
+			err = -ENOENT;
+			goto err2;
+		}
 	} else if (nlh->nlmsg_flags & NLM_F_APPEND)
 		if (old_rule)
 			list_add_rcu(&rule->list, &old_rule->list);
@@ -1603,11 +1679,20 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 			list_add_rcu(&rule->list, &chain->rules);
 	}
 
-	nf_tables_rule_notify(skb, nlh, table, chain, rule, NFT_MSG_NEWRULE,
-			      nlh->nlmsg_flags & (NLM_F_APPEND | NLM_F_REPLACE),
-			      nfmsg->nfgen_family);
+	if (nf_tables_trans_add(rule, &ctx) == NULL) {
+		err = -ENOMEM;
+		goto err3;
+	}
 	return 0;
 
+err3:
+	list_del_rcu(&rule->list);
+	if (repl) {
+		list_del_rcu(&repl->rule->list);
+		list_del(&repl->list);
+		nft_rule_clear(net, repl->rule);
+		kfree(repl);
+	}
 err2:
 	nf_tables_rule_destroy(rule);
 err1:
@@ -1618,6 +1703,19 @@ err1:
 	return err;
 }
 
+static int
+nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule)
+{
+	/* You cannot delete the same rule twice */
+	if (nft_rule_is_active_next(ctx->net, rule)) {
+		if (nf_tables_trans_add(rule, ctx) == NULL)
+			return -ENOMEM;
+		nft_rule_disactivate_next(ctx->net, rule);
+		return 0;
+	}
+	return -ENOENT;
+}
+
 static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 			     const struct nlmsghdr *nlh,
 			     const struct nlattr * const nla[])
@@ -1628,7 +1726,8 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule, *tmp;
-	int family = nfmsg->nfgen_family;
+	int family = nfmsg->nfgen_family, err = 0;
+	struct nft_ctx ctx;
 
 	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
@@ -1642,31 +1741,95 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
 	if (nla[NFTA_RULE_HANDLE]) {
 		rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
 		if (IS_ERR(rule))
 			return PTR_ERR(rule);
 
-		/* List removal must be visible before destroying expressions */
-		list_del_rcu(&rule->list);
-
-		nf_tables_rule_notify(skb, nlh, table, chain, rule,
-				      NFT_MSG_DELRULE, 0, family);
-		nf_tables_rule_destroy(rule);
+		err = nf_tables_delrule_one(&ctx, rule);
 	} else {
 		/* Remove all rules in this chain */
 		list_for_each_entry_safe(rule, tmp, &chain->rules, list) {
-			list_del_rcu(&rule->list);
+			err = nf_tables_delrule_one(&ctx, rule);
+			if (err < 0)
+				break;
+		}
+	}
+
+	return err;
+}
+
+static int nf_tables_commit(struct sk_buff *skb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nft_rule_trans *rupd, *tmp;
 
-			nf_tables_rule_notify(skb, nlh, table, chain, rule,
-					      NFT_MSG_DELRULE, 0, family);
-			nf_tables_rule_destroy(rule);
+	/* Bump generation counter, invalidate any dump in progress */
+	net->nft.genctr++;
+
+	/* A new generation has just started */
+	net->nft.gencursor = gencursor_next(net);
+
+	/* Make sure all packets have left the previous generation before
+	 * purging old rules.
+	 */
+	synchronize_rcu();
+
+	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
+		/* Delete this rule from the dirty list */
+		list_del(&rupd->list);
+
+		/* This rule was inactive in the past and just became active.
+		 * Clear the next bit of the genmask since its meaning has
+		 * changed, now it is the future.
+		 */
+		if (nft_rule_is_active(net, rupd->rule)) {
+			nft_rule_clear(net, rupd->rule);
+			nf_tables_rule_notify(skb, rupd->nlh, rupd->table,
+					      rupd->chain, rupd->rule,
+					      NFT_MSG_NEWRULE, 0,
+					      rupd->family);
+			kfree(rupd);
+			continue;
 		}
+
+		/* This rule is in the past, get rid of it */
+		list_del_rcu(&rupd->rule->list);
+		nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain,
+				      rupd->rule, NFT_MSG_DELRULE, 0,
+				      rupd->family);
+		nf_tables_rule_destroy(rupd->rule);
+		kfree(rupd);
 	}
 
 	return 0;
 }
 
+static int nf_tables_abort(struct sk_buff *skb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nft_rule_trans *rupd, *tmp;
+
+	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
+		/* Delete all rules from the dirty list */
+		list_del(&rupd->list);
+
+		if (!nft_rule_is_active_next(net, rupd->rule)) {
+			nft_rule_clear(net, rupd->rule);
+			kfree(rupd);
+			continue;
+		}
+
+		/* This rule is inactive, get rid of it */
+		list_del_rcu(&rupd->rule->list);
+		nf_tables_rule_destroy(rupd->rule);
+		kfree(rupd);
+	}
+	return 0;
+}
+
 /*
  * Sets
  */
@@ -2634,7 +2797,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_chain_policy,
 	},
 	[NFT_MSG_NEWRULE] = {
-		.call		= nf_tables_newrule,
+		.call_batch	= nf_tables_newrule,
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
 	},
@@ -2644,7 +2807,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_rule_policy,
 	},
 	[NFT_MSG_DELRULE] = {
-		.call		= nf_tables_delrule,
+		.call_batch	= nf_tables_delrule,
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
 	},
@@ -2685,6 +2848,8 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.subsys_id	= NFNL_SUBSYS_NFTABLES,
 	.cb_count	= NFT_MSG_MAX,
 	.cb		= nf_tables_cb,
+	.commit		= nf_tables_commit,
+	.abort		= nf_tables_abort,
 };
 
 /*
@@ -3056,6 +3221,7 @@ EXPORT_SYMBOL_GPL(nft_data_dump);
 static int nf_tables_init_net(struct net *net)
 {
 	INIT_LIST_HEAD(&net->nft.af_info);
+	INIT_LIST_HEAD(&net->nft.commit_list);
 	return 0;
 }
 
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 3c13007d80df..d581ef660248 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -88,12 +88,22 @@ nft_do_chain_pktinfo(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
 	struct nft_data data[NFT_REG_MAX + 1];
 	unsigned int stackptr = 0;
 	struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
+	/*
+	 * Cache cursor to avoid problems in case that the cursor is updated
+	 * while traversing the ruleset.
+	 */
+	unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor);
 
 do_chain:
 	rule = list_entry(&chain->rules, struct nft_rule, list);
 next_rule:
 	data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
 	list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
+
+		/* This rule is not active, skip. */
+		if (unlikely(rule->genmask & (1 << gencursor)))
+			continue;
+
 		nft_rule_for_each_expr(expr, last, rule) {
 			if (expr->ops == &nft_cmp_fast_ops)
 				nft_cmp_fast_eval(expr, data);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 572d87dc116f..027f16af51a0 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -147,9 +147,6 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	const struct nfnetlink_subsystem *ss;
 	int type, err;
 
-	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
-		return -EPERM;
-
 	/* All the messages must at least contain nfgenmsg */
 	if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))
 		return 0;
@@ -217,9 +214,179 @@ replay:
 	}
 }
 
+static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
+				u_int16_t subsys_id)
+{
+	struct sk_buff *nskb, *oskb = skb;
+	struct net *net = sock_net(skb->sk);
+	const struct nfnetlink_subsystem *ss;
+	const struct nfnl_callback *nc;
+	bool success = true, done = false;
+	int err;
+
+	if (subsys_id >= NFNL_SUBSYS_COUNT)
+		return netlink_ack(skb, nlh, -EINVAL);
+replay:
+	nskb = netlink_skb_clone(oskb, GFP_KERNEL);
+	if (!nskb)
+		return netlink_ack(oskb, nlh, -ENOMEM);
+
+	nskb->sk = oskb->sk;
+	skb = nskb;
+
+	nfnl_lock(subsys_id);
+	ss = rcu_dereference_protected(table[subsys_id].subsys,
+				       lockdep_is_held(&table[subsys_id].mutex));
+	if (!ss) {
+#ifdef CONFIG_MODULES
+		nfnl_unlock(subsys_id);
+		request_module("nfnetlink-subsys-%d", subsys_id);
+		nfnl_lock(subsys_id);
+		ss = rcu_dereference_protected(table[subsys_id].subsys,
+					       lockdep_is_held(&table[subsys_id].mutex));
+		if (!ss)
+#endif
+		{
+			nfnl_unlock(subsys_id);
+			kfree_skb(nskb);
+			return netlink_ack(skb, nlh, -EOPNOTSUPP);
+		}
+	}
+
+	if (!ss->commit || !ss->abort) {
+		nfnl_unlock(subsys_id);
+		kfree_skb(nskb);
+		return netlink_ack(skb, nlh, -EOPNOTSUPP);
+	}
+
+	while (skb->len >= nlmsg_total_size(0)) {
+		int msglen, type;
+
+		nlh = nlmsg_hdr(skb);
+		err = 0;
+
+		if (nlh->nlmsg_len < NLMSG_HDRLEN) {
+			err = -EINVAL;
+			goto ack;
+		}
+
+		/* Only requests are handled by the kernel */
+		if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
+			err = -EINVAL;
+			goto ack;
+		}
+
+		type = nlh->nlmsg_type;
+		if (type == NFNL_MSG_BATCH_BEGIN) {
+			/* Malformed: Batch begin twice */
+			success = false;
+			goto done;
+		} else if (type == NFNL_MSG_BATCH_END) {
+			done = true;
+			goto done;
+		} else if (type < NLMSG_MIN_TYPE) {
+			err = -EINVAL;
+			goto ack;
+		}
+
+		/* We only accept a batch with messages for the same
+		 * subsystem.
+		 */
+		if (NFNL_SUBSYS_ID(type) != subsys_id) {
+			err = -EINVAL;
+			goto ack;
+		}
+
+		nc = nfnetlink_find_client(type, ss);
+		if (!nc) {
+			err = -EINVAL;
+			goto ack;
+		}
+
+		{
+			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+			u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+			struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+			struct nlattr *attr = (void *)nlh + min_len;
+			int attrlen = nlh->nlmsg_len - min_len;
+
+			err = nla_parse(cda, ss->cb[cb_id].attr_count,
+					attr, attrlen, ss->cb[cb_id].policy);
+			if (err < 0)
+				goto ack;
+
+			if (nc->call_batch) {
+				err = nc->call_batch(net->nfnl, skb, nlh,
+						     (const struct nlattr **)cda);
+			}
+
+			/* The lock was released to autoload some module, we
+			 * have to abort and start from scratch using the
+			 * original skb.
+			 */
+			if (err == -EAGAIN) {
+				ss->abort(skb);
+				nfnl_unlock(subsys_id);
+				kfree_skb(nskb);
+				goto replay;
+			}
+		}
+ack:
+		if (nlh->nlmsg_flags & NLM_F_ACK || err) {
+			/* We don't stop processing the batch on errors, thus,
+			 * userspace gets all the errors that the batch
+			 * triggers.
+			 */
+			netlink_ack(skb, nlh, err);
+			if (err)
+				success = false;
+		}
+
+		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (msglen > skb->len)
+			msglen = skb->len;
+		skb_pull(skb, msglen);
+	}
+done:
+	if (success && done)
+		ss->commit(skb);
+	else
+		ss->abort(skb);
+
+	nfnl_unlock(subsys_id);
+	kfree_skb(nskb);
+}
+
 static void nfnetlink_rcv(struct sk_buff *skb)
 {
-	netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	struct net *net = sock_net(skb->sk);
+	int msglen;
+
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+		return netlink_ack(skb, nlh, -EPERM);
+
+	if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+	    skb->len < nlh->nlmsg_len)
+		return;
+
+	if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) {
+		struct nfgenmsg *nfgenmsg;
+
+		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (msglen > skb->len)
+			msglen = skb->len;
+
+		if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+		    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
+			return;
+
+		nfgenmsg = nlmsg_data(nlh);
+		skb_pull(skb, msglen);
+		nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id);
+	} else {
+		netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
+	}
 }
 
 #ifdef CONFIG_MODULES
-- 
cgit v1.2.3


From 120c9794a3ee2f9b1548a1b0b252652e3c134f59 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 28 Aug 2013 19:31:22 +0300
Subject: ipvs: fix the IPVS_CMD_ATTR_MAX definition

It was wrong (bigger) but problem is harmless.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/uapi/linux/ip_vs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 29458223d044..fbcffe8041f7 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -334,7 +334,7 @@ enum {
 	__IPVS_CMD_ATTR_MAX,
 };
 
-#define IPVS_CMD_ATTR_MAX (__IPVS_SVC_ATTR_MAX - 1)
+#define IPVS_CMD_ATTR_MAX (__IPVS_CMD_ATTR_MAX - 1)
 
 /*
  * Attributes used to describe a service
-- 
cgit v1.2.3


From cbbc58d4fdfab1a39a6ac1b41fcb17885952157a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 7 Oct 2013 22:18:01 +0530
Subject: kvm: powerpc: book3s: Allow the HV and PR selection per virtual
 machine

This moves the kvmppc_ops callbacks to be a per VM entity. This
enables us to select HV and PR mode when creating a VM. We also
allow both kvm-hv and kvm-pr kernel module to be loaded. To
achieve this we move /dev/kvm ownership to kvm.ko module. Depending on
which KVM mode we select during VM creation we take a reference
count on respective module

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[agraf: fix coding style]
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/include/asm/kvm_ppc.h  |  7 +--
 arch/powerpc/kvm/44x.c              |  7 ++-
 arch/powerpc/kvm/book3s.c           | 89 +++++++++++++++++++++++++------------
 arch/powerpc/kvm/book3s.h           |  2 +
 arch/powerpc/kvm/book3s_hv.c        | 18 ++++----
 arch/powerpc/kvm/book3s_pr.c        | 25 +++++++----
 arch/powerpc/kvm/book3s_xics.c      |  2 +-
 arch/powerpc/kvm/booke.c            | 22 ++++-----
 arch/powerpc/kvm/e500.c             |  8 +++-
 arch/powerpc/kvm/e500mc.c           |  6 ++-
 arch/powerpc/kvm/emulate.c          | 11 ++---
 arch/powerpc/kvm/powerpc.c          | 76 ++++++++++++++++++++++---------
 include/uapi/linux/kvm.h            |  4 ++
 14 files changed, 187 insertions(+), 91 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 61ce4dca45d3..237d1d25b448 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -270,6 +270,7 @@ struct kvm_arch {
 #ifdef CONFIG_KVM_XICS
 	struct kvmppc_xics *xics;
 #endif
+	struct kvmppc_ops *kvm_ops;
 };
 
 /*
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 20f461637090..3069cf4dcc88 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -182,6 +182,7 @@ union kvmppc_one_reg {
 };
 
 struct kvmppc_ops {
+	struct module *owner;
 	bool is_hv_enabled;
 	int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 	int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
@@ -217,7 +218,6 @@ struct kvmppc_ops {
 			      unsigned long npages);
 	int (*init_vm)(struct kvm *kvm);
 	void (*destroy_vm)(struct kvm *kvm);
-	int (*check_processor_compat)(void);
 	int (*get_smmu_info)(struct kvm *kvm, struct kvm_ppc_smmu_info *info);
 	int (*emulate_op)(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			  unsigned int inst, int *advance);
@@ -229,7 +229,8 @@ struct kvmppc_ops {
 
 };
 
-extern struct kvmppc_ops *kvmppc_ops;
+extern struct kvmppc_ops *kvmppc_hv_ops;
+extern struct kvmppc_ops *kvmppc_pr_ops;
 
 /*
  * Cuts out inst bits with ordering according to spec.
@@ -326,7 +327,7 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
 
 static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
 {
-	kvmppc_ops->fast_vcpu_kick(vcpu);
+	vcpu->kvm->arch.kvm_ops->fast_vcpu_kick(vcpu);
 }
 
 #else
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index a765bcd74fbb..93221e87b911 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -213,16 +213,19 @@ static int __init kvmppc_44x_init(void)
 	if (r)
 		goto err_out;
 
-	r = kvm_init(&kvm_ops_44x, sizeof(struct kvmppc_vcpu_44x),
-		     0, THIS_MODULE);
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE);
 	if (r)
 		goto err_out;
+	kvm_ops_44x.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_44x;
+
 err_out:
 	return r;
 }
 
 static void __exit kvmppc_44x_exit(void)
 {
+	kvmppc_pr_ops = NULL;
 	kvmppc_booke_exit();
 }
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 130fe1d75bac..ad8f6ed3f136 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -34,6 +34,7 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+#include "book3s.h"
 #include "trace.h"
 
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
@@ -71,7 +72,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 
 static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 {
-	if (!kvmppc_ops->is_hv_enabled)
+	if (!vcpu->kvm->arch.kvm_ops->is_hv_enabled)
 		return to_book3s(vcpu)->hior;
 	return 0;
 }
@@ -79,7 +80,7 @@ static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 			unsigned long pending_now, unsigned long old_pending)
 {
-	if (kvmppc_ops->is_hv_enabled)
+	if (vcpu->kvm->arch.kvm_ops->is_hv_enabled)
 		return;
 	if (pending_now)
 		vcpu->arch.shared->int_pending = 1;
@@ -93,7 +94,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 	ulong crit_r1;
 	bool crit;
 
-	if (kvmppc_ops->is_hv_enabled)
+	if (vcpu->kvm->arch.kvm_ops->is_hv_enabled)
 		return false;
 
 	crit_raw = vcpu->arch.shared->critical;
@@ -477,13 +478,13 @@ void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-	return kvmppc_ops->get_sregs(vcpu, sregs);
+	return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-	return kvmppc_ops->set_sregs(vcpu, sregs);
+	return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
@@ -562,7 +563,7 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 	if (size > sizeof(val))
 		return -EINVAL;
 
-	r = kvmppc_ops->get_one_reg(vcpu, reg->id, &val);
+	r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val);
 	if (r == -EINVAL) {
 		r = 0;
 		switch (reg->id) {
@@ -641,7 +642,7 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 	if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
 		return -EFAULT;
 
-	r = kvmppc_ops->set_one_reg(vcpu, reg->id, &val);
+	r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val);
 	if (r == -EINVAL) {
 		r = 0;
 		switch (reg->id) {
@@ -702,23 +703,23 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	kvmppc_ops->vcpu_load(vcpu, cpu);
+	vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu);
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	kvmppc_ops->vcpu_put(vcpu);
+	vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu);
 }
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 {
-	kvmppc_ops->set_msr(vcpu, msr);
+	vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr);
 }
 EXPORT_SYMBOL_GPL(kvmppc_set_msr);
 
 int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-	return kvmppc_ops->vcpu_run(kvm_run, vcpu);
+	return vcpu->kvm->arch.kvm_ops->vcpu_run(kvm_run, vcpu);
 }
 
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -743,84 +744,84 @@ void kvmppc_decrementer_func(unsigned long data)
 
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
-	return kvmppc_ops->vcpu_create(kvm, id);
+	return kvm->arch.kvm_ops->vcpu_create(kvm, id);
 }
 
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
-	kvmppc_ops->vcpu_free(vcpu);
+	vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
 }
 
 int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
 {
-	return kvmppc_ops->check_requests(vcpu);
+	return vcpu->kvm->arch.kvm_ops->check_requests(vcpu);
 }
 
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
-	return kvmppc_ops->get_dirty_log(kvm, log);
+	return kvm->arch.kvm_ops->get_dirty_log(kvm, log);
 }
 
 void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 			      struct kvm_memory_slot *dont)
 {
-	kvmppc_ops->free_memslot(free, dont);
+	kvm->arch.kvm_ops->free_memslot(free, dont);
 }
 
 int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 			       unsigned long npages)
 {
-	return kvmppc_ops->create_memslot(slot, npages);
+	return kvm->arch.kvm_ops->create_memslot(slot, npages);
 }
 
 void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-	kvmppc_ops->flush_memslot(kvm, memslot);
+	kvm->arch.kvm_ops->flush_memslot(kvm, memslot);
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_userspace_memory_region *mem)
 {
-	return kvmppc_ops->prepare_memory_region(kvm, memslot, mem);
+	return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old)
 {
-	kvmppc_ops->commit_memory_region(kvm, mem, old);
+	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old);
 }
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvmppc_ops->unmap_hva(kvm, hva);
+	return kvm->arch.kvm_ops->unmap_hva(kvm, hva);
 }
 EXPORT_SYMBOL_GPL(kvm_unmap_hva);
 
 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	return kvmppc_ops->unmap_hva_range(kvm, start, end);
+	return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
 }
 
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvmppc_ops->age_hva(kvm, hva);
+	return kvm->arch.kvm_ops->age_hva(kvm, hva);
 }
 
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvmppc_ops->test_age_hva(kvm, hva);
+	return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
 }
 
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
-	kvmppc_ops->set_spte_hva(kvm, hva, pte);
+	kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
 }
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-	kvmppc_ops->mmu_destroy(vcpu);
+	vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu);
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
@@ -831,12 +832,12 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif
 
-	return kvmppc_ops->init_vm(kvm);
+	return kvm->arch.kvm_ops->init_vm(kvm);
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
-	kvmppc_ops->destroy_vm(kvm);
+	kvm->arch.kvm_ops->destroy_vm(kvm);
 
 #ifdef CONFIG_PPC64
 	kvmppc_rtas_tokens_free(kvm);
@@ -846,5 +847,35 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 
 int kvmppc_core_check_processor_compat(void)
 {
-	return kvmppc_ops->check_processor_compat();
+	/*
+	 * We always return 0 for book3s. We check
+	 * for compatability while loading the HV
+	 * or PR module
+	 */
+	return 0;
+}
+
+static int kvmppc_book3s_init(void)
+{
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+	if (r)
+		return r;
+#ifdef CONFIG_KVM_BOOK3S_32
+	r = kvmppc_book3s_init_pr();
+#endif
+	return r;
+
+}
+
+static void kvmppc_book3s_exit(void)
+{
+#ifdef CONFIG_KVM_BOOK3S_32
+	kvmppc_book3s_exit_pr();
+#endif
+	kvm_exit();
 }
+
+module_init(kvmppc_book3s_init);
+module_exit(kvmppc_book3s_exit);
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 9e5b3a341943..4bf956cf94d6 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -28,5 +28,7 @@ extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu,
 					int sprn, ulong spr_val);
 extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu,
 					int sprn, ulong *spr_val);
+extern int kvmppc_book3s_init_pr(void);
+extern void kvmppc_book3s_exit_pr(void);
 
 #endif
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 9e954a81c078..8743048881b7 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2159,7 +2159,7 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
 	return r;
 }
 
-static struct kvmppc_ops kvmppc_hv_ops = {
+static struct kvmppc_ops kvm_ops_hv = {
 	.is_hv_enabled = true,
 	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
 	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -2186,7 +2186,6 @@ static struct kvmppc_ops kvmppc_hv_ops = {
 	.create_memslot = kvmppc_core_create_memslot_hv,
 	.init_vm =  kvmppc_core_init_vm_hv,
 	.destroy_vm = kvmppc_core_destroy_vm_hv,
-	.check_processor_compat = kvmppc_core_check_processor_compat_hv,
 	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
 	.emulate_op = kvmppc_core_emulate_op_hv,
 	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
@@ -2198,20 +2197,23 @@ static struct kvmppc_ops kvmppc_hv_ops = {
 static int kvmppc_book3s_init_hv(void)
 {
 	int r;
-
-	r = kvm_init(&kvmppc_hv_ops, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
-
-	if (r)
+	/*
+	 * FIXME!! Do we need to check on all cpus ?
+	 */
+	r = kvmppc_core_check_processor_compat_hv();
+	if (r < 0)
 		return r;
 
-	r = kvmppc_mmu_hv_init();
+	kvm_ops_hv.owner = THIS_MODULE;
+	kvmppc_hv_ops = &kvm_ops_hv;
 
+	r = kvmppc_mmu_hv_init();
 	return r;
 }
 
 static void kvmppc_book3s_exit_hv(void)
 {
-	kvm_exit();
+	kvmppc_hv_ops = NULL;
 }
 
 module_init(kvmppc_book3s_init_hv);
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 7f583a482161..fbd985f0cb02 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1525,7 +1525,7 @@ static long kvm_arch_vm_ioctl_pr(struct file *filp,
 	return -ENOTTY;
 }
 
-static struct kvmppc_ops kvmppc_pr_ops = {
+static struct kvmppc_ops kvm_ops_pr = {
 	.is_hv_enabled = false,
 	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr,
 	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr,
@@ -1552,7 +1552,6 @@ static struct kvmppc_ops kvmppc_pr_ops = {
 	.create_memslot = kvmppc_core_create_memslot_pr,
 	.init_vm = kvmppc_core_init_vm_pr,
 	.destroy_vm = kvmppc_core_destroy_vm_pr,
-	.check_processor_compat = kvmppc_core_check_processor_compat_pr,
 	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr,
 	.emulate_op = kvmppc_core_emulate_op_pr,
 	.emulate_mtspr = kvmppc_core_emulate_mtspr_pr,
@@ -1561,27 +1560,35 @@ static struct kvmppc_ops kvmppc_pr_ops = {
 	.arch_vm_ioctl  = kvm_arch_vm_ioctl_pr,
 };
 
-static int kvmppc_book3s_init_pr(void)
+
+int kvmppc_book3s_init_pr(void)
 {
 	int r;
 
-	r = kvm_init(&kvmppc_pr_ops, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
-
-	if (r)
+	r = kvmppc_core_check_processor_compat_pr();
+	if (r < 0)
 		return r;
 
-	r = kvmppc_mmu_hpte_sysinit();
+	kvm_ops_pr.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_pr;
 
+	r = kvmppc_mmu_hpte_sysinit();
 	return r;
 }
 
-static void kvmppc_book3s_exit_pr(void)
+void kvmppc_book3s_exit_pr(void)
 {
+	kvmppc_pr_ops = NULL;
 	kvmppc_mmu_hpte_sysexit();
-	kvm_exit();
 }
 
+/*
+ * We only support separate modules for book3s 64
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+
 module_init(kvmppc_book3s_init_pr);
 module_exit(kvmppc_book3s_exit_pr);
 
 MODULE_LICENSE("GPL");
+#endif
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index c3c832b27ee5..f7a5108a3483 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -818,7 +818,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
 	}
 
 	/* Check for real mode returning too hard */
-	if (xics->real_mode && kvmppc_ops->is_hv_enabled)
+	if (xics->real_mode && vcpu->kvm->arch.kvm_ops->is_hv_enabled)
 		return kvmppc_xics_rm_complete(vcpu, req);
 
 	switch (req) {
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index cb2d986a3382..15d0149511eb 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1472,7 +1472,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
 	get_sregs_base(vcpu, sregs);
 	get_sregs_arch206(vcpu, sregs);
-	return kvmppc_ops->get_sregs(vcpu, sregs);
+	return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@ -1491,7 +1491,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	if (ret < 0)
 		return ret;
 
-	return kvmppc_ops->set_sregs(vcpu, sregs);
+	return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
 }
 
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
@@ -1548,7 +1548,7 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 		val = get_reg_val(reg->id, vcpu->arch.vrsave);
 		break;
 	default:
-		r = kvmppc_ops->get_one_reg(vcpu, reg->id, &val);
+		r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val);
 		break;
 	}
 
@@ -1631,7 +1631,7 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 		vcpu->arch.vrsave = set_reg_val(reg->id, val);
 		break;
 	default:
-		r = kvmppc_ops->set_one_reg(vcpu, reg->id, &val);
+		r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val);
 		break;
 	}
 
@@ -1911,37 +1911,37 @@ void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-	kvmppc_ops->mmu_destroy(vcpu);
+	vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu);
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
-	return kvmppc_ops->init_vm(kvm);
+	return kvm->arch.kvm_ops->init_vm(kvm);
 }
 
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
-	return kvmppc_ops->vcpu_create(kvm, id);
+	return kvm->arch.kvm_ops->vcpu_create(kvm, id);
 }
 
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
-	kvmppc_ops->vcpu_free(vcpu);
+	vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
-	kvmppc_ops->destroy_vm(kvm);
+	kvm->arch.kvm_ops->destroy_vm(kvm);
 }
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	kvmppc_ops->vcpu_load(vcpu, cpu);
+	vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu);
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	kvmppc_ops->vcpu_put(vcpu);
+	vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu);
 }
 
 int __init kvmppc_booke_init(void)
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index d225d5ebddcc..497b142f651c 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -555,13 +555,19 @@ static int __init kvmppc_e500_init(void)
 	flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
 			   ivor[max_ivor] + handler_len);
 
-	r = kvm_init(&kvm_ops_e500, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+	if (r)
+		goto err_out;
+	kvm_ops_e500.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_e500;
+
 err_out:
 	return r;
 }
 
 static void __exit kvmppc_e500_exit(void)
 {
+	kvmppc_pr_ops = NULL;
 	kvmppc_booke_exit();
 }
 
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index db6a383401c7..4132cd2fc171 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -373,15 +373,19 @@ static int __init kvmppc_e500mc_init(void)
 	kvmppc_init_lpid(64);
 	kvmppc_claim_lpid(0); /* host */
 
-	r = kvm_init(&kvm_ops_e500mc, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
 	if (r)
 		goto err_out;
+	kvm_ops_e500mc.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_e500mc;
+
 err_out:
 	return r;
 }
 
 static void __exit kvmppc_e500mc_exit(void)
 {
+	kvmppc_pr_ops = NULL;
 	kvmppc_booke_exit();
 }
 
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index de9a340d22ed..2f9a0873b44f 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -130,8 +130,8 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_PIR: break;
 
 	default:
-		emulated = kvmppc_ops->emulate_mtspr(vcpu, sprn,
-						     spr_val);
+		emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn,
+								  spr_val);
 		if (emulated == EMULATE_FAIL)
 			printk(KERN_INFO "mtspr: unknown spr "
 				"0x%x\n", sprn);
@@ -191,8 +191,8 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		spr_val = kvmppc_get_dec(vcpu, get_tb());
 		break;
 	default:
-		emulated = kvmppc_ops->emulate_mfspr(vcpu, sprn,
-						     &spr_val);
+		emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn,
+								  &spr_val);
 		if (unlikely(emulated == EMULATE_FAIL)) {
 			printk(KERN_INFO "mfspr: unknown spr "
 				"0x%x\n", sprn);
@@ -464,7 +464,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 
 	if (emulated == EMULATE_FAIL) {
-		emulated = kvmppc_ops->emulate_op(run, vcpu, inst, &advance);
+		emulated = vcpu->kvm->arch.kvm_ops->emulate_op(run, vcpu, inst,
+							       &advance);
 		if (emulated == EMULATE_AGAIN) {
 			advance = 0;
 		} else if (emulated == EMULATE_FAIL) {
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b103d747934a..0320c1721caa 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -26,6 +26,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/module.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
@@ -39,7 +40,11 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
-struct kvmppc_ops *kvmppc_ops;
+struct kvmppc_ops *kvmppc_hv_ops;
+EXPORT_SYMBOL_GPL(kvmppc_hv_ops);
+struct kvmppc_ops *kvmppc_pr_ops;
+EXPORT_SYMBOL_GPL(kvmppc_pr_ops);
+
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
@@ -195,7 +200,7 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
 		goto out;
 
 	/* HV KVM can only do PAPR mode for now */
-	if (!vcpu->arch.papr_enabled && kvmppc_ops->is_hv_enabled)
+	if (!vcpu->arch.papr_enabled && vcpu->kvm->arch.kvm_ops->is_hv_enabled)
 		goto out;
 
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -271,10 +276,35 @@ void kvm_arch_check_processor_compat(void *rtn)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
-	if (type)
-		return -EINVAL;
-
+	struct kvmppc_ops *kvm_ops = NULL;
+	/*
+	 * if we have both HV and PR enabled, default is HV
+	 */
+	if (type == 0) {
+		if (kvmppc_hv_ops)
+			kvm_ops = kvmppc_hv_ops;
+		else
+			kvm_ops = kvmppc_pr_ops;
+		if (!kvm_ops)
+			goto err_out;
+	} else	if (type == KVM_VM_PPC_HV) {
+		if (!kvmppc_hv_ops)
+			goto err_out;
+		kvm_ops = kvmppc_hv_ops;
+	} else if (type == KVM_VM_PPC_PR) {
+		if (!kvmppc_pr_ops)
+			goto err_out;
+		kvm_ops = kvmppc_pr_ops;
+	} else
+		goto err_out;
+
+	if (kvm_ops->owner && !try_module_get(kvm_ops->owner))
+		return -ENOENT;
+
+	kvm->arch.kvm_ops = kvm_ops;
 	return kvmppc_core_init_vm(kvm);
+err_out:
+	return -EINVAL;
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -294,6 +324,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvmppc_core_destroy_vm(kvm);
 
 	mutex_unlock(&kvm->lock);
+
+	/* drop the module reference */
+	module_put(kvm->arch.kvm_ops->owner);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
@@ -303,6 +336,10 @@ void kvm_arch_sync_events(struct kvm *kvm)
 int kvm_dev_ioctl_check_extension(long ext)
 {
 	int r;
+	/* FIXME!!
+	 * Should some of this be vm ioctl ? is it possible now ?
+	 */
+	int hv_enabled = kvmppc_hv_ops ? 1 : 0;
 
 	switch (ext) {
 #ifdef CONFIG_BOOKE
@@ -329,7 +366,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SW_TLB:
 #endif
 		/* We support this only for PR */
-		r = !kvmppc_ops->is_hv_enabled;
+		r = !hv_enabled;
 		break;
 #ifdef CONFIG_KVM_MMIO
 	case KVM_CAP_COALESCED_MMIO:
@@ -354,13 +391,13 @@ int kvm_dev_ioctl_check_extension(long ext)
 #endif /* CONFIG_PPC_BOOK3S_64 */
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	case KVM_CAP_PPC_SMT:
-		if (kvmppc_ops->is_hv_enabled)
+		if (hv_enabled)
 			r = threads_per_core;
 		else
 			r = 0;
 		break;
 	case KVM_CAP_PPC_RMA:
-		r = kvmppc_ops->is_hv_enabled;
+		r = hv_enabled;
 		/* PPC970 requires an RMA */
 		if (r && cpu_has_feature(CPU_FTR_ARCH_201))
 			r = 2;
@@ -368,7 +405,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 #endif
 	case KVM_CAP_SYNC_MMU:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-		if (kvmppc_ops->is_hv_enabled)
+		if (hv_enabled)
 			r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
 		else
 			r = 0;
@@ -380,7 +417,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		break;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	case KVM_CAP_PPC_HTAB_FD:
-		r = kvmppc_ops->is_hv_enabled;
+		r = hv_enabled;
 		break;
 #endif
 	case KVM_CAP_NR_VCPUS:
@@ -390,7 +427,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		 * will have secondary threads "offline"), and for other KVM
 		 * implementations just count online CPUs.
 		 */
-		if (kvmppc_ops->is_hv_enabled)
+		if (hv_enabled)
 			r = num_present_cpus();
 		else
 			r = num_online_cpus();
@@ -1039,9 +1076,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_PPC_GET_SMMU_INFO: {
 		struct kvm_ppc_smmu_info info;
+		struct kvm *kvm = filp->private_data;
 
 		memset(&info, 0, sizeof(info));
-		r = kvmppc_ops->get_smmu_info(kvm, &info);
+		r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info);
 		if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
 			r = -EFAULT;
 		break;
@@ -1052,9 +1090,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
 		break;
 	}
-	default:
-		r = kvmppc_ops->arch_vm_ioctl(filp, ioctl, arg);
-
+	default: {
+		struct kvm *kvm = filp->private_data;
+		r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);
+	}
 #else /* CONFIG_PPC_BOOK3S_64 */
 	default:
 		r = -ENOTTY;
@@ -1104,15 +1143,10 @@ EXPORT_SYMBOL_GPL(kvmppc_init_lpid);
 
 int kvm_arch_init(void *opaque)
 {
-	if (kvmppc_ops) {
-		printk(KERN_ERR "kvm: already loaded the other module\n");
-		return -EEXIST;
-	}
-	kvmppc_ops = (struct kvmppc_ops *)opaque;
 	return 0;
 }
 
 void kvm_arch_exit(void)
 {
-	kvmppc_ops = NULL;
+
 }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e32e776f20c0..5b5341f78368 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -518,6 +518,10 @@ struct kvm_ppc_smmu_info {
 /* machine type bits, to be used as argument to KVM_CREATE_VM */
 #define KVM_VM_S390_UCONTROL	1
 
+/* on ppc, 0 indicate default, 1 should force HV and 2 PR */
+#define KVM_VM_PPC_HV 1
+#define KVM_VM_PPC_PR 2
+
 #define KVM_S390_SIE_PAGE_OFFSET 1
 
 /*
-- 
cgit v1.2.3


From 90af231106c0b8d223c27d35464af95cb3d9cacf Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Fri, 18 Oct 2013 17:43:38 +0200
Subject: bonding: add Netlink support mode option

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_netlink.c | 56 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/if_link.h       | 10 +++++++
 2 files changed, 66 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 3e5c5f80c320..a94f870a6b60 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -20,6 +20,10 @@
 #include <net/rtnetlink.h>
 #include "bonding.h"
 
+static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = {
+	[IFLA_BOND_MODE]		= { .type = NLA_U8 },
+};
+
 static int bond_validate(struct nlattr *tb[], struct nlattr *data[])
 {
 	if (tb[IFLA_ADDRESS]) {
@@ -31,11 +35,63 @@ static int bond_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static int bond_changelink(struct net_device *bond_dev,
+			   struct nlattr *tb[], struct nlattr *data[])
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	int err;
+
+	if (data && data[IFLA_BOND_MODE]) {
+		int mode = nla_get_u8(data[IFLA_BOND_MODE]);
+
+		err = bond_option_mode_set(bond, mode);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int bond_newlink(struct net *src_net, struct net_device *bond_dev,
+			struct nlattr *tb[], struct nlattr *data[])
+{
+	int err;
+
+	err = bond_changelink(bond_dev, tb, data);
+	if (err < 0)
+		return err;
+
+	return register_netdevice(bond_dev);
+}
+
+static size_t bond_get_size(const struct net_device *bond_dev)
+{
+	return nla_total_size(sizeof(u8));	/* IFLA_BOND_MODE */
+}
+
+static int bond_fill_info(struct sk_buff *skb,
+			  const struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+
+	if (nla_put_u8(skb, IFLA_BOND_MODE, bond->params.mode))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 struct rtnl_link_ops bond_link_ops __read_mostly = {
 	.kind			= "bond",
 	.priv_size		= sizeof(struct bonding),
 	.setup			= bond_setup,
+	.maxtype		= IFLA_BOND_MAX,
+	.policy			= bond_policy,
 	.validate		= bond_validate,
+	.newlink		= bond_newlink,
+	.changelink		= bond_changelink,
+	.get_size		= bond_get_size,
+	.fill_info		= bond_fill_info,
 	.get_num_tx_queues	= bond_get_num_tx_queues,
 	.get_num_rx_queues	= bond_get_num_tx_queues, /* Use the same number
 							     as for TX queues */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 80394e8dc3a3..06fd3fe10f3b 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -325,6 +325,16 @@ struct ifla_vxlan_port_range {
 	__be16	high;
 };
 
+/* Bonding section */
+
+enum {
+	IFLA_BOND_UNSPEC,
+	IFLA_BOND_MODE,
+	__IFLA_BOND_MAX,
+};
+
+#define IFLA_BOND_MAX	(__IFLA_BOND_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
-- 
cgit v1.2.3


From ec76aa49855f6d6fea5e01de179fb57dd47c619d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Fri, 18 Oct 2013 17:43:39 +0200
Subject: bonding: add Netlink support active_slave option

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_netlink.c | 23 ++++++++++++++++++++++-
 include/uapi/linux/if_link.h       |  1 +
 2 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index a94f870a6b60..fe3500bb34e4 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -22,6 +22,7 @@
 
 static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = {
 	[IFLA_BOND_MODE]		= { .type = NLA_U8 },
+	[IFLA_BOND_ACTIVE_SLAVE]	= { .type = NLA_U32 },
 };
 
 static int bond_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -48,6 +49,22 @@ static int bond_changelink(struct net_device *bond_dev,
 		if (err)
 			return err;
 	}
+	if (data && data[IFLA_BOND_ACTIVE_SLAVE]) {
+		int ifindex = nla_get_u32(data[IFLA_BOND_ACTIVE_SLAVE]);
+		struct net_device *slave_dev;
+
+		if (ifindex == 0) {
+			slave_dev = NULL;
+		} else {
+			slave_dev = __dev_get_by_index(dev_net(bond_dev),
+						       ifindex);
+			if (!slave_dev)
+				return -ENODEV;
+		}
+		err = bond_option_active_slave_set(bond, slave_dev);
+		if (err)
+			return err;
+	}
 	return 0;
 }
 
@@ -66,14 +83,18 @@ static int bond_newlink(struct net *src_net, struct net_device *bond_dev,
 static size_t bond_get_size(const struct net_device *bond_dev)
 {
 	return nla_total_size(sizeof(u8));	/* IFLA_BOND_MODE */
+		+ nla_total_size(sizeof(u32));	/* IFLA_BOND_ACTIVE_SLAVE */
 }
 
 static int bond_fill_info(struct sk_buff *skb,
 			  const struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
+	struct net_device *slave_dev = bond_option_active_slave_get(bond);
 
-	if (nla_put_u8(skb, IFLA_BOND_MODE, bond->params.mode))
+	if (nla_put_u8(skb, IFLA_BOND_MODE, bond->params.mode) ||
+	    (slave_dev &&
+	     nla_put_u32(skb, IFLA_BOND_ACTIVE_SLAVE, slave_dev->ifindex)))
 		goto nla_put_failure;
 	return 0;
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 06fd3fe10f3b..8a1e346243b7 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -330,6 +330,7 @@ struct ifla_vxlan_port_range {
 enum {
 	IFLA_BOND_UNSPEC,
 	IFLA_BOND_MODE,
+	IFLA_BOND_ACTIVE_SLAVE,
 	__IFLA_BOND_MAX,
 };
 
-- 
cgit v1.2.3


From 1bd7116f1cb833c998cddb6b188df463342069d8 Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@nicira.com>
Date: Tue, 22 Oct 2013 10:42:46 -0700
Subject: openvswitch: collect mega flow mask stats

Collect mega flow mask stats. ovs-dpctl show command can be used to
display them for debugging and performance tuning.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 include/uapi/linux/openvswitch.h | 17 ++++++++++++++---
 net/openvswitch/datapath.c       | 38 +++++++++++++++++++++++++++++++-------
 net/openvswitch/datapath.h       |  4 ++++
 net/openvswitch/flow_table.c     | 16 +++++++++++++++-
 net/openvswitch/flow_table.h     |  4 +++-
 5 files changed, 67 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index a74d375b439b..2cc4644f68ef 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -63,15 +63,18 @@ enum ovs_datapath_cmd {
  * not be sent.
  * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
  * datapath.  Always present in notifications.
+ * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
+ * datapath. Always present in notifications.
  *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_DP_* commands.
  */
 enum ovs_datapath_attr {
 	OVS_DP_ATTR_UNSPEC,
-	OVS_DP_ATTR_NAME,       /* name of dp_ifindex netdev */
-	OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */
-	OVS_DP_ATTR_STATS,      /* struct ovs_dp_stats */
+	OVS_DP_ATTR_NAME,		/* name of dp_ifindex netdev */
+	OVS_DP_ATTR_UPCALL_PID,		/* Netlink PID to receive upcalls */
+	OVS_DP_ATTR_STATS,		/* struct ovs_dp_stats */
+	OVS_DP_ATTR_MEGAFLOW_STATS,	/* struct ovs_dp_megaflow_stats */
 	__OVS_DP_ATTR_MAX
 };
 
@@ -84,6 +87,14 @@ struct ovs_dp_stats {
 	__u64 n_flows;           /* Number of flows present */
 };
 
+struct ovs_dp_megaflow_stats {
+	__u64 n_mask_hit;	 /* Number of masks used for flow lookups. */
+	__u32 n_masks;		 /* Number of masks for the datapath. */
+	__u32 pad0;		 /* Pad for future expension. */
+	__u64 pad1;		 /* Pad for future expension. */
+	__u64 pad2;		 /* Pad for future expension. */
+};
+
 struct ovs_vport_stats {
 	__u64   rx_packets;		/* total packets received       */
 	__u64   tx_packets;		/* total packets transmitted    */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index cf270973095d..5bc5a4e64758 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -221,6 +221,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	struct dp_stats_percpu *stats;
 	struct sw_flow_key key;
 	u64 *stats_counter;
+	u32 n_mask_hit;
 	int error;
 
 	stats = this_cpu_ptr(dp->stats_percpu);
@@ -233,7 +234,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	}
 
 	/* Look up flow. */
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
+	flow = ovs_flow_tbl_lookup(&dp->table, &key, &n_mask_hit);
 	if (unlikely(!flow)) {
 		struct dp_upcall_info upcall;
 
@@ -258,6 +259,7 @@ out:
 	/* Update datapath statistics. */
 	u64_stats_update_begin(&stats->sync);
 	(*stats_counter)++;
+	stats->n_mask_hit += n_mask_hit;
 	u64_stats_update_end(&stats->sync);
 }
 
@@ -563,13 +565,18 @@ static struct genl_ops dp_packet_genl_ops[] = {
 	}
 };
 
-static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
+static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats,
+			 struct ovs_dp_megaflow_stats *mega_stats)
 {
 	int i;
 
+	memset(mega_stats, 0, sizeof(*mega_stats));
+
 	stats->n_flows = ovs_flow_tbl_count(&dp->table);
+	mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);
 
 	stats->n_hit = stats->n_missed = stats->n_lost = 0;
+
 	for_each_possible_cpu(i) {
 		const struct dp_stats_percpu *percpu_stats;
 		struct dp_stats_percpu local_stats;
@@ -585,6 +592,7 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
 		stats->n_hit += local_stats.n_hit;
 		stats->n_missed += local_stats.n_missed;
 		stats->n_lost += local_stats.n_lost;
+		mega_stats->n_mask_hit += local_stats.n_mask_hit;
 	}
 }
 
@@ -743,6 +751,14 @@ static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
 	return skb;
 }
 
+static struct sw_flow *__ovs_flow_tbl_lookup(struct flow_table *tbl,
+					      const struct sw_flow_key *key)
+{
+	u32 __always_unused n_mask_hit;
+
+	return ovs_flow_tbl_lookup(tbl, key, &n_mask_hit);
+}
+
 static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nlattr **a = info->attrs;
@@ -793,7 +809,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock_ovs;
 
 	/* Check if this is a duplicate flow */
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
+	flow = __ovs_flow_tbl_lookup(&dp->table, &key);
 	if (!flow) {
 		/* Bail out if we're not allowed to create a new flow. */
 		error = -ENOENT;
@@ -905,7 +921,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
 		goto unlock;
 	}
 
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
+	flow = __ovs_flow_tbl_lookup(&dp->table, &key);
 	if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) {
 		err = -ENOENT;
 		goto unlock;
@@ -953,7 +969,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto unlock;
 
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
+	flow = __ovs_flow_tbl_lookup(&dp->table, &key);
 	if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) {
 		err = -ENOENT;
 		goto unlock;
@@ -1067,6 +1083,7 @@ static size_t ovs_dp_cmd_msg_size(void)
 
 	msgsize += nla_total_size(IFNAMSIZ);
 	msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
+	msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
 
 	return msgsize;
 }
@@ -1076,6 +1093,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
 {
 	struct ovs_header *ovs_header;
 	struct ovs_dp_stats dp_stats;
+	struct ovs_dp_megaflow_stats dp_megaflow_stats;
 	int err;
 
 	ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
@@ -1091,8 +1109,14 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
 	if (err)
 		goto nla_put_failure;
 
-	get_dp_stats(dp, &dp_stats);
-	if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats))
+	get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
+	if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
+			&dp_stats))
+		goto nla_put_failure;
+
+	if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
+			sizeof(struct ovs_dp_megaflow_stats),
+			&dp_megaflow_stats))
 		goto nla_put_failure;
 
 	return genlmsg_end(skb, ovs_header);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index acfd4af8ca3a..d3d14a58aa91 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -46,11 +46,15 @@
  * @n_lost: Number of received packets that had no matching flow in the flow
  * table that could not be sent to userspace (normally due to an overflow in
  * one of the datapath's queues).
+ * @n_mask_hit: Number of masks looked up for flow match.
+ *   @n_mask_hit / (@n_hit + @n_missed)  will be the average masks looked
+ *   up per packet.
  */
 struct dp_stats_percpu {
 	u64 n_hit;
 	u64 n_missed;
 	u64 n_lost;
+	u64 n_mask_hit;
 	struct u64_stats_sync sync;
 };
 
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 036e019f8c3c..536b4d2a42e2 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -430,13 +430,16 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
 }
 
 struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
-				    const struct sw_flow_key *key)
+				    const struct sw_flow_key *key,
+				    u32 *n_mask_hit)
 {
 	struct table_instance *ti = rcu_dereference(tbl->ti);
 	struct sw_flow_mask *mask;
 	struct sw_flow *flow;
 
+	*n_mask_hit = 0;
 	list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
+		(*n_mask_hit)++;
 		flow = masked_flow_lookup(ti, key, mask);
 		if (flow)  /* Found */
 			return flow;
@@ -444,6 +447,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
 	return NULL;
 }
 
+int ovs_flow_tbl_num_masks(const struct flow_table *table)
+{
+	struct sw_flow_mask *mask;
+	int num = 0;
+
+	list_for_each_entry(mask, &table->mask_list, list)
+		num++;
+
+	return num;
+}
+
 static struct table_instance *table_instance_expand(struct table_instance *ti)
 {
 	return table_instance_rehash(ti, ti->n_buckets * 2);
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 4db5f78b6f81..fbe45d5ad07d 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -66,10 +66,12 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table);
 int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
 			struct sw_flow_mask *mask);
 void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
+int  ovs_flow_tbl_num_masks(const struct flow_table *table);
 struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table,
 				       u32 *bucket, u32 *idx);
 struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *,
-				    const struct sw_flow_key *);
+				    const struct sw_flow_key *,
+				    u32 *n_mask_hit);
 
 bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
 			       struct sw_flow_match *match);
-- 
cgit v1.2.3


From ee08997fee16f10be23c9748d609dbdf3baab8e4 Mon Sep 17 00:00:00 2001
From: Dmitry Kasatkin <d.kasatkin@samsung.com>
Date: Mon, 6 May 2013 15:40:01 +0300
Subject: crypto: provide single place for hash algo information

This patch provides a single place for information about hash algorithms,
such as hash sizes and kernel driver names, which will be used by IMA
and the public key code.

Changelog:
- Fix sparse and checkpatch warnings
- Move hash algo enums to uapi for userspace signing functions.

Signed-off-by: Dmitry Kasatkin <d.kasatkin@samsung.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig                 |  3 +++
 crypto/Makefile                |  1 +
 crypto/hash_info.c             | 56 ++++++++++++++++++++++++++++++++++++++++++
 include/crypto/hash_info.h     | 40 ++++++++++++++++++++++++++++++
 include/uapi/linux/hash_info.h | 37 ++++++++++++++++++++++++++++
 5 files changed, 137 insertions(+)
 create mode 100644 crypto/hash_info.c
 create mode 100644 include/crypto/hash_info.h
 create mode 100644 include/uapi/linux/hash_info.h

(limited to 'include/uapi/linux')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 69ce573f1224..ba061b091d9f 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1386,6 +1386,9 @@ config CRYPTO_USER_API_SKCIPHER
 	  This option enables the user-spaces interface for symmetric
 	  key cipher algorithms.
 
+config CRYPTO_HASH_INFO
+	bool
+
 source "drivers/crypto/Kconfig"
 source crypto/asymmetric_keys/Kconfig
 
diff --git a/crypto/Makefile b/crypto/Makefile
index 80019ba8da3a..b3a7e807e08b 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -104,3 +104,4 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 obj-$(CONFIG_XOR_BLOCKS) += xor.o
 obj-$(CONFIG_ASYNC_CORE) += async_tx/
 obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += asymmetric_keys/
+obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o
diff --git a/crypto/hash_info.c b/crypto/hash_info.c
new file mode 100644
index 000000000000..3e7ff46f26e8
--- /dev/null
+++ b/crypto/hash_info.c
@@ -0,0 +1,56 @@
+/*
+ * Hash Info: Hash algorithms information
+ *
+ * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/export.h>
+#include <crypto/hash_info.h>
+
+const char *const hash_algo_name[HASH_ALGO__LAST] = {
+	[HASH_ALGO_MD4]		= "md4",
+	[HASH_ALGO_MD5]		= "md5",
+	[HASH_ALGO_SHA1]	= "sha1",
+	[HASH_ALGO_RIPE_MD_160]	= "rmd160",
+	[HASH_ALGO_SHA256]	= "sha256",
+	[HASH_ALGO_SHA384]	= "sha384",
+	[HASH_ALGO_SHA512]	= "sha512",
+	[HASH_ALGO_SHA224]	= "sha224",
+	[HASH_ALGO_RIPE_MD_128]	= "rmd128",
+	[HASH_ALGO_RIPE_MD_256]	= "rmd256",
+	[HASH_ALGO_RIPE_MD_320]	= "rmd320",
+	[HASH_ALGO_WP_256]	= "wp256",
+	[HASH_ALGO_WP_384]	= "wp384",
+	[HASH_ALGO_WP_512]	= "wp512",
+	[HASH_ALGO_TGR_128]	= "tgr128",
+	[HASH_ALGO_TGR_160]	= "tgr160",
+	[HASH_ALGO_TGR_192]	= "tgr192",
+};
+EXPORT_SYMBOL_GPL(hash_algo_name);
+
+const int hash_digest_size[HASH_ALGO__LAST] = {
+	[HASH_ALGO_MD4]		= MD5_DIGEST_SIZE,
+	[HASH_ALGO_MD5]		= MD5_DIGEST_SIZE,
+	[HASH_ALGO_SHA1]	= SHA1_DIGEST_SIZE,
+	[HASH_ALGO_RIPE_MD_160]	= RMD160_DIGEST_SIZE,
+	[HASH_ALGO_SHA256]	= SHA256_DIGEST_SIZE,
+	[HASH_ALGO_SHA384]	= SHA384_DIGEST_SIZE,
+	[HASH_ALGO_SHA512]	= SHA512_DIGEST_SIZE,
+	[HASH_ALGO_SHA224]	= SHA224_DIGEST_SIZE,
+	[HASH_ALGO_RIPE_MD_128]	= RMD128_DIGEST_SIZE,
+	[HASH_ALGO_RIPE_MD_256]	= RMD256_DIGEST_SIZE,
+	[HASH_ALGO_RIPE_MD_320]	= RMD320_DIGEST_SIZE,
+	[HASH_ALGO_WP_256]	= WP256_DIGEST_SIZE,
+	[HASH_ALGO_WP_384]	= WP384_DIGEST_SIZE,
+	[HASH_ALGO_WP_512]	= WP512_DIGEST_SIZE,
+	[HASH_ALGO_TGR_128]	= TGR128_DIGEST_SIZE,
+	[HASH_ALGO_TGR_160]	= TGR160_DIGEST_SIZE,
+	[HASH_ALGO_TGR_192]	= TGR192_DIGEST_SIZE,
+};
+EXPORT_SYMBOL_GPL(hash_digest_size);
diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h
new file mode 100644
index 000000000000..e1e5a3e5dd1b
--- /dev/null
+++ b/include/crypto/hash_info.h
@@ -0,0 +1,40 @@
+/*
+ * Hash Info: Hash algorithms information
+ *
+ * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _CRYPTO_HASH_INFO_H
+#define _CRYPTO_HASH_INFO_H
+
+#include <crypto/sha.h>
+#include <crypto/md5.h>
+
+#include <uapi/linux/hash_info.h>
+
+/* not defined in include/crypto/ */
+#define RMD128_DIGEST_SIZE      16
+#define RMD160_DIGEST_SIZE	20
+#define RMD256_DIGEST_SIZE      32
+#define RMD320_DIGEST_SIZE      40
+
+/* not defined in include/crypto/ */
+#define WP512_DIGEST_SIZE	64
+#define WP384_DIGEST_SIZE	48
+#define WP256_DIGEST_SIZE	32
+
+/* not defined in include/crypto/ */
+#define TGR128_DIGEST_SIZE 16
+#define TGR160_DIGEST_SIZE 20
+#define TGR192_DIGEST_SIZE 24
+
+extern const char *const hash_algo_name[HASH_ALGO__LAST];
+extern const int hash_digest_size[HASH_ALGO__LAST];
+
+#endif /* _CRYPTO_HASH_INFO_H */
diff --git a/include/uapi/linux/hash_info.h b/include/uapi/linux/hash_info.h
new file mode 100644
index 000000000000..ca18c45f8304
--- /dev/null
+++ b/include/uapi/linux/hash_info.h
@@ -0,0 +1,37 @@
+/*
+ * Hash Info: Hash algorithms information
+ *
+ * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _UAPI_LINUX_HASH_INFO_H
+#define _UAPI_LINUX_HASH_INFO_H
+
+enum hash_algo {
+	HASH_ALGO_MD4,
+	HASH_ALGO_MD5,
+	HASH_ALGO_SHA1,
+	HASH_ALGO_RIPE_MD_160,
+	HASH_ALGO_SHA256,
+	HASH_ALGO_SHA384,
+	HASH_ALGO_SHA512,
+	HASH_ALGO_SHA224,
+	HASH_ALGO_RIPE_MD_128,
+	HASH_ALGO_RIPE_MD_256,
+	HASH_ALGO_RIPE_MD_320,
+	HASH_ALGO_WP_256,
+	HASH_ALGO_WP_384,
+	HASH_ALGO_WP_512,
+	HASH_ALGO_TGR_128,
+	HASH_ALGO_TGR_160,
+	HASH_ALGO_TGR_192,
+	HASH_ALGO__LAST
+};
+
+#endif /* _UAPI_LINUX_HASH_INFO_H */
-- 
cgit v1.2.3


From 5336fa88e8ac6b666a3db9902a4797d94d86a702 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 7 Oct 2013 18:41:05 +0200
Subject: nl80211/cfg80211: enable DFS for IBSS mode

To use DFS in IBSS mode, userspace is required to react to radar events.
It can inform nl80211 that it is capable of doing so by adding a
NL80211_ATTR_HANDLE_DFS attribute when joining the IBSS.

This attribute is supplied to let the kernelspace know that the
userspace application can and will handle radar events, e.g. by
intiating channel switches to a valid channel. DFS channels may
only be used if this attribute is supplied and the driver supports
it. Driver support will be checked even if a channel without DFS
will be initially joined, as a DFS channel may be chosen later.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
[fix attribute name in commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  6 ++++++
 include/uapi/linux/nl80211.h |  9 +++++++++
 net/wireless/chan.c          |  3 ++-
 net/wireless/ibss.c          | 24 ++++++++++++++++++++----
 net/wireless/nl80211.c       |  8 ++++++--
 net/wireless/util.c          | 14 ++++++++++----
 6 files changed, 53 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5db5fe24eff6..b1acf36e5f45 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1664,6 +1664,9 @@ struct cfg80211_disassoc_request {
  *	sets/clears %NL80211_STA_FLAG_AUTHORIZED. If true, the driver is
  *	required to assume that the port is unauthorized until authorized by
  *	user space. Otherwise, port is marked authorized by default.
+ * @userspace_handles_dfs: whether user space controls DFS operation, i.e.
+ *	changes the channel when a radar is detected. This is required
+ *	to operate on DFS channels.
  * @basic_rates: bitmap of basic rates to use when creating the IBSS
  * @mcast_rate: per-band multicast rate index + 1 (0: disabled)
  * @ht_capa:  HT Capabilities over-rides.  Values set in ht_capa_mask
@@ -1681,6 +1684,7 @@ struct cfg80211_ibss_params {
 	bool channel_fixed;
 	bool privacy;
 	bool control_port;
+	bool userspace_handles_dfs;
 	int mcast_rate[IEEE80211_NUM_BANDS];
 	struct ieee80211_ht_cap ht_capa;
 	struct ieee80211_ht_cap ht_capa_mask;
@@ -3061,6 +3065,7 @@ struct cfg80211_cached_keys;
  * @conn: (private) cfg80211 software SME connection state machine data
  * @connect_keys: (private) keys to set after connection is established
  * @ibss_fixed: (private) IBSS is using fixed BSSID
+ * @ibss_dfs_possible: (private) IBSS may change to a DFS channel
  * @event_list: (private) list for internal event processing
  * @event_lock: (private) lock for event list
  */
@@ -3099,6 +3104,7 @@ struct wireless_dev {
 	struct ieee80211_channel *channel;
 
 	bool ibss_fixed;
+	bool ibss_dfs_possible;
 
 	bool ps;
 	int ps_timeout;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index f2aef2a7a570..f752e9821e71 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1501,6 +1501,13 @@ enum nl80211_commands {
  * @NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES: array of supported
  *      supported operating classes.
  *
+ * @NL80211_ATTR_HANDLE_DFS: A flag indicating whether user space
+ *	controls DFS operation in IBSS mode. If the flag is included in
+ *	%NL80211_CMD_JOIN_IBSS request, the driver will allow use of DFS
+ *	channels and reports radar events to userspace. Userspace is required
+ *	to react to radar events, e.g. initiate a channel switch or leave the
+ *	IBSS network.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1815,6 +1822,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES,
 
+	NL80211_ATTR_HANDLE_DFS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 16f3c3a7b2c1..9b8cc877eb19 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -504,7 +504,8 @@ cfg80211_get_chan_state(struct wireless_dev *wdev,
 	case NL80211_IFTYPE_ADHOC:
 		if (wdev->current_bss) {
 			*chan = wdev->current_bss->pub.channel;
-			*chanmode = wdev->ibss_fixed
+			*chanmode = (wdev->ibss_fixed &&
+				     !wdev->ibss_dfs_possible)
 				  ? CHAN_MODE_SHARED
 				  : CHAN_MODE_EXCLUSIVE;
 			return;
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 39bff7d36768..fa7461b6ba39 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -83,6 +83,8 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 			 struct cfg80211_cached_keys *connkeys)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct ieee80211_channel *check_chan;
+	u8 radar_detect_width = 0;
 	int err;
 
 	ASSERT_WDEV_LOCK(wdev);
@@ -114,14 +116,28 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 	wdev->connect_keys = connkeys;
 
 	wdev->ibss_fixed = params->channel_fixed;
+	wdev->ibss_dfs_possible = params->userspace_handles_dfs;
 #ifdef CONFIG_CFG80211_WEXT
 	wdev->wext.ibss.chandef = params->chandef;
 #endif
+	check_chan = params->chandef.chan;
+	if (params->userspace_handles_dfs) {
+		/* use channel NULL to check for radar even if the current
+		 * channel is not a radar channel - it might decide to change
+		 * to DFS channel later.
+		 */
+		radar_detect_width = BIT(params->chandef.width);
+		check_chan = NULL;
+	}
+
+	err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype,
+					   check_chan,
+					   (params->channel_fixed &&
+					    !radar_detect_width)
+					   ? CHAN_MODE_SHARED
+					   : CHAN_MODE_EXCLUSIVE,
+					   radar_detect_width);
 
-	err = cfg80211_can_use_chan(rdev, wdev, params->chandef.chan,
-				    params->channel_fixed
-				    ? CHAN_MODE_SHARED
-				    : CHAN_MODE_EXCLUSIVE);
 	if (err) {
 		wdev->connect_keys = NULL;
 		return err;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 460638ac2d73..7502d33a3a70 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -356,6 +356,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_U16 },
 	[NL80211_ATTR_STA_SUPPORTED_CHANNELS] = { .type = NLA_BINARY },
 	[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] = { .type = NLA_BINARY },
+	[NL80211_ATTR_HANDLE_DFS] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -5768,9 +5769,9 @@ skip_beacons:
 	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef))
 		return -EINVAL;
 
-	/* DFS channels are only supported for AP/P2P GO ... for now. */
 	if (dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP ||
-	    dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO) {
+	    dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO ||
+	    dev->ieee80211_ptr->iftype == NL80211_IFTYPE_ADHOC) {
 		err = cfg80211_chandef_dfs_required(wdev->wiphy,
 						    &params.chandef);
 		if (err < 0) {
@@ -6602,6 +6603,9 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 	ibss.control_port =
 		nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT]);
 
+	ibss.userspace_handles_dfs =
+		nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]);
+
 	err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
 	if (err)
 		kfree(connkeys);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 3c8be6104ba4..935dea9485da 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1249,7 +1249,7 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev,
 	enum cfg80211_chan_mode chmode;
 	int num_different_channels = 0;
 	int total = 1;
-	bool radar_required;
+	bool radar_required = false;
 	int i, j;
 
 	ASSERT_RTNL();
@@ -1264,14 +1264,20 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev,
 	case NL80211_IFTYPE_MESH_POINT:
 	case NL80211_IFTYPE_P2P_GO:
 	case NL80211_IFTYPE_WDS:
-		radar_required = !!(chan &&
-				    (chan->flags & IEEE80211_CHAN_RADAR));
+		/* if the interface could potentially choose a DFS channel,
+		 * then mark DFS as required.
+		 */
+		if (!chan) {
+			if (chanmode != CHAN_MODE_UNDEFINED && radar_detect)
+				radar_required = true;
+			break;
+		}
+		radar_required = !!(chan->flags & IEEE80211_CHAN_RADAR);
 		break;
 	case NL80211_IFTYPE_P2P_CLIENT:
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_DEVICE:
 	case NL80211_IFTYPE_MONITOR:
-		radar_required = false;
 		break;
 	case NUM_NL80211_IFTYPES:
 	case NL80211_IFTYPE_UNSPECIFIED:
-- 
cgit v1.2.3


From 4571912743ac6a04a6644e5a292bb9876bb5329b Mon Sep 17 00:00:00 2001
From: Archit Taneja <archit@ti.com>
Date: Wed, 16 Oct 2013 02:36:47 -0300
Subject: [media] v4l: ti-vpe: Add VPE mem to mem driver

VPE is a block which consists of a single memory to memory path which
can perform chrominance up/down sampling, de-interlacing, scaling, and
color space conversion of raster or tiled YUV420 coplanar, YUV422
coplanar or YUV422 interleaved video formats.

We create a mem2mem driver based primarily on the mem2mem-testdev
example. The de-interlacer, scaler and color space converter are all
bypassed for now to keep the driver simple. Chroma up/down sampler
blocks are implemented, so conversion beteen different YUV formats is
possible.

Each mem2mem context allocates a buffer for VPE MMR values which it will
use when it gets access to the VPE HW via the mem2mem queue, it also
allocates a VPDMA descriptor list to which configuration and data
descriptors are added.

Based on the information received via v4l2 ioctls for the source and
destination queues, the driver configures the values for the MMRs, and
stores them in the buffer. There are also some VPDMA parameters like
frame start and line mode which needs to be configured, these are
configured by direct register writes via the VPDMA helper functions.

The driver's device_run() mem2mem op will add each descriptor based on
how the source and destination queues are set up for the given ctx, once
the list is prepared, it's submitted to VPDMA, these descriptors when
parsed by VPDMA will upload MMR registers, start DMA of video buffers on
the various input and output clients/ports.

When the list is parsed completely(and the DMAs on all the output ports
done), an interrupt is generated which we use to notify that the source
and destination buffers are done. The rest of the driver is quite
similar to other mem2mem drivers, we use the multiplane v4l2 ioctls as
the HW support coplanar formats.

Signed-off-by: Archit Taneja <archit@ti.com>
Acked-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Kamil Debski <k.debski@samsung.com>
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
---
 drivers/media/platform/Kconfig           |   16 +
 drivers/media/platform/Makefile          |    2 +
 drivers/media/platform/ti-vpe/Makefile   |    5 +
 drivers/media/platform/ti-vpe/vpe.c      | 1775 ++++++++++++++++++++++++++++++
 drivers/media/platform/ti-vpe/vpe_regs.h |  496 +++++++++
 include/uapi/linux/v4l2-controls.h       |    4 +
 6 files changed, 2298 insertions(+)
 create mode 100644 drivers/media/platform/ti-vpe/Makefile
 create mode 100644 drivers/media/platform/ti-vpe/vpe.c
 create mode 100644 drivers/media/platform/ti-vpe/vpe_regs.h

(limited to 'include/uapi/linux')

diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig
index 29acc2d2aee6..2405ef7c6604 100644
--- a/drivers/media/platform/Kconfig
+++ b/drivers/media/platform/Kconfig
@@ -220,6 +220,22 @@ config VIDEO_RENESAS_VSP1
 	  To compile this driver as a module, choose M here: the module
 	  will be called vsp1.
 
+config VIDEO_TI_VPE
+	tristate "TI VPE (Video Processing Engine) driver"
+	depends on VIDEO_DEV && VIDEO_V4L2 && SOC_DRA7XX
+	select VIDEOBUF2_DMA_CONTIG
+	select V4L2_MEM2MEM_DEV
+	default n
+	---help---
+	  Support for the TI VPE(Video Processing Engine) block
+	  found on DRA7XX SoC.
+
+config VIDEO_TI_VPE_DEBUG
+	bool "VPE debug messages"
+	depends on VIDEO_TI_VPE
+	---help---
+	  Enable debug messages on VPE driver.
+
 endif # V4L_MEM2MEM_DRIVERS
 
 menuconfig V4L_TEST_DRIVERS
diff --git a/drivers/media/platform/Makefile b/drivers/media/platform/Makefile
index 4e4da482c522..1348ba1faf92 100644
--- a/drivers/media/platform/Makefile
+++ b/drivers/media/platform/Makefile
@@ -22,6 +22,8 @@ obj-$(CONFIG_VIDEO_VIVI) += vivi.o
 
 obj-$(CONFIG_VIDEO_MEM2MEM_TESTDEV) += mem2mem_testdev.o
 
+obj-$(CONFIG_VIDEO_TI_VPE)		+= ti-vpe/
+
 obj-$(CONFIG_VIDEO_MX2_EMMAPRP)		+= mx2_emmaprp.o
 obj-$(CONFIG_VIDEO_CODA) 		+= coda.o
 
diff --git a/drivers/media/platform/ti-vpe/Makefile b/drivers/media/platform/ti-vpe/Makefile
new file mode 100644
index 000000000000..cbf0a806ba1d
--- /dev/null
+++ b/drivers/media/platform/ti-vpe/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_VIDEO_TI_VPE) += ti-vpe.o
+
+ti-vpe-y := vpe.o vpdma.o
+
+ccflags-$(CONFIG_VIDEO_TI_VPE_DEBUG) += -DDEBUG
diff --git a/drivers/media/platform/ti-vpe/vpe.c b/drivers/media/platform/ti-vpe/vpe.c
new file mode 100644
index 000000000000..3bd9ca658b54
--- /dev/null
+++ b/drivers/media/platform/ti-vpe/vpe.c
@@ -0,0 +1,1775 @@
+/*
+ * TI VPE mem2mem driver, based on the virtual v4l2-mem2mem example driver
+ *
+ * Copyright (c) 2013 Texas Instruments Inc.
+ * David Griego, <dagriego@biglakesoftware.com>
+ * Dale Farnsworth, <dale@farnsworth.org>
+ * Archit Taneja, <archit@ti.com>
+ *
+ * Copyright (c) 2009-2010 Samsung Electronics Co., Ltd.
+ * Pawel Osciak, <pawel@osciak.com>
+ * Marek Szyprowski, <m.szyprowski@samsung.com>
+ *
+ * Based on the virtual v4l2-mem2mem example device
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation
+ */
+
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/ioctl.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/videodev2.h>
+
+#include <media/v4l2-common.h>
+#include <media/v4l2-ctrls.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-event.h>
+#include <media/v4l2-ioctl.h>
+#include <media/v4l2-mem2mem.h>
+#include <media/videobuf2-core.h>
+#include <media/videobuf2-dma-contig.h>
+
+#include "vpdma.h"
+#include "vpe_regs.h"
+
+#define VPE_MODULE_NAME "vpe"
+
+/* minimum and maximum frame sizes */
+#define MIN_W		128
+#define MIN_H		128
+#define MAX_W		1920
+#define MAX_H		1080
+
+/* required alignments */
+#define S_ALIGN		0	/* multiple of 1 */
+#define H_ALIGN		1	/* multiple of 2 */
+#define W_ALIGN		1	/* multiple of 2 */
+
+/* multiple of 128 bits, line stride, 16 bytes */
+#define L_ALIGN		4
+
+/* flags that indicate a format can be used for capture/output */
+#define VPE_FMT_TYPE_CAPTURE	(1 << 0)
+#define VPE_FMT_TYPE_OUTPUT	(1 << 1)
+
+/* used as plane indices */
+#define VPE_MAX_PLANES	2
+#define VPE_LUMA	0
+#define VPE_CHROMA	1
+
+/* per m2m context info */
+#define VPE_DEF_BUFS_PER_JOB	1	/* default one buffer per batch job */
+
+/*
+ * each VPE context can need up to 3 config desciptors, 7 input descriptors,
+ * 3 output descriptors, and 10 control descriptors
+ */
+#define VPE_DESC_LIST_SIZE	(10 * VPDMA_DTD_DESC_SIZE +	\
+					13 * VPDMA_CFD_CTD_DESC_SIZE)
+
+#define vpe_dbg(vpedev, fmt, arg...)	\
+		dev_dbg((vpedev)->v4l2_dev.dev, fmt, ##arg)
+#define vpe_err(vpedev, fmt, arg...)	\
+		dev_err((vpedev)->v4l2_dev.dev, fmt, ##arg)
+
+struct vpe_us_coeffs {
+	unsigned short	anchor_fid0_c0;
+	unsigned short	anchor_fid0_c1;
+	unsigned short	anchor_fid0_c2;
+	unsigned short	anchor_fid0_c3;
+	unsigned short	interp_fid0_c0;
+	unsigned short	interp_fid0_c1;
+	unsigned short	interp_fid0_c2;
+	unsigned short	interp_fid0_c3;
+	unsigned short	anchor_fid1_c0;
+	unsigned short	anchor_fid1_c1;
+	unsigned short	anchor_fid1_c2;
+	unsigned short	anchor_fid1_c3;
+	unsigned short	interp_fid1_c0;
+	unsigned short	interp_fid1_c1;
+	unsigned short	interp_fid1_c2;
+	unsigned short	interp_fid1_c3;
+};
+
+/*
+ * Default upsampler coefficients
+ */
+static const struct vpe_us_coeffs us_coeffs[] = {
+	{
+		/* Coefficients for progressive input */
+		0x00C8, 0x0348, 0x0018, 0x3FD8, 0x3FB8, 0x0378, 0x00E8, 0x3FE8,
+		0x00C8, 0x0348, 0x0018, 0x3FD8, 0x3FB8, 0x0378, 0x00E8, 0x3FE8,
+	},
+};
+
+/*
+ * The port_data structure contains per-port data.
+ */
+struct vpe_port_data {
+	enum vpdma_channel channel;	/* VPDMA channel */
+	u8	vb_part;		/* plane index for co-panar formats */
+};
+
+/*
+ * Define indices into the port_data tables
+ */
+#define VPE_PORT_LUMA1_IN	0
+#define VPE_PORT_CHROMA1_IN	1
+#define VPE_PORT_LUMA_OUT	8
+#define VPE_PORT_CHROMA_OUT	9
+#define VPE_PORT_RGB_OUT	10
+
+static const struct vpe_port_data port_data[11] = {
+	[VPE_PORT_LUMA1_IN] = {
+		.channel	= VPE_CHAN_LUMA1_IN,
+		.vb_part	= VPE_LUMA,
+	},
+	[VPE_PORT_CHROMA1_IN] = {
+		.channel	= VPE_CHAN_CHROMA1_IN,
+		.vb_part	= VPE_CHROMA,
+	},
+	[VPE_PORT_LUMA_OUT] = {
+		.channel	= VPE_CHAN_LUMA_OUT,
+		.vb_part	= VPE_LUMA,
+	},
+	[VPE_PORT_CHROMA_OUT] = {
+		.channel	= VPE_CHAN_CHROMA_OUT,
+		.vb_part	= VPE_CHROMA,
+	},
+	[VPE_PORT_RGB_OUT] = {
+		.channel	= VPE_CHAN_RGB_OUT,
+		.vb_part	= VPE_LUMA,
+	},
+};
+
+
+/* driver info for each of the supported video formats */
+struct vpe_fmt {
+	char	*name;			/* human-readable name */
+	u32	fourcc;			/* standard format identifier */
+	u8	types;			/* CAPTURE and/or OUTPUT */
+	u8	coplanar;		/* set for unpacked Luma and Chroma */
+	/* vpdma format info for each plane */
+	struct vpdma_data_format const *vpdma_fmt[VPE_MAX_PLANES];
+};
+
+static struct vpe_fmt vpe_formats[] = {
+	{
+		.name		= "YUV 422 co-planar",
+		.fourcc		= V4L2_PIX_FMT_NV16,
+		.types		= VPE_FMT_TYPE_CAPTURE | VPE_FMT_TYPE_OUTPUT,
+		.coplanar	= 1,
+		.vpdma_fmt	= { &vpdma_yuv_fmts[VPDMA_DATA_FMT_Y444],
+				    &vpdma_yuv_fmts[VPDMA_DATA_FMT_C444],
+				  },
+	},
+	{
+		.name		= "YUV 420 co-planar",
+		.fourcc		= V4L2_PIX_FMT_NV12,
+		.types		= VPE_FMT_TYPE_CAPTURE | VPE_FMT_TYPE_OUTPUT,
+		.coplanar	= 1,
+		.vpdma_fmt	= { &vpdma_yuv_fmts[VPDMA_DATA_FMT_Y420],
+				    &vpdma_yuv_fmts[VPDMA_DATA_FMT_C420],
+				  },
+	},
+	{
+		.name		= "YUYV 422 packed",
+		.fourcc		= V4L2_PIX_FMT_YUYV,
+		.types		= VPE_FMT_TYPE_CAPTURE | VPE_FMT_TYPE_OUTPUT,
+		.coplanar	= 0,
+		.vpdma_fmt	= { &vpdma_yuv_fmts[VPDMA_DATA_FMT_YC422],
+				  },
+	},
+	{
+		.name		= "UYVY 422 packed",
+		.fourcc		= V4L2_PIX_FMT_UYVY,
+		.types		= VPE_FMT_TYPE_CAPTURE | VPE_FMT_TYPE_OUTPUT,
+		.coplanar	= 0,
+		.vpdma_fmt	= { &vpdma_yuv_fmts[VPDMA_DATA_FMT_CY422],
+				  },
+	},
+};
+
+/*
+ * per-queue, driver-specific private data.
+ * there is one source queue and one destination queue for each m2m context.
+ */
+struct vpe_q_data {
+	unsigned int		width;				/* frame width */
+	unsigned int		height;				/* frame height */
+	unsigned int		bytesperline[VPE_MAX_PLANES];	/* bytes per line in memory */
+	enum v4l2_colorspace	colorspace;
+	unsigned int		flags;
+	unsigned int		sizeimage[VPE_MAX_PLANES];	/* image size in memory */
+	struct v4l2_rect	c_rect;				/* crop/compose rectangle */
+	struct vpe_fmt		*fmt;				/* format info */
+};
+
+/* vpe_q_data flag bits */
+#define	Q_DATA_FRAME_1D		(1 << 0)
+#define	Q_DATA_MODE_TILED	(1 << 1)
+
+enum {
+	Q_DATA_SRC = 0,
+	Q_DATA_DST = 1,
+};
+
+/* find our format description corresponding to the passed v4l2_format */
+static struct vpe_fmt *find_format(struct v4l2_format *f)
+{
+	struct vpe_fmt *fmt;
+	unsigned int k;
+
+	for (k = 0; k < ARRAY_SIZE(vpe_formats); k++) {
+		fmt = &vpe_formats[k];
+		if (fmt->fourcc == f->fmt.pix.pixelformat)
+			return fmt;
+	}
+
+	return NULL;
+}
+
+/*
+ * there is one vpe_dev structure in the driver, it is shared by
+ * all instances.
+ */
+struct vpe_dev {
+	struct v4l2_device	v4l2_dev;
+	struct video_device	vfd;
+	struct v4l2_m2m_dev	*m2m_dev;
+
+	atomic_t		num_instances;	/* count of driver instances */
+	dma_addr_t		loaded_mmrs;	/* shadow mmrs in device */
+	struct mutex		dev_mutex;
+	spinlock_t		lock;
+
+	int			irq;
+	void __iomem		*base;
+
+	struct vb2_alloc_ctx	*alloc_ctx;
+	struct vpdma_data	*vpdma;		/* vpdma data handle */
+};
+
+/*
+ * There is one vpe_ctx structure for each m2m context.
+ */
+struct vpe_ctx {
+	struct v4l2_fh		fh;
+	struct vpe_dev		*dev;
+	struct v4l2_m2m_ctx	*m2m_ctx;
+	struct v4l2_ctrl_handler hdl;
+
+	unsigned int		sequence;		/* current frame/field seq */
+	unsigned int		aborting;		/* abort after next irq */
+
+	unsigned int		bufs_per_job;		/* input buffers per batch */
+	unsigned int		bufs_completed;		/* bufs done in this batch */
+
+	struct vpe_q_data	q_data[2];		/* src & dst queue data */
+	struct vb2_buffer	*src_vb;
+	struct vb2_buffer	*dst_vb;
+
+	struct vpdma_buf	mmr_adb;		/* shadow reg addr/data block */
+	struct vpdma_desc_list	desc_list;		/* DMA descriptor list */
+
+	bool			load_mmrs;		/* have new shadow reg values */
+};
+
+
+/*
+ * M2M devices get 2 queues.
+ * Return the queue given the type.
+ */
+static struct vpe_q_data *get_q_data(struct vpe_ctx *ctx,
+				     enum v4l2_buf_type type)
+{
+	switch (type) {
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE:
+		return &ctx->q_data[Q_DATA_SRC];
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE:
+		return &ctx->q_data[Q_DATA_DST];
+	default:
+		BUG();
+	}
+	return NULL;
+}
+
+static u32 read_reg(struct vpe_dev *dev, int offset)
+{
+	return ioread32(dev->base + offset);
+}
+
+static void write_reg(struct vpe_dev *dev, int offset, u32 value)
+{
+	iowrite32(value, dev->base + offset);
+}
+
+/* register field read/write helpers */
+static int get_field(u32 value, u32 mask, int shift)
+{
+	return (value & (mask << shift)) >> shift;
+}
+
+static int read_field_reg(struct vpe_dev *dev, int offset, u32 mask, int shift)
+{
+	return get_field(read_reg(dev, offset), mask, shift);
+}
+
+static void write_field(u32 *valp, u32 field, u32 mask, int shift)
+{
+	u32 val = *valp;
+
+	val &= ~(mask << shift);
+	val |= (field & mask) << shift;
+	*valp = val;
+}
+
+static void write_field_reg(struct vpe_dev *dev, int offset, u32 field,
+		u32 mask, int shift)
+{
+	u32 val = read_reg(dev, offset);
+
+	write_field(&val, field, mask, shift);
+
+	write_reg(dev, offset, val);
+}
+
+/*
+ * DMA address/data block for the shadow registers
+ */
+struct vpe_mmr_adb {
+	struct vpdma_adb_hdr	out_fmt_hdr;
+	u32			out_fmt_reg[1];
+	u32			out_fmt_pad[3];
+	struct vpdma_adb_hdr	us1_hdr;
+	u32			us1_regs[8];
+	struct vpdma_adb_hdr	us2_hdr;
+	u32			us2_regs[8];
+	struct vpdma_adb_hdr	us3_hdr;
+	u32			us3_regs[8];
+	struct vpdma_adb_hdr	dei_hdr;
+	u32			dei_regs[1];
+	u32			dei_pad[3];
+	struct vpdma_adb_hdr	sc_hdr;
+	u32			sc_regs[1];
+	u32			sc_pad[3];
+	struct vpdma_adb_hdr	csc_hdr;
+	u32			csc_regs[6];
+	u32			csc_pad[2];
+};
+
+#define VPE_SET_MMR_ADB_HDR(ctx, hdr, regs, offset_a)	\
+	VPDMA_SET_MMR_ADB_HDR(ctx->mmr_adb, vpe_mmr_adb, hdr, regs, offset_a)
+/*
+ * Set the headers for all of the address/data block structures.
+ */
+static void init_adb_hdrs(struct vpe_ctx *ctx)
+{
+	VPE_SET_MMR_ADB_HDR(ctx, out_fmt_hdr, out_fmt_reg, VPE_CLK_FORMAT_SELECT);
+	VPE_SET_MMR_ADB_HDR(ctx, us1_hdr, us1_regs, VPE_US1_R0);
+	VPE_SET_MMR_ADB_HDR(ctx, us2_hdr, us2_regs, VPE_US2_R0);
+	VPE_SET_MMR_ADB_HDR(ctx, us3_hdr, us3_regs, VPE_US3_R0);
+	VPE_SET_MMR_ADB_HDR(ctx, dei_hdr, dei_regs, VPE_DEI_FRAME_SIZE);
+	VPE_SET_MMR_ADB_HDR(ctx, sc_hdr, sc_regs, VPE_SC_MP_SC0);
+	VPE_SET_MMR_ADB_HDR(ctx, csc_hdr, csc_regs, VPE_CSC_CSC00);
+};
+
+/*
+ * Enable or disable the VPE clocks
+ */
+static void vpe_set_clock_enable(struct vpe_dev *dev, bool on)
+{
+	u32 val = 0;
+
+	if (on)
+		val = VPE_DATA_PATH_CLK_ENABLE | VPE_VPEDMA_CLK_ENABLE;
+	write_reg(dev, VPE_CLK_ENABLE, val);
+}
+
+static void vpe_top_reset(struct vpe_dev *dev)
+{
+
+	write_field_reg(dev, VPE_CLK_RESET, 1, VPE_DATA_PATH_CLK_RESET_MASK,
+		VPE_DATA_PATH_CLK_RESET_SHIFT);
+
+	usleep_range(100, 150);
+
+	write_field_reg(dev, VPE_CLK_RESET, 0, VPE_DATA_PATH_CLK_RESET_MASK,
+		VPE_DATA_PATH_CLK_RESET_SHIFT);
+}
+
+static void vpe_top_vpdma_reset(struct vpe_dev *dev)
+{
+	write_field_reg(dev, VPE_CLK_RESET, 1, VPE_VPDMA_CLK_RESET_MASK,
+		VPE_VPDMA_CLK_RESET_SHIFT);
+
+	usleep_range(100, 150);
+
+	write_field_reg(dev, VPE_CLK_RESET, 0, VPE_VPDMA_CLK_RESET_MASK,
+		VPE_VPDMA_CLK_RESET_SHIFT);
+}
+
+/*
+ * Load the correct of upsampler coefficients into the shadow MMRs
+ */
+static void set_us_coefficients(struct vpe_ctx *ctx)
+{
+	struct vpe_mmr_adb *mmr_adb = ctx->mmr_adb.addr;
+	u32 *us1_reg = &mmr_adb->us1_regs[0];
+	u32 *us2_reg = &mmr_adb->us2_regs[0];
+	u32 *us3_reg = &mmr_adb->us3_regs[0];
+	const unsigned short *cp, *end_cp;
+
+	cp = &us_coeffs[0].anchor_fid0_c0;
+
+	end_cp = cp + sizeof(us_coeffs[0]) / sizeof(*cp);
+
+	while (cp < end_cp) {
+		write_field(us1_reg, *cp++, VPE_US_C0_MASK, VPE_US_C0_SHIFT);
+		write_field(us1_reg, *cp++, VPE_US_C1_MASK, VPE_US_C1_SHIFT);
+		*us2_reg++ = *us1_reg;
+		*us3_reg++ = *us1_reg++;
+	}
+	ctx->load_mmrs = true;
+}
+
+/*
+ * Set the upsampler config mode and the VPDMA line mode in the shadow MMRs.
+ */
+static void set_cfg_and_line_modes(struct vpe_ctx *ctx)
+{
+	struct vpe_fmt *fmt = ctx->q_data[Q_DATA_SRC].fmt;
+	struct vpe_mmr_adb *mmr_adb = ctx->mmr_adb.addr;
+	u32 *us1_reg0 = &mmr_adb->us1_regs[0];
+	u32 *us2_reg0 = &mmr_adb->us2_regs[0];
+	u32 *us3_reg0 = &mmr_adb->us3_regs[0];
+	int line_mode = 1;
+	int cfg_mode = 1;
+
+	/*
+	 * Cfg Mode 0: YUV420 source, enable upsampler, DEI is de-interlacing.
+	 * Cfg Mode 1: YUV422 source, disable upsampler, DEI is de-interlacing.
+	 */
+
+	if (fmt->fourcc == V4L2_PIX_FMT_NV12) {
+		cfg_mode = 0;
+		line_mode = 0;		/* double lines to line buffer */
+	}
+
+	write_field(us1_reg0, cfg_mode, VPE_US_MODE_MASK, VPE_US_MODE_SHIFT);
+	write_field(us2_reg0, cfg_mode, VPE_US_MODE_MASK, VPE_US_MODE_SHIFT);
+	write_field(us3_reg0, cfg_mode, VPE_US_MODE_MASK, VPE_US_MODE_SHIFT);
+
+	/* regs for now */
+	vpdma_set_line_mode(ctx->dev->vpdma, line_mode, VPE_CHAN_CHROMA1_IN);
+
+	/* frame start for input luma */
+	vpdma_set_frame_start_event(ctx->dev->vpdma, VPDMA_FSEVENT_CHANNEL_ACTIVE,
+		VPE_CHAN_LUMA1_IN);
+
+	/* frame start for input chroma */
+	vpdma_set_frame_start_event(ctx->dev->vpdma, VPDMA_FSEVENT_CHANNEL_ACTIVE,
+		VPE_CHAN_CHROMA1_IN);
+
+	ctx->load_mmrs = true;
+}
+
+/*
+ * Set the shadow registers that are modified when the source
+ * format changes.
+ */
+static void set_src_registers(struct vpe_ctx *ctx)
+{
+	set_us_coefficients(ctx);
+}
+
+/*
+ * Set the shadow registers that are modified when the destination
+ * format changes.
+ */
+static void set_dst_registers(struct vpe_ctx *ctx)
+{
+	struct vpe_mmr_adb *mmr_adb = ctx->mmr_adb.addr;
+	struct vpe_fmt *fmt = ctx->q_data[Q_DATA_DST].fmt;
+	u32 val = 0;
+
+	/* select RGB path when color space conversion is supported in future */
+	if (fmt->fourcc == V4L2_PIX_FMT_RGB24)
+		val |= VPE_RGB_OUT_SELECT | VPE_CSC_SRC_DEI_SCALER;
+	else if (fmt->fourcc == V4L2_PIX_FMT_NV16)
+		val |= VPE_COLOR_SEPARATE_422;
+
+	/* The source of CHR_DS is always the scaler, whether it's used or not */
+	val |= VPE_DS_SRC_DEI_SCALER;
+
+	if (fmt->fourcc != V4L2_PIX_FMT_NV12)
+		val |= VPE_DS_BYPASS;
+
+	mmr_adb->out_fmt_reg[0] = val;
+
+	ctx->load_mmrs = true;
+}
+
+/*
+ * Set the de-interlacer shadow register values
+ */
+static void set_dei_regs_bypass(struct vpe_ctx *ctx)
+{
+	struct vpe_mmr_adb *mmr_adb = ctx->mmr_adb.addr;
+	struct vpe_q_data *s_q_data = &ctx->q_data[Q_DATA_SRC];
+	unsigned int src_h = s_q_data->c_rect.height;
+	unsigned int src_w = s_q_data->c_rect.width;
+	u32 *dei_mmr0 = &mmr_adb->dei_regs[0];
+	u32 val = 0;
+
+	/*
+	 * according to TRM, we should set DEI in progressive bypass mode when
+	 * the input content is progressive, however, DEI is bypassed correctly
+	 * for both progressive and interlace content in interlace bypass mode.
+	 * It has been recommended not to use progressive bypass mode.
+	 */
+	val = VPE_DEI_INTERLACE_BYPASS;
+
+	val |= (src_h << VPE_DEI_HEIGHT_SHIFT) |
+		(src_w << VPE_DEI_WIDTH_SHIFT) |
+		VPE_DEI_FIELD_FLUSH;
+
+	*dei_mmr0 = val;
+
+	ctx->load_mmrs = true;
+}
+
+static void set_csc_coeff_bypass(struct vpe_ctx *ctx)
+{
+	struct vpe_mmr_adb *mmr_adb = ctx->mmr_adb.addr;
+	u32 *shadow_csc_reg5 = &mmr_adb->csc_regs[5];
+
+	*shadow_csc_reg5 |= VPE_CSC_BYPASS;
+
+	ctx->load_mmrs = true;
+}
+
+static void set_sc_regs_bypass(struct vpe_ctx *ctx)
+{
+	struct vpe_mmr_adb *mmr_adb = ctx->mmr_adb.addr;
+	u32 *sc_reg0 = &mmr_adb->sc_regs[0];
+	u32 val = 0;
+
+	val |= VPE_SC_BYPASS;
+	*sc_reg0 = val;
+
+	ctx->load_mmrs = true;
+}
+
+/*
+ * Set the shadow registers whose values are modified when either the
+ * source or destination format is changed.
+ */
+static int set_srcdst_params(struct vpe_ctx *ctx)
+{
+	ctx->sequence = 0;
+
+	set_cfg_and_line_modes(ctx);
+	set_dei_regs_bypass(ctx);
+	set_csc_coeff_bypass(ctx);
+	set_sc_regs_bypass(ctx);
+
+	return 0;
+}
+
+/*
+ * Return the vpe_ctx structure for a given struct file
+ */
+static struct vpe_ctx *file2ctx(struct file *file)
+{
+	return container_of(file->private_data, struct vpe_ctx, fh);
+}
+
+/*
+ * mem2mem callbacks
+ */
+
+/**
+ * job_ready() - check whether an instance is ready to be scheduled to run
+ */
+static int job_ready(void *priv)
+{
+	struct vpe_ctx *ctx = priv;
+	int needed = ctx->bufs_per_job;
+
+	if (v4l2_m2m_num_src_bufs_ready(ctx->m2m_ctx) < needed)
+		return 0;
+
+	return 1;
+}
+
+static void job_abort(void *priv)
+{
+	struct vpe_ctx *ctx = priv;
+
+	/* Will cancel the transaction in the next interrupt handler */
+	ctx->aborting = 1;
+}
+
+/*
+ * Lock access to the device
+ */
+static void vpe_lock(void *priv)
+{
+	struct vpe_ctx *ctx = priv;
+	struct vpe_dev *dev = ctx->dev;
+	mutex_lock(&dev->dev_mutex);
+}
+
+static void vpe_unlock(void *priv)
+{
+	struct vpe_ctx *ctx = priv;
+	struct vpe_dev *dev = ctx->dev;
+	mutex_unlock(&dev->dev_mutex);
+}
+
+static void vpe_dump_regs(struct vpe_dev *dev)
+{
+#define DUMPREG(r) vpe_dbg(dev, "%-35s %08x\n", #r, read_reg(dev, VPE_##r))
+
+	vpe_dbg(dev, "VPE Registers:\n");
+
+	DUMPREG(PID);
+	DUMPREG(SYSCONFIG);
+	DUMPREG(INT0_STATUS0_RAW);
+	DUMPREG(INT0_STATUS0);
+	DUMPREG(INT0_ENABLE0);
+	DUMPREG(INT0_STATUS1_RAW);
+	DUMPREG(INT0_STATUS1);
+	DUMPREG(INT0_ENABLE1);
+	DUMPREG(CLK_ENABLE);
+	DUMPREG(CLK_RESET);
+	DUMPREG(CLK_FORMAT_SELECT);
+	DUMPREG(CLK_RANGE_MAP);
+	DUMPREG(US1_R0);
+	DUMPREG(US1_R1);
+	DUMPREG(US1_R2);
+	DUMPREG(US1_R3);
+	DUMPREG(US1_R4);
+	DUMPREG(US1_R5);
+	DUMPREG(US1_R6);
+	DUMPREG(US1_R7);
+	DUMPREG(US2_R0);
+	DUMPREG(US2_R1);
+	DUMPREG(US2_R2);
+	DUMPREG(US2_R3);
+	DUMPREG(US2_R4);
+	DUMPREG(US2_R5);
+	DUMPREG(US2_R6);
+	DUMPREG(US2_R7);
+	DUMPREG(US3_R0);
+	DUMPREG(US3_R1);
+	DUMPREG(US3_R2);
+	DUMPREG(US3_R3);
+	DUMPREG(US3_R4);
+	DUMPREG(US3_R5);
+	DUMPREG(US3_R6);
+	DUMPREG(US3_R7);
+	DUMPREG(DEI_FRAME_SIZE);
+	DUMPREG(MDT_BYPASS);
+	DUMPREG(MDT_SF_THRESHOLD);
+	DUMPREG(EDI_CONFIG);
+	DUMPREG(DEI_EDI_LUT_R0);
+	DUMPREG(DEI_EDI_LUT_R1);
+	DUMPREG(DEI_EDI_LUT_R2);
+	DUMPREG(DEI_EDI_LUT_R3);
+	DUMPREG(DEI_FMD_WINDOW_R0);
+	DUMPREG(DEI_FMD_WINDOW_R1);
+	DUMPREG(DEI_FMD_CONTROL_R0);
+	DUMPREG(DEI_FMD_CONTROL_R1);
+	DUMPREG(DEI_FMD_STATUS_R0);
+	DUMPREG(DEI_FMD_STATUS_R1);
+	DUMPREG(DEI_FMD_STATUS_R2);
+	DUMPREG(SC_MP_SC0);
+	DUMPREG(SC_MP_SC1);
+	DUMPREG(SC_MP_SC2);
+	DUMPREG(SC_MP_SC3);
+	DUMPREG(SC_MP_SC4);
+	DUMPREG(SC_MP_SC5);
+	DUMPREG(SC_MP_SC6);
+	DUMPREG(SC_MP_SC8);
+	DUMPREG(SC_MP_SC9);
+	DUMPREG(SC_MP_SC10);
+	DUMPREG(SC_MP_SC11);
+	DUMPREG(SC_MP_SC12);
+	DUMPREG(SC_MP_SC13);
+	DUMPREG(SC_MP_SC17);
+	DUMPREG(SC_MP_SC18);
+	DUMPREG(SC_MP_SC19);
+	DUMPREG(SC_MP_SC20);
+	DUMPREG(SC_MP_SC21);
+	DUMPREG(SC_MP_SC22);
+	DUMPREG(SC_MP_SC23);
+	DUMPREG(SC_MP_SC24);
+	DUMPREG(SC_MP_SC25);
+	DUMPREG(CSC_CSC00);
+	DUMPREG(CSC_CSC01);
+	DUMPREG(CSC_CSC02);
+	DUMPREG(CSC_CSC03);
+	DUMPREG(CSC_CSC04);
+	DUMPREG(CSC_CSC05);
+#undef DUMPREG
+}
+
+static void add_out_dtd(struct vpe_ctx *ctx, int port)
+{
+	struct vpe_q_data *q_data = &ctx->q_data[Q_DATA_DST];
+	const struct vpe_port_data *p_data = &port_data[port];
+	struct vb2_buffer *vb = ctx->dst_vb;
+	struct v4l2_rect *c_rect = &q_data->c_rect;
+	struct vpe_fmt *fmt = q_data->fmt;
+	const struct vpdma_data_format *vpdma_fmt;
+	int plane = fmt->coplanar ? p_data->vb_part : 0;
+	dma_addr_t dma_addr;
+	u32 flags = 0;
+
+	vpdma_fmt = fmt->vpdma_fmt[plane];
+	dma_addr = vb2_dma_contig_plane_dma_addr(vb, plane);
+	if (!dma_addr) {
+		vpe_err(ctx->dev,
+			"acquiring output buffer(%d) dma_addr failed\n",
+			port);
+		return;
+	}
+
+	if (q_data->flags & Q_DATA_FRAME_1D)
+		flags |= VPDMA_DATA_FRAME_1D;
+	if (q_data->flags & Q_DATA_MODE_TILED)
+		flags |= VPDMA_DATA_MODE_TILED;
+
+	vpdma_add_out_dtd(&ctx->desc_list, c_rect, vpdma_fmt, dma_addr,
+		p_data->channel, flags);
+}
+
+static void add_in_dtd(struct vpe_ctx *ctx, int port)
+{
+	struct vpe_q_data *q_data = &ctx->q_data[Q_DATA_SRC];
+	const struct vpe_port_data *p_data = &port_data[port];
+	struct vb2_buffer *vb = ctx->src_vb;
+	struct v4l2_rect *c_rect = &q_data->c_rect;
+	struct vpe_fmt *fmt = q_data->fmt;
+	const struct vpdma_data_format *vpdma_fmt;
+	int plane = fmt->coplanar ? p_data->vb_part : 0;
+	int field = 0;
+	dma_addr_t dma_addr;
+	u32 flags = 0;
+
+	vpdma_fmt = fmt->vpdma_fmt[plane];
+
+	dma_addr = vb2_dma_contig_plane_dma_addr(vb, plane);
+	if (!dma_addr) {
+		vpe_err(ctx->dev,
+			"acquiring input buffer(%d) dma_addr failed\n",
+			port);
+		return;
+	}
+
+	if (q_data->flags & Q_DATA_FRAME_1D)
+		flags |= VPDMA_DATA_FRAME_1D;
+	if (q_data->flags & Q_DATA_MODE_TILED)
+		flags |= VPDMA_DATA_MODE_TILED;
+
+	vpdma_add_in_dtd(&ctx->desc_list, q_data->width, q_data->height,
+		c_rect, vpdma_fmt, dma_addr, p_data->channel, field, flags);
+}
+
+/*
+ * Enable the expected IRQ sources
+ */
+static void enable_irqs(struct vpe_ctx *ctx)
+{
+	write_reg(ctx->dev, VPE_INT0_ENABLE0_SET, VPE_INT0_LIST0_COMPLETE);
+	write_reg(ctx->dev, VPE_INT0_ENABLE1_SET, VPE_DS1_UV_ERROR_INT);
+
+	vpdma_enable_list_complete_irq(ctx->dev->vpdma, 0, true);
+}
+
+static void disable_irqs(struct vpe_ctx *ctx)
+{
+	write_reg(ctx->dev, VPE_INT0_ENABLE0_CLR, 0xffffffff);
+	write_reg(ctx->dev, VPE_INT0_ENABLE1_CLR, 0xffffffff);
+
+	vpdma_enable_list_complete_irq(ctx->dev->vpdma, 0, false);
+}
+
+/* device_run() - prepares and starts the device
+ *
+ * This function is only called when both the source and destination
+ * buffers are in place.
+ */
+static void device_run(void *priv)
+{
+	struct vpe_ctx *ctx = priv;
+	struct vpe_q_data *d_q_data = &ctx->q_data[Q_DATA_DST];
+
+	ctx->src_vb = v4l2_m2m_src_buf_remove(ctx->m2m_ctx);
+	WARN_ON(ctx->src_vb == NULL);
+	ctx->dst_vb = v4l2_m2m_dst_buf_remove(ctx->m2m_ctx);
+	WARN_ON(ctx->dst_vb == NULL);
+
+	/* config descriptors */
+	if (ctx->dev->loaded_mmrs != ctx->mmr_adb.dma_addr || ctx->load_mmrs) {
+		vpdma_map_desc_buf(ctx->dev->vpdma, &ctx->mmr_adb);
+		vpdma_add_cfd_adb(&ctx->desc_list, CFD_MMR_CLIENT, &ctx->mmr_adb);
+		ctx->dev->loaded_mmrs = ctx->mmr_adb.dma_addr;
+		ctx->load_mmrs = false;
+	}
+
+	add_out_dtd(ctx, VPE_PORT_LUMA_OUT);
+	if (d_q_data->fmt->coplanar)
+		add_out_dtd(ctx, VPE_PORT_CHROMA_OUT);
+
+	add_in_dtd(ctx, VPE_PORT_LUMA1_IN);
+	add_in_dtd(ctx, VPE_PORT_CHROMA1_IN);
+
+	/* sync on channel control descriptors for input ports */
+	vpdma_add_sync_on_channel_ctd(&ctx->desc_list, VPE_CHAN_LUMA1_IN);
+	vpdma_add_sync_on_channel_ctd(&ctx->desc_list, VPE_CHAN_CHROMA1_IN);
+
+	/* sync on channel control descriptors for output ports */
+	vpdma_add_sync_on_channel_ctd(&ctx->desc_list, VPE_CHAN_LUMA_OUT);
+	if (d_q_data->fmt->coplanar)
+		vpdma_add_sync_on_channel_ctd(&ctx->desc_list, VPE_CHAN_CHROMA_OUT);
+
+	enable_irqs(ctx);
+
+	vpdma_map_desc_buf(ctx->dev->vpdma, &ctx->desc_list.buf);
+	vpdma_submit_descs(ctx->dev->vpdma, &ctx->desc_list);
+}
+
+static void ds1_uv_error(struct vpe_ctx *ctx)
+{
+	dev_warn(ctx->dev->v4l2_dev.dev,
+		"received downsampler error interrupt\n");
+}
+
+static irqreturn_t vpe_irq(int irq_vpe, void *data)
+{
+	struct vpe_dev *dev = (struct vpe_dev *)data;
+	struct vpe_ctx *ctx;
+	struct vb2_buffer *s_vb, *d_vb;
+	struct v4l2_buffer *s_buf, *d_buf;
+	unsigned long flags;
+	u32 irqst0, irqst1;
+
+	irqst0 = read_reg(dev, VPE_INT0_STATUS0);
+	if (irqst0) {
+		write_reg(dev, VPE_INT0_STATUS0_CLR, irqst0);
+		vpe_dbg(dev, "INT0_STATUS0 = 0x%08x\n", irqst0);
+	}
+
+	irqst1 = read_reg(dev, VPE_INT0_STATUS1);
+	if (irqst1) {
+		write_reg(dev, VPE_INT0_STATUS1_CLR, irqst1);
+		vpe_dbg(dev, "INT0_STATUS1 = 0x%08x\n", irqst1);
+	}
+
+	ctx = v4l2_m2m_get_curr_priv(dev->m2m_dev);
+	if (!ctx) {
+		vpe_err(dev, "instance released before end of transaction\n");
+		goto handled;
+	}
+
+	if (irqst1 & VPE_DS1_UV_ERROR_INT) {
+		irqst1 &= ~VPE_DS1_UV_ERROR_INT;
+		ds1_uv_error(ctx);
+	}
+
+	if (irqst0) {
+		if (irqst0 & VPE_INT0_LIST0_COMPLETE)
+			vpdma_clear_list_stat(ctx->dev->vpdma);
+
+		irqst0 &= ~(VPE_INT0_LIST0_COMPLETE);
+	}
+
+	if (irqst0 | irqst1) {
+		dev_warn(dev->v4l2_dev.dev, "Unexpected interrupt: "
+			"INT0_STATUS0 = 0x%08x, INT0_STATUS1 = 0x%08x\n",
+			irqst0, irqst1);
+	}
+
+	disable_irqs(ctx);
+
+	vpdma_unmap_desc_buf(dev->vpdma, &ctx->desc_list.buf);
+	vpdma_unmap_desc_buf(dev->vpdma, &ctx->mmr_adb);
+
+	vpdma_reset_desc_list(&ctx->desc_list);
+
+	if (ctx->aborting)
+		goto finished;
+
+	s_vb = ctx->src_vb;
+	d_vb = ctx->dst_vb;
+	s_buf = &s_vb->v4l2_buf;
+	d_buf = &d_vb->v4l2_buf;
+
+	d_buf->timestamp = s_buf->timestamp;
+	if (s_buf->flags & V4L2_BUF_FLAG_TIMECODE) {
+		d_buf->flags |= V4L2_BUF_FLAG_TIMECODE;
+		d_buf->timecode = s_buf->timecode;
+	}
+
+	d_buf->sequence = ctx->sequence;
+
+	ctx->sequence++;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	v4l2_m2m_buf_done(s_vb, VB2_BUF_STATE_DONE);
+	v4l2_m2m_buf_done(d_vb, VB2_BUF_STATE_DONE);
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	ctx->bufs_completed++;
+	if (ctx->bufs_completed < ctx->bufs_per_job) {
+		device_run(ctx);
+		goto handled;
+	}
+
+finished:
+	vpe_dbg(ctx->dev, "finishing transaction\n");
+	ctx->bufs_completed = 0;
+	v4l2_m2m_job_finish(dev->m2m_dev, ctx->m2m_ctx);
+handled:
+	return IRQ_HANDLED;
+}
+
+/*
+ * video ioctls
+ */
+static int vpe_querycap(struct file *file, void *priv,
+			struct v4l2_capability *cap)
+{
+	strncpy(cap->driver, VPE_MODULE_NAME, sizeof(cap->driver) - 1);
+	strncpy(cap->card, VPE_MODULE_NAME, sizeof(cap->card) - 1);
+	strlcpy(cap->bus_info, VPE_MODULE_NAME, sizeof(cap->bus_info));
+	cap->device_caps  = V4L2_CAP_VIDEO_M2M | V4L2_CAP_STREAMING;
+	cap->capabilities = cap->device_caps | V4L2_CAP_DEVICE_CAPS;
+	return 0;
+}
+
+static int __enum_fmt(struct v4l2_fmtdesc *f, u32 type)
+{
+	int i, index;
+	struct vpe_fmt *fmt = NULL;
+
+	index = 0;
+	for (i = 0; i < ARRAY_SIZE(vpe_formats); ++i) {
+		if (vpe_formats[i].types & type) {
+			if (index == f->index) {
+				fmt = &vpe_formats[i];
+				break;
+			}
+			index++;
+		}
+	}
+
+	if (!fmt)
+		return -EINVAL;
+
+	strncpy(f->description, fmt->name, sizeof(f->description) - 1);
+	f->pixelformat = fmt->fourcc;
+	return 0;
+}
+
+static int vpe_enum_fmt(struct file *file, void *priv,
+				struct v4l2_fmtdesc *f)
+{
+	if (V4L2_TYPE_IS_OUTPUT(f->type))
+		return __enum_fmt(f, VPE_FMT_TYPE_OUTPUT);
+
+	return __enum_fmt(f, VPE_FMT_TYPE_CAPTURE);
+}
+
+static int vpe_g_fmt(struct file *file, void *priv, struct v4l2_format *f)
+{
+	struct v4l2_pix_format_mplane *pix = &f->fmt.pix_mp;
+	struct vpe_ctx *ctx = file2ctx(file);
+	struct vb2_queue *vq;
+	struct vpe_q_data *q_data;
+	int i;
+
+	vq = v4l2_m2m_get_vq(ctx->m2m_ctx, f->type);
+	if (!vq)
+		return -EINVAL;
+
+	q_data = get_q_data(ctx, f->type);
+
+	pix->width = q_data->width;
+	pix->height = q_data->height;
+	pix->pixelformat = q_data->fmt->fourcc;
+
+	if (V4L2_TYPE_IS_OUTPUT(f->type)) {
+		pix->colorspace = q_data->colorspace;
+	} else {
+		struct vpe_q_data *s_q_data;
+
+		/* get colorspace from the source queue */
+		s_q_data = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE);
+
+		pix->colorspace = s_q_data->colorspace;
+	}
+
+	pix->num_planes = q_data->fmt->coplanar ? 2 : 1;
+
+	for (i = 0; i < pix->num_planes; i++) {
+		pix->plane_fmt[i].bytesperline = q_data->bytesperline[i];
+		pix->plane_fmt[i].sizeimage = q_data->sizeimage[i];
+	}
+
+	return 0;
+}
+
+static int __vpe_try_fmt(struct vpe_ctx *ctx, struct v4l2_format *f,
+		       struct vpe_fmt *fmt, int type)
+{
+	struct v4l2_pix_format_mplane *pix = &f->fmt.pix_mp;
+	struct v4l2_plane_pix_format *plane_fmt;
+	int i;
+
+	if (!fmt || !(fmt->types & type)) {
+		vpe_err(ctx->dev, "Fourcc format (0x%08x) invalid.\n",
+			pix->pixelformat);
+		return -EINVAL;
+	}
+
+	pix->field = V4L2_FIELD_NONE;
+
+	v4l_bound_align_image(&pix->width, MIN_W, MAX_W, W_ALIGN,
+			      &pix->height, MIN_H, MAX_H, H_ALIGN,
+			      S_ALIGN);
+
+	pix->num_planes = fmt->coplanar ? 2 : 1;
+	pix->pixelformat = fmt->fourcc;
+
+	if (type == VPE_FMT_TYPE_CAPTURE) {
+		struct vpe_q_data *s_q_data;
+
+		/* get colorspace from the source queue */
+		s_q_data = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE);
+
+		pix->colorspace = s_q_data->colorspace;
+	} else {
+		if (!pix->colorspace)
+			pix->colorspace = V4L2_COLORSPACE_SMPTE240M;
+	}
+
+	for (i = 0; i < pix->num_planes; i++) {
+		int depth;
+
+		plane_fmt = &pix->plane_fmt[i];
+		depth = fmt->vpdma_fmt[i]->depth;
+
+		if (i == VPE_LUMA)
+			plane_fmt->bytesperline =
+					round_up((pix->width * depth) >> 3,
+						1 << L_ALIGN);
+		else
+			plane_fmt->bytesperline = pix->width;
+
+		plane_fmt->sizeimage =
+				(pix->height * pix->width * depth) >> 3;
+	}
+
+	return 0;
+}
+
+static int vpe_try_fmt(struct file *file, void *priv, struct v4l2_format *f)
+{
+	struct vpe_ctx *ctx = file2ctx(file);
+	struct vpe_fmt *fmt = find_format(f);
+
+	if (V4L2_TYPE_IS_OUTPUT(f->type))
+		return __vpe_try_fmt(ctx, f, fmt, VPE_FMT_TYPE_OUTPUT);
+	else
+		return __vpe_try_fmt(ctx, f, fmt, VPE_FMT_TYPE_CAPTURE);
+}
+
+static int __vpe_s_fmt(struct vpe_ctx *ctx, struct v4l2_format *f)
+{
+	struct v4l2_pix_format_mplane *pix = &f->fmt.pix_mp;
+	struct v4l2_plane_pix_format *plane_fmt;
+	struct vpe_q_data *q_data;
+	struct vb2_queue *vq;
+	int i;
+
+	vq = v4l2_m2m_get_vq(ctx->m2m_ctx, f->type);
+	if (!vq)
+		return -EINVAL;
+
+	if (vb2_is_busy(vq)) {
+		vpe_err(ctx->dev, "queue busy\n");
+		return -EBUSY;
+	}
+
+	q_data = get_q_data(ctx, f->type);
+	if (!q_data)
+		return -EINVAL;
+
+	q_data->fmt		= find_format(f);
+	q_data->width		= pix->width;
+	q_data->height		= pix->height;
+	q_data->colorspace	= pix->colorspace;
+
+	for (i = 0; i < pix->num_planes; i++) {
+		plane_fmt = &pix->plane_fmt[i];
+
+		q_data->bytesperline[i]	= plane_fmt->bytesperline;
+		q_data->sizeimage[i]	= plane_fmt->sizeimage;
+	}
+
+	q_data->c_rect.left	= 0;
+	q_data->c_rect.top	= 0;
+	q_data->c_rect.width	= q_data->width;
+	q_data->c_rect.height	= q_data->height;
+
+	vpe_dbg(ctx->dev, "Setting format for type %d, wxh: %dx%d, fmt: %d bpl_y %d",
+		f->type, q_data->width, q_data->height, q_data->fmt->fourcc,
+		q_data->bytesperline[VPE_LUMA]);
+	if (q_data->fmt->coplanar)
+		vpe_dbg(ctx->dev, " bpl_uv %d\n",
+			q_data->bytesperline[VPE_CHROMA]);
+
+	return 0;
+}
+
+static int vpe_s_fmt(struct file *file, void *priv, struct v4l2_format *f)
+{
+	int ret;
+	struct vpe_ctx *ctx = file2ctx(file);
+
+	ret = vpe_try_fmt(file, priv, f);
+	if (ret)
+		return ret;
+
+	ret = __vpe_s_fmt(ctx, f);
+	if (ret)
+		return ret;
+
+	if (V4L2_TYPE_IS_OUTPUT(f->type))
+		set_src_registers(ctx);
+	else
+		set_dst_registers(ctx);
+
+	return set_srcdst_params(ctx);
+}
+
+static int vpe_reqbufs(struct file *file, void *priv,
+		       struct v4l2_requestbuffers *reqbufs)
+{
+	struct vpe_ctx *ctx = file2ctx(file);
+
+	return v4l2_m2m_reqbufs(file, ctx->m2m_ctx, reqbufs);
+}
+
+static int vpe_querybuf(struct file *file, void *priv, struct v4l2_buffer *buf)
+{
+	struct vpe_ctx *ctx = file2ctx(file);
+
+	return v4l2_m2m_querybuf(file, ctx->m2m_ctx, buf);
+}
+
+static int vpe_qbuf(struct file *file, void *priv, struct v4l2_buffer *buf)
+{
+	struct vpe_ctx *ctx = file2ctx(file);
+
+	return v4l2_m2m_qbuf(file, ctx->m2m_ctx, buf);
+}
+
+static int vpe_dqbuf(struct file *file, void *priv, struct v4l2_buffer *buf)
+{
+	struct vpe_ctx *ctx = file2ctx(file);
+
+	return v4l2_m2m_dqbuf(file, ctx->m2m_ctx, buf);
+}
+
+static int vpe_streamon(struct file *file, void *priv, enum v4l2_buf_type type)
+{
+	struct vpe_ctx *ctx = file2ctx(file);
+
+	return v4l2_m2m_streamon(file, ctx->m2m_ctx, type);
+}
+
+static int vpe_streamoff(struct file *file, void *priv, enum v4l2_buf_type type)
+{
+	struct vpe_ctx *ctx = file2ctx(file);
+
+	vpe_dump_regs(ctx->dev);
+	vpdma_dump_regs(ctx->dev->vpdma);
+
+	return v4l2_m2m_streamoff(file, ctx->m2m_ctx, type);
+}
+
+/*
+ * defines number of buffers/frames a context can process with VPE before
+ * switching to a different context. default value is 1 buffer per context
+ */
+#define V4L2_CID_VPE_BUFS_PER_JOB		(V4L2_CID_USER_TI_VPE_BASE + 0)
+
+static int vpe_s_ctrl(struct v4l2_ctrl *ctrl)
+{
+	struct vpe_ctx *ctx =
+		container_of(ctrl->handler, struct vpe_ctx, hdl);
+
+	switch (ctrl->id) {
+	case V4L2_CID_VPE_BUFS_PER_JOB:
+		ctx->bufs_per_job = ctrl->val;
+		break;
+
+	default:
+		vpe_err(ctx->dev, "Invalid control\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct v4l2_ctrl_ops vpe_ctrl_ops = {
+	.s_ctrl = vpe_s_ctrl,
+};
+
+static const struct v4l2_ioctl_ops vpe_ioctl_ops = {
+	.vidioc_querycap	= vpe_querycap,
+
+	.vidioc_enum_fmt_vid_cap_mplane = vpe_enum_fmt,
+	.vidioc_g_fmt_vid_cap_mplane	= vpe_g_fmt,
+	.vidioc_try_fmt_vid_cap_mplane	= vpe_try_fmt,
+	.vidioc_s_fmt_vid_cap_mplane	= vpe_s_fmt,
+
+	.vidioc_enum_fmt_vid_out_mplane = vpe_enum_fmt,
+	.vidioc_g_fmt_vid_out_mplane	= vpe_g_fmt,
+	.vidioc_try_fmt_vid_out_mplane	= vpe_try_fmt,
+	.vidioc_s_fmt_vid_out_mplane	= vpe_s_fmt,
+
+	.vidioc_reqbufs		= vpe_reqbufs,
+	.vidioc_querybuf	= vpe_querybuf,
+
+	.vidioc_qbuf		= vpe_qbuf,
+	.vidioc_dqbuf		= vpe_dqbuf,
+
+	.vidioc_streamon	= vpe_streamon,
+	.vidioc_streamoff	= vpe_streamoff,
+	.vidioc_subscribe_event = v4l2_ctrl_subscribe_event,
+	.vidioc_unsubscribe_event = v4l2_event_unsubscribe,
+};
+
+/*
+ * Queue operations
+ */
+static int vpe_queue_setup(struct vb2_queue *vq,
+			   const struct v4l2_format *fmt,
+			   unsigned int *nbuffers, unsigned int *nplanes,
+			   unsigned int sizes[], void *alloc_ctxs[])
+{
+	int i;
+	struct vpe_ctx *ctx = vb2_get_drv_priv(vq);
+	struct vpe_q_data *q_data;
+
+	q_data = get_q_data(ctx, vq->type);
+
+	*nplanes = q_data->fmt->coplanar ? 2 : 1;
+
+	for (i = 0; i < *nplanes; i++) {
+		sizes[i] = q_data->sizeimage[i];
+		alloc_ctxs[i] = ctx->dev->alloc_ctx;
+	}
+
+	vpe_dbg(ctx->dev, "get %d buffer(s) of size %d", *nbuffers,
+		sizes[VPE_LUMA]);
+	if (q_data->fmt->coplanar)
+		vpe_dbg(ctx->dev, " and %d\n", sizes[VPE_CHROMA]);
+
+	return 0;
+}
+
+static int vpe_buf_prepare(struct vb2_buffer *vb)
+{
+	struct vpe_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
+	struct vpe_q_data *q_data;
+	int i, num_planes;
+
+	vpe_dbg(ctx->dev, "type: %d\n", vb->vb2_queue->type);
+
+	q_data = get_q_data(ctx, vb->vb2_queue->type);
+	num_planes = q_data->fmt->coplanar ? 2 : 1;
+
+	for (i = 0; i < num_planes; i++) {
+		if (vb2_plane_size(vb, i) < q_data->sizeimage[i]) {
+			vpe_err(ctx->dev,
+				"data will not fit into plane (%lu < %lu)\n",
+				vb2_plane_size(vb, i),
+				(long) q_data->sizeimage[i]);
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; i < num_planes; i++)
+		vb2_set_plane_payload(vb, i, q_data->sizeimage[i]);
+
+	return 0;
+}
+
+static void vpe_buf_queue(struct vb2_buffer *vb)
+{
+	struct vpe_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
+	v4l2_m2m_buf_queue(ctx->m2m_ctx, vb);
+}
+
+static void vpe_wait_prepare(struct vb2_queue *q)
+{
+	struct vpe_ctx *ctx = vb2_get_drv_priv(q);
+	vpe_unlock(ctx);
+}
+
+static void vpe_wait_finish(struct vb2_queue *q)
+{
+	struct vpe_ctx *ctx = vb2_get_drv_priv(q);
+	vpe_lock(ctx);
+}
+
+static struct vb2_ops vpe_qops = {
+	.queue_setup	 = vpe_queue_setup,
+	.buf_prepare	 = vpe_buf_prepare,
+	.buf_queue	 = vpe_buf_queue,
+	.wait_prepare	 = vpe_wait_prepare,
+	.wait_finish	 = vpe_wait_finish,
+};
+
+static int queue_init(void *priv, struct vb2_queue *src_vq,
+		      struct vb2_queue *dst_vq)
+{
+	struct vpe_ctx *ctx = priv;
+	int ret;
+
+	memset(src_vq, 0, sizeof(*src_vq));
+	src_vq->type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+	src_vq->io_modes = VB2_MMAP;
+	src_vq->drv_priv = ctx;
+	src_vq->buf_struct_size = sizeof(struct v4l2_m2m_buffer);
+	src_vq->ops = &vpe_qops;
+	src_vq->mem_ops = &vb2_dma_contig_memops;
+	src_vq->timestamp_type = V4L2_BUF_FLAG_TIMESTAMP_COPY;
+
+	ret = vb2_queue_init(src_vq);
+	if (ret)
+		return ret;
+
+	memset(dst_vq, 0, sizeof(*dst_vq));
+	dst_vq->type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+	dst_vq->io_modes = VB2_MMAP;
+	dst_vq->drv_priv = ctx;
+	dst_vq->buf_struct_size = sizeof(struct v4l2_m2m_buffer);
+	dst_vq->ops = &vpe_qops;
+	dst_vq->mem_ops = &vb2_dma_contig_memops;
+	dst_vq->timestamp_type = V4L2_BUF_FLAG_TIMESTAMP_COPY;
+
+	return vb2_queue_init(dst_vq);
+}
+
+static const struct v4l2_ctrl_config vpe_bufs_per_job = {
+	.ops = &vpe_ctrl_ops,
+	.id = V4L2_CID_VPE_BUFS_PER_JOB,
+	.name = "Buffers Per Transaction",
+	.type = V4L2_CTRL_TYPE_INTEGER,
+	.def = VPE_DEF_BUFS_PER_JOB,
+	.min = 1,
+	.max = VIDEO_MAX_FRAME,
+	.step = 1,
+};
+
+/*
+ * File operations
+ */
+static int vpe_open(struct file *file)
+{
+	struct vpe_dev *dev = video_drvdata(file);
+	struct vpe_ctx *ctx = NULL;
+	struct vpe_q_data *s_q_data;
+	struct v4l2_ctrl_handler *hdl;
+	int ret;
+
+	vpe_dbg(dev, "vpe_open\n");
+
+	if (!dev->vpdma->ready) {
+		vpe_err(dev, "vpdma firmware not loaded\n");
+		return -ENODEV;
+	}
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->dev = dev;
+
+	if (mutex_lock_interruptible(&dev->dev_mutex)) {
+		ret = -ERESTARTSYS;
+		goto free_ctx;
+	}
+
+	ret = vpdma_create_desc_list(&ctx->desc_list, VPE_DESC_LIST_SIZE,
+			VPDMA_LIST_TYPE_NORMAL);
+	if (ret != 0)
+		goto unlock;
+
+	ret = vpdma_alloc_desc_buf(&ctx->mmr_adb, sizeof(struct vpe_mmr_adb));
+	if (ret != 0)
+		goto free_desc_list;
+
+	init_adb_hdrs(ctx);
+
+	v4l2_fh_init(&ctx->fh, video_devdata(file));
+	file->private_data = &ctx->fh;
+
+	hdl = &ctx->hdl;
+	v4l2_ctrl_handler_init(hdl, 1);
+	v4l2_ctrl_new_custom(hdl, &vpe_bufs_per_job, NULL);
+	if (hdl->error) {
+		ret = hdl->error;
+		goto exit_fh;
+	}
+	ctx->fh.ctrl_handler = hdl;
+	v4l2_ctrl_handler_setup(hdl);
+
+	s_q_data = &ctx->q_data[Q_DATA_SRC];
+	s_q_data->fmt = &vpe_formats[2];
+	s_q_data->width = 1920;
+	s_q_data->height = 1080;
+	s_q_data->sizeimage[VPE_LUMA] = (s_q_data->width * s_q_data->height *
+			s_q_data->fmt->vpdma_fmt[VPE_LUMA]->depth) >> 3;
+	s_q_data->colorspace = V4L2_COLORSPACE_SMPTE240M;
+	s_q_data->c_rect.left = 0;
+	s_q_data->c_rect.top = 0;
+	s_q_data->c_rect.width = s_q_data->width;
+	s_q_data->c_rect.height = s_q_data->height;
+	s_q_data->flags = 0;
+
+	ctx->q_data[Q_DATA_DST] = *s_q_data;
+
+	set_src_registers(ctx);
+	set_dst_registers(ctx);
+	ret = set_srcdst_params(ctx);
+	if (ret)
+		goto exit_fh;
+
+	ctx->m2m_ctx = v4l2_m2m_ctx_init(dev->m2m_dev, ctx, &queue_init);
+
+	if (IS_ERR(ctx->m2m_ctx)) {
+		ret = PTR_ERR(ctx->m2m_ctx);
+		goto exit_fh;
+	}
+
+	v4l2_fh_add(&ctx->fh);
+
+	/*
+	 * for now, just report the creation of the first instance, we can later
+	 * optimize the driver to enable or disable clocks when the first
+	 * instance is created or the last instance released
+	 */
+	if (atomic_inc_return(&dev->num_instances) == 1)
+		vpe_dbg(dev, "first instance created\n");
+
+	ctx->bufs_per_job = VPE_DEF_BUFS_PER_JOB;
+
+	ctx->load_mmrs = true;
+
+	vpe_dbg(dev, "created instance %p, m2m_ctx: %p\n",
+		ctx, ctx->m2m_ctx);
+
+	mutex_unlock(&dev->dev_mutex);
+
+	return 0;
+exit_fh:
+	v4l2_ctrl_handler_free(hdl);
+	v4l2_fh_exit(&ctx->fh);
+	vpdma_free_desc_buf(&ctx->mmr_adb);
+free_desc_list:
+	vpdma_free_desc_list(&ctx->desc_list);
+unlock:
+	mutex_unlock(&dev->dev_mutex);
+free_ctx:
+	kfree(ctx);
+	return ret;
+}
+
+static int vpe_release(struct file *file)
+{
+	struct vpe_dev *dev = video_drvdata(file);
+	struct vpe_ctx *ctx = file2ctx(file);
+
+	vpe_dbg(dev, "releasing instance %p\n", ctx);
+
+	mutex_lock(&dev->dev_mutex);
+	vpdma_free_desc_list(&ctx->desc_list);
+	vpdma_free_desc_buf(&ctx->mmr_adb);
+
+	v4l2_fh_del(&ctx->fh);
+	v4l2_fh_exit(&ctx->fh);
+	v4l2_ctrl_handler_free(&ctx->hdl);
+	v4l2_m2m_ctx_release(ctx->m2m_ctx);
+
+	kfree(ctx);
+
+	/*
+	 * for now, just report the release of the last instance, we can later
+	 * optimize the driver to enable or disable clocks when the first
+	 * instance is created or the last instance released
+	 */
+	if (atomic_dec_return(&dev->num_instances) == 0)
+		vpe_dbg(dev, "last instance released\n");
+
+	mutex_unlock(&dev->dev_mutex);
+
+	return 0;
+}
+
+static unsigned int vpe_poll(struct file *file,
+			     struct poll_table_struct *wait)
+{
+	struct vpe_ctx *ctx = file2ctx(file);
+	struct vpe_dev *dev = ctx->dev;
+	int ret;
+
+	mutex_lock(&dev->dev_mutex);
+	ret = v4l2_m2m_poll(file, ctx->m2m_ctx, wait);
+	mutex_unlock(&dev->dev_mutex);
+	return ret;
+}
+
+static int vpe_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct vpe_ctx *ctx = file2ctx(file);
+	struct vpe_dev *dev = ctx->dev;
+	int ret;
+
+	if (mutex_lock_interruptible(&dev->dev_mutex))
+		return -ERESTARTSYS;
+	ret = v4l2_m2m_mmap(file, ctx->m2m_ctx, vma);
+	mutex_unlock(&dev->dev_mutex);
+	return ret;
+}
+
+static const struct v4l2_file_operations vpe_fops = {
+	.owner		= THIS_MODULE,
+	.open		= vpe_open,
+	.release	= vpe_release,
+	.poll		= vpe_poll,
+	.unlocked_ioctl	= video_ioctl2,
+	.mmap		= vpe_mmap,
+};
+
+static struct video_device vpe_videodev = {
+	.name		= VPE_MODULE_NAME,
+	.fops		= &vpe_fops,
+	.ioctl_ops	= &vpe_ioctl_ops,
+	.minor		= -1,
+	.release	= video_device_release,
+	.vfl_dir	= VFL_DIR_M2M,
+};
+
+static struct v4l2_m2m_ops m2m_ops = {
+	.device_run	= device_run,
+	.job_ready	= job_ready,
+	.job_abort	= job_abort,
+	.lock		= vpe_lock,
+	.unlock		= vpe_unlock,
+};
+
+static int vpe_runtime_get(struct platform_device *pdev)
+{
+	int r;
+
+	dev_dbg(&pdev->dev, "vpe_runtime_get\n");
+
+	r = pm_runtime_get_sync(&pdev->dev);
+	WARN_ON(r < 0);
+	return r < 0 ? r : 0;
+}
+
+static void vpe_runtime_put(struct platform_device *pdev)
+{
+
+	int r;
+
+	dev_dbg(&pdev->dev, "vpe_runtime_put\n");
+
+	r = pm_runtime_put_sync(&pdev->dev);
+	WARN_ON(r < 0 && r != -ENOSYS);
+}
+
+static int vpe_probe(struct platform_device *pdev)
+{
+	struct vpe_dev *dev;
+	struct video_device *vfd;
+	struct resource *res;
+	int ret, irq, func;
+
+	dev = devm_kzalloc(&pdev->dev, sizeof(*dev), GFP_KERNEL);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	spin_lock_init(&dev->lock);
+
+	ret = v4l2_device_register(&pdev->dev, &dev->v4l2_dev);
+	if (ret)
+		return ret;
+
+	atomic_set(&dev->num_instances, 0);
+	mutex_init(&dev->dev_mutex);
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "vpe_top");
+	/*
+	 * HACK: we get resource info from device tree in the form of a list of
+	 * VPE sub blocks, the driver currently uses only the base of vpe_top
+	 * for register access, the driver should be changed later to access
+	 * registers based on the sub block base addresses
+	 */
+	dev->base = devm_ioremap(&pdev->dev, res->start, SZ_32K);
+	if (IS_ERR(dev->base)) {
+		ret = PTR_ERR(dev->base);
+		goto v4l2_dev_unreg;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	ret = devm_request_irq(&pdev->dev, irq, vpe_irq, 0, VPE_MODULE_NAME,
+			dev);
+	if (ret)
+		goto v4l2_dev_unreg;
+
+	platform_set_drvdata(pdev, dev);
+
+	dev->alloc_ctx = vb2_dma_contig_init_ctx(&pdev->dev);
+	if (IS_ERR(dev->alloc_ctx)) {
+		vpe_err(dev, "Failed to alloc vb2 context\n");
+		ret = PTR_ERR(dev->alloc_ctx);
+		goto v4l2_dev_unreg;
+	}
+
+	dev->m2m_dev = v4l2_m2m_init(&m2m_ops);
+	if (IS_ERR(dev->m2m_dev)) {
+		vpe_err(dev, "Failed to init mem2mem device\n");
+		ret = PTR_ERR(dev->m2m_dev);
+		goto rel_ctx;
+	}
+
+	pm_runtime_enable(&pdev->dev);
+
+	ret = vpe_runtime_get(pdev);
+	if (ret)
+		goto rel_m2m;
+
+	/* Perform clk enable followed by reset */
+	vpe_set_clock_enable(dev, 1);
+
+	vpe_top_reset(dev);
+
+	func = read_field_reg(dev, VPE_PID, VPE_PID_FUNC_MASK,
+		VPE_PID_FUNC_SHIFT);
+	vpe_dbg(dev, "VPE PID function %x\n", func);
+
+	vpe_top_vpdma_reset(dev);
+
+	dev->vpdma = vpdma_create(pdev);
+	if (IS_ERR(dev->vpdma))
+		goto runtime_put;
+
+	vfd = &dev->vfd;
+	*vfd = vpe_videodev;
+	vfd->lock = &dev->dev_mutex;
+	vfd->v4l2_dev = &dev->v4l2_dev;
+
+	ret = video_register_device(vfd, VFL_TYPE_GRABBER, 0);
+	if (ret) {
+		vpe_err(dev, "Failed to register video device\n");
+		goto runtime_put;
+	}
+
+	video_set_drvdata(vfd, dev);
+	snprintf(vfd->name, sizeof(vfd->name), "%s", vpe_videodev.name);
+	dev_info(dev->v4l2_dev.dev, "Device registered as /dev/video%d\n",
+		vfd->num);
+
+	return 0;
+
+runtime_put:
+	vpe_runtime_put(pdev);
+rel_m2m:
+	pm_runtime_disable(&pdev->dev);
+	v4l2_m2m_release(dev->m2m_dev);
+rel_ctx:
+	vb2_dma_contig_cleanup_ctx(dev->alloc_ctx);
+v4l2_dev_unreg:
+	v4l2_device_unregister(&dev->v4l2_dev);
+
+	return ret;
+}
+
+static int vpe_remove(struct platform_device *pdev)
+{
+	struct vpe_dev *dev =
+		(struct vpe_dev *) platform_get_drvdata(pdev);
+
+	v4l2_info(&dev->v4l2_dev, "Removing " VPE_MODULE_NAME);
+
+	v4l2_m2m_release(dev->m2m_dev);
+	video_unregister_device(&dev->vfd);
+	v4l2_device_unregister(&dev->v4l2_dev);
+	vb2_dma_contig_cleanup_ctx(dev->alloc_ctx);
+
+	vpe_set_clock_enable(dev, 0);
+	vpe_runtime_put(pdev);
+	pm_runtime_disable(&pdev->dev);
+
+	return 0;
+}
+
+#if defined(CONFIG_OF)
+static const struct of_device_id vpe_of_match[] = {
+	{
+		.compatible = "ti,vpe",
+	},
+	{},
+};
+#else
+#define vpe_of_match NULL
+#endif
+
+static struct platform_driver vpe_pdrv = {
+	.probe		= vpe_probe,
+	.remove		= vpe_remove,
+	.driver		= {
+		.name	= VPE_MODULE_NAME,
+		.owner	= THIS_MODULE,
+		.of_match_table = vpe_of_match,
+	},
+};
+
+static void __exit vpe_exit(void)
+{
+	platform_driver_unregister(&vpe_pdrv);
+}
+
+static int __init vpe_init(void)
+{
+	return platform_driver_register(&vpe_pdrv);
+}
+
+module_init(vpe_init);
+module_exit(vpe_exit);
+
+MODULE_DESCRIPTION("TI VPE driver");
+MODULE_AUTHOR("Dale Farnsworth, <dale@farnsworth.org>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/media/platform/ti-vpe/vpe_regs.h b/drivers/media/platform/ti-vpe/vpe_regs.h
new file mode 100644
index 000000000000..ed214e828398
--- /dev/null
+++ b/drivers/media/platform/ti-vpe/vpe_regs.h
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2013 Texas Instruments Inc.
+ *
+ * David Griego, <dagriego@biglakesoftware.com>
+ * Dale Farnsworth, <dale@farnsworth.org>
+ * Archit Taneja, <archit@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef __TI_VPE_REGS_H
+#define __TI_VPE_REGS_H
+
+/* VPE register offsets and field selectors */
+
+/* VPE top level regs */
+#define VPE_PID				0x0000
+#define VPE_PID_MINOR_MASK		0x3f
+#define VPE_PID_MINOR_SHIFT		0
+#define VPE_PID_CUSTOM_MASK		0x03
+#define VPE_PID_CUSTOM_SHIFT		6
+#define VPE_PID_MAJOR_MASK		0x07
+#define VPE_PID_MAJOR_SHIFT		8
+#define VPE_PID_RTL_MASK		0x1f
+#define VPE_PID_RTL_SHIFT		11
+#define VPE_PID_FUNC_MASK		0xfff
+#define VPE_PID_FUNC_SHIFT		16
+#define VPE_PID_SCHEME_MASK		0x03
+#define VPE_PID_SCHEME_SHIFT		30
+
+#define VPE_SYSCONFIG			0x0010
+#define VPE_SYSCONFIG_IDLE_MASK		0x03
+#define VPE_SYSCONFIG_IDLE_SHIFT	2
+#define VPE_SYSCONFIG_STANDBY_MASK	0x03
+#define VPE_SYSCONFIG_STANDBY_SHIFT	4
+#define VPE_FORCE_IDLE_MODE		0
+#define VPE_NO_IDLE_MODE		1
+#define VPE_SMART_IDLE_MODE		2
+#define VPE_SMART_IDLE_WAKEUP_MODE	3
+#define VPE_FORCE_STANDBY_MODE		0
+#define VPE_NO_STANDBY_MODE		1
+#define VPE_SMART_STANDBY_MODE		2
+#define VPE_SMART_STANDBY_WAKEUP_MODE	3
+
+#define VPE_INT0_STATUS0_RAW_SET	0x0020
+#define VPE_INT0_STATUS0_RAW		VPE_INT0_STATUS0_RAW_SET
+#define VPE_INT0_STATUS0_CLR		0x0028
+#define VPE_INT0_STATUS0		VPE_INT0_STATUS0_CLR
+#define VPE_INT0_ENABLE0_SET		0x0030
+#define VPE_INT0_ENABLE0		VPE_INT0_ENABLE0_SET
+#define VPE_INT0_ENABLE0_CLR		0x0038
+#define VPE_INT0_LIST0_COMPLETE		(1 << 0)
+#define VPE_INT0_LIST0_NOTIFY		(1 << 1)
+#define VPE_INT0_LIST1_COMPLETE		(1 << 2)
+#define VPE_INT0_LIST1_NOTIFY		(1 << 3)
+#define VPE_INT0_LIST2_COMPLETE		(1 << 4)
+#define VPE_INT0_LIST2_NOTIFY		(1 << 5)
+#define VPE_INT0_LIST3_COMPLETE		(1 << 6)
+#define VPE_INT0_LIST3_NOTIFY		(1 << 7)
+#define VPE_INT0_LIST4_COMPLETE		(1 << 8)
+#define VPE_INT0_LIST4_NOTIFY		(1 << 9)
+#define VPE_INT0_LIST5_COMPLETE		(1 << 10)
+#define VPE_INT0_LIST5_NOTIFY		(1 << 11)
+#define VPE_INT0_LIST6_COMPLETE		(1 << 12)
+#define VPE_INT0_LIST6_NOTIFY		(1 << 13)
+#define VPE_INT0_LIST7_COMPLETE		(1 << 14)
+#define VPE_INT0_LIST7_NOTIFY		(1 << 15)
+#define VPE_INT0_DESCRIPTOR		(1 << 16)
+#define VPE_DEI_FMD_INT			(1 << 18)
+
+#define VPE_INT0_STATUS1_RAW_SET	0x0024
+#define VPE_INT0_STATUS1_RAW		VPE_INT0_STATUS1_RAW_SET
+#define VPE_INT0_STATUS1_CLR		0x002c
+#define VPE_INT0_STATUS1		VPE_INT0_STATUS1_CLR
+#define VPE_INT0_ENABLE1_SET		0x0034
+#define VPE_INT0_ENABLE1		VPE_INT0_ENABLE1_SET
+#define VPE_INT0_ENABLE1_CLR		0x003c
+#define VPE_INT0_CHANNEL_GROUP0		(1 << 0)
+#define VPE_INT0_CHANNEL_GROUP1		(1 << 1)
+#define VPE_INT0_CHANNEL_GROUP2		(1 << 2)
+#define VPE_INT0_CHANNEL_GROUP3		(1 << 3)
+#define VPE_INT0_CHANNEL_GROUP4		(1 << 4)
+#define VPE_INT0_CHANNEL_GROUP5		(1 << 5)
+#define VPE_INT0_CLIENT			(1 << 7)
+#define VPE_DEI_ERROR_INT		(1 << 16)
+#define VPE_DS1_UV_ERROR_INT		(1 << 22)
+
+#define VPE_INTC_EOI			0x00a0
+
+#define VPE_CLK_ENABLE			0x0100
+#define VPE_VPEDMA_CLK_ENABLE		(1 << 0)
+#define VPE_DATA_PATH_CLK_ENABLE	(1 << 1)
+
+#define VPE_CLK_RESET			0x0104
+#define VPE_VPDMA_CLK_RESET_MASK	0x1
+#define VPE_VPDMA_CLK_RESET_SHIFT	0
+#define VPE_DATA_PATH_CLK_RESET_MASK	0x1
+#define VPE_DATA_PATH_CLK_RESET_SHIFT	1
+#define VPE_MAIN_RESET_MASK		0x1
+#define VPE_MAIN_RESET_SHIFT		31
+
+#define VPE_CLK_FORMAT_SELECT		0x010c
+#define VPE_CSC_SRC_SELECT_MASK		0x03
+#define VPE_CSC_SRC_SELECT_SHIFT	0
+#define VPE_RGB_OUT_SELECT		(1 << 8)
+#define VPE_DS_SRC_SELECT_MASK		0x07
+#define VPE_DS_SRC_SELECT_SHIFT		9
+#define VPE_DS_BYPASS			(1 << 16)
+#define VPE_COLOR_SEPARATE_422		(1 << 18)
+
+#define VPE_DS_SRC_DEI_SCALER		(5 << VPE_DS_SRC_SELECT_SHIFT)
+#define VPE_CSC_SRC_DEI_SCALER		(3 << VPE_CSC_SRC_SELECT_SHIFT)
+
+#define VPE_CLK_RANGE_MAP		0x011c
+#define VPE_RANGE_RANGE_MAP_Y_MASK	0x07
+#define VPE_RANGE_RANGE_MAP_Y_SHIFT	0
+#define VPE_RANGE_RANGE_MAP_UV_MASK	0x07
+#define VPE_RANGE_RANGE_MAP_UV_SHIFT	3
+#define VPE_RANGE_MAP_ON		(1 << 6)
+#define VPE_RANGE_REDUCTION_ON		(1 << 28)
+
+/* VPE chrominance upsampler regs */
+#define VPE_US1_R0			0x0304
+#define VPE_US2_R0			0x0404
+#define VPE_US3_R0			0x0504
+#define VPE_US_C1_MASK			0x3fff
+#define VPE_US_C1_SHIFT			2
+#define VPE_US_C0_MASK			0x3fff
+#define VPE_US_C0_SHIFT			18
+#define VPE_US_MODE_MASK		0x03
+#define VPE_US_MODE_SHIFT		16
+#define VPE_ANCHOR_FID0_C1_MASK		0x3fff
+#define VPE_ANCHOR_FID0_C1_SHIFT	2
+#define VPE_ANCHOR_FID0_C0_MASK		0x3fff
+#define VPE_ANCHOR_FID0_C0_SHIFT	18
+
+#define VPE_US1_R1			0x0308
+#define VPE_US2_R1			0x0408
+#define VPE_US3_R1			0x0508
+#define VPE_ANCHOR_FID0_C3_MASK		0x3fff
+#define VPE_ANCHOR_FID0_C3_SHIFT	2
+#define VPE_ANCHOR_FID0_C2_MASK		0x3fff
+#define VPE_ANCHOR_FID0_C2_SHIFT	18
+
+#define VPE_US1_R2			0x030c
+#define VPE_US2_R2			0x040c
+#define VPE_US3_R2			0x050c
+#define VPE_INTERP_FID0_C1_MASK		0x3fff
+#define VPE_INTERP_FID0_C1_SHIFT	2
+#define VPE_INTERP_FID0_C0_MASK		0x3fff
+#define VPE_INTERP_FID0_C0_SHIFT	18
+
+#define VPE_US1_R3			0x0310
+#define VPE_US2_R3			0x0410
+#define VPE_US3_R3			0x0510
+#define VPE_INTERP_FID0_C3_MASK		0x3fff
+#define VPE_INTERP_FID0_C3_SHIFT	2
+#define VPE_INTERP_FID0_C2_MASK		0x3fff
+#define VPE_INTERP_FID0_C2_SHIFT	18
+
+#define VPE_US1_R4			0x0314
+#define VPE_US2_R4			0x0414
+#define VPE_US3_R4			0x0514
+#define VPE_ANCHOR_FID1_C1_MASK		0x3fff
+#define VPE_ANCHOR_FID1_C1_SHIFT	2
+#define VPE_ANCHOR_FID1_C0_MASK		0x3fff
+#define VPE_ANCHOR_FID1_C0_SHIFT	18
+
+#define VPE_US1_R5			0x0318
+#define VPE_US2_R5			0x0418
+#define VPE_US3_R5			0x0518
+#define VPE_ANCHOR_FID1_C3_MASK		0x3fff
+#define VPE_ANCHOR_FID1_C3_SHIFT	2
+#define VPE_ANCHOR_FID1_C2_MASK		0x3fff
+#define VPE_ANCHOR_FID1_C2_SHIFT	18
+
+#define VPE_US1_R6			0x031c
+#define VPE_US2_R6			0x041c
+#define VPE_US3_R6			0x051c
+#define VPE_INTERP_FID1_C1_MASK		0x3fff
+#define VPE_INTERP_FID1_C1_SHIFT	2
+#define VPE_INTERP_FID1_C0_MASK		0x3fff
+#define VPE_INTERP_FID1_C0_SHIFT	18
+
+#define VPE_US1_R7			0x0320
+#define VPE_US2_R7			0x0420
+#define VPE_US3_R7			0x0520
+#define VPE_INTERP_FID0_C3_MASK		0x3fff
+#define VPE_INTERP_FID0_C3_SHIFT	2
+#define VPE_INTERP_FID0_C2_MASK		0x3fff
+#define VPE_INTERP_FID0_C2_SHIFT	18
+
+/* VPE de-interlacer regs */
+#define VPE_DEI_FRAME_SIZE		0x0600
+#define VPE_DEI_WIDTH_MASK		0x07ff
+#define VPE_DEI_WIDTH_SHIFT		0
+#define VPE_DEI_HEIGHT_MASK		0x07ff
+#define VPE_DEI_HEIGHT_SHIFT		16
+#define VPE_DEI_INTERLACE_BYPASS	(1 << 29)
+#define VPE_DEI_FIELD_FLUSH		(1 << 30)
+#define VPE_DEI_PROGRESSIVE		(1 << 31)
+
+#define VPE_MDT_BYPASS			0x0604
+#define VPE_MDT_TEMPMAX_BYPASS		(1 << 0)
+#define VPE_MDT_SPATMAX_BYPASS		(1 << 1)
+
+#define VPE_MDT_SF_THRESHOLD		0x0608
+#define VPE_MDT_SF_SC_THR1_MASK		0xff
+#define VPE_MDT_SF_SC_THR1_SHIFT	0
+#define VPE_MDT_SF_SC_THR2_MASK		0xff
+#define VPE_MDT_SF_SC_THR2_SHIFT	0
+#define VPE_MDT_SF_SC_THR3_MASK		0xff
+#define VPE_MDT_SF_SC_THR3_SHIFT	0
+
+#define VPE_EDI_CONFIG			0x060c
+#define VPE_EDI_INP_MODE_MASK		0x03
+#define VPE_EDI_INP_MODE_SHIFT		0
+#define VPE_EDI_ENABLE_3D		(1 << 2)
+#define VPE_EDI_ENABLE_CHROMA_3D	(1 << 3)
+#define VPE_EDI_CHROMA3D_COR_THR_MASK	0xff
+#define VPE_EDI_CHROMA3D_COR_THR_SHIFT	8
+#define VPE_EDI_DIR_COR_LOWER_THR_MASK	0xff
+#define VPE_EDI_DIR_COR_LOWER_THR_SHIFT	16
+#define VPE_EDI_COR_SCALE_FACTOR_MASK	0xff
+#define VPE_EDI_COR_SCALE_FACTOR_SHIFT	23
+
+#define VPE_DEI_EDI_LUT_R0		0x0610
+#define VPE_EDI_LUT0_MASK		0x1f
+#define VPE_EDI_LUT0_SHIFT		0
+#define VPE_EDI_LUT1_MASK		0x1f
+#define VPE_EDI_LUT1_SHIFT		8
+#define VPE_EDI_LUT2_MASK		0x1f
+#define VPE_EDI_LUT2_SHIFT		16
+#define VPE_EDI_LUT3_MASK		0x1f
+#define VPE_EDI_LUT3_SHIFT		24
+
+#define VPE_DEI_EDI_LUT_R1		0x0614
+#define VPE_EDI_LUT0_MASK		0x1f
+#define VPE_EDI_LUT0_SHIFT		0
+#define VPE_EDI_LUT1_MASK		0x1f
+#define VPE_EDI_LUT1_SHIFT		8
+#define VPE_EDI_LUT2_MASK		0x1f
+#define VPE_EDI_LUT2_SHIFT		16
+#define VPE_EDI_LUT3_MASK		0x1f
+#define VPE_EDI_LUT3_SHIFT		24
+
+#define VPE_DEI_EDI_LUT_R2		0x0618
+#define VPE_EDI_LUT4_MASK		0x1f
+#define VPE_EDI_LUT4_SHIFT		0
+#define VPE_EDI_LUT5_MASK		0x1f
+#define VPE_EDI_LUT5_SHIFT		8
+#define VPE_EDI_LUT6_MASK		0x1f
+#define VPE_EDI_LUT6_SHIFT		16
+#define VPE_EDI_LUT7_MASK		0x1f
+#define VPE_EDI_LUT7_SHIFT		24
+
+#define VPE_DEI_EDI_LUT_R3		0x061c
+#define VPE_EDI_LUT8_MASK		0x1f
+#define VPE_EDI_LUT8_SHIFT		0
+#define VPE_EDI_LUT9_MASK		0x1f
+#define VPE_EDI_LUT9_SHIFT		8
+#define VPE_EDI_LUT10_MASK		0x1f
+#define VPE_EDI_LUT10_SHIFT		16
+#define VPE_EDI_LUT11_MASK		0x1f
+#define VPE_EDI_LUT11_SHIFT		24
+
+#define VPE_DEI_FMD_WINDOW_R0		0x0620
+#define VPE_FMD_WINDOW_MINX_MASK	0x07ff
+#define VPE_FMD_WINDOW_MINX_SHIFT	0
+#define VPE_FMD_WINDOW_MAXX_MASK	0x07ff
+#define VPE_FMD_WINDOW_MAXX_SHIFT	16
+#define VPE_FMD_WINDOW_ENABLE		(1 << 31)
+
+#define VPE_DEI_FMD_WINDOW_R1		0x0624
+#define VPE_FMD_WINDOW_MINY_MASK	0x07ff
+#define VPE_FMD_WINDOW_MINY_SHIFT	0
+#define VPE_FMD_WINDOW_MAXY_MASK	0x07ff
+#define VPE_FMD_WINDOW_MAXY_SHIFT	16
+
+#define VPE_DEI_FMD_CONTROL_R0		0x0628
+#define VPE_FMD_ENABLE			(1 << 0)
+#define VPE_FMD_LOCK			(1 << 1)
+#define VPE_FMD_JAM_DIR			(1 << 2)
+#define VPE_FMD_BED_ENABLE		(1 << 3)
+#define VPE_FMD_CAF_FIELD_THR_MASK	0xff
+#define VPE_FMD_CAF_FIELD_THR_SHIFT	16
+#define VPE_FMD_CAF_LINE_THR_MASK	0xff
+#define VPE_FMD_CAF_LINE_THR_SHIFT	24
+
+#define VPE_DEI_FMD_CONTROL_R1		0x062c
+#define VPE_FMD_CAF_THR_MASK		0x000fffff
+#define VPE_FMD_CAF_THR_SHIFT		0
+
+#define VPE_DEI_FMD_STATUS_R0		0x0630
+#define VPE_FMD_CAF_MASK		0x000fffff
+#define VPE_FMD_CAF_SHIFT		0
+#define VPE_FMD_RESET			(1 << 24)
+
+#define VPE_DEI_FMD_STATUS_R1		0x0634
+#define VPE_FMD_FIELD_DIFF_MASK		0x0fffffff
+#define VPE_FMD_FIELD_DIFF_SHIFT	0
+
+#define VPE_DEI_FMD_STATUS_R2		0x0638
+#define VPE_FMD_FRAME_DIFF_MASK		0x000fffff
+#define VPE_FMD_FRAME_DIFF_SHIFT	0
+
+/* VPE scaler regs */
+#define VPE_SC_MP_SC0			0x0700
+#define VPE_INTERLACE_O			(1 << 0)
+#define VPE_LINEAR			(1 << 1)
+#define VPE_SC_BYPASS			(1 << 2)
+#define VPE_INVT_FID			(1 << 3)
+#define VPE_USE_RAV			(1 << 4)
+#define VPE_ENABLE_EV			(1 << 5)
+#define VPE_AUTO_HS			(1 << 6)
+#define VPE_DCM_2X			(1 << 7)
+#define VPE_DCM_4X			(1 << 8)
+#define VPE_HP_BYPASS			(1 << 9)
+#define VPE_INTERLACE_I			(1 << 10)
+#define VPE_ENABLE_SIN2_VER_INTP	(1 << 11)
+#define VPE_Y_PK_EN			(1 << 14)
+#define VPE_TRIM			(1 << 15)
+#define VPE_SELFGEN_FID			(1 << 16)
+
+#define VPE_SC_MP_SC1			0x0704
+#define VPE_ROW_ACC_INC_MASK		0x07ffffff
+#define VPE_ROW_ACC_INC_SHIFT		0
+
+#define VPE_SC_MP_SC2			0x0708
+#define VPE_ROW_ACC_OFFSET_MASK		0x0fffffff
+#define VPE_ROW_ACC_OFFSET_SHIFT	0
+
+#define VPE_SC_MP_SC3			0x070c
+#define VPE_ROW_ACC_OFFSET_B_MASK	0x0fffffff
+#define VPE_ROW_ACC_OFFSET_B_SHIFT	0
+
+#define VPE_SC_MP_SC4			0x0710
+#define VPE_TAR_H_MASK			0x07ff
+#define VPE_TAR_H_SHIFT			0
+#define VPE_TAR_W_MASK			0x07ff
+#define VPE_TAR_W_SHIFT			12
+#define VPE_LIN_ACC_INC_U_MASK		0x07
+#define VPE_LIN_ACC_INC_U_SHIFT		24
+#define VPE_NLIN_ACC_INIT_U_MASK	0x07
+#define VPE_NLIN_ACC_INIT_U_SHIFT	28
+
+#define VPE_SC_MP_SC5			0x0714
+#define VPE_SRC_H_MASK			0x07ff
+#define VPE_SRC_H_SHIFT			0
+#define VPE_SRC_W_MASK			0x07ff
+#define VPE_SRC_W_SHIFT			12
+#define VPE_NLIN_ACC_INC_U_MASK		0x07
+#define VPE_NLIN_ACC_INC_U_SHIFT	24
+
+#define VPE_SC_MP_SC6			0x0718
+#define VPE_ROW_ACC_INIT_RAV_MASK	0x03ff
+#define VPE_ROW_ACC_INIT_RAV_SHIFT	0
+#define VPE_ROW_ACC_INIT_RAV_B_MASK	0x03ff
+#define VPE_ROW_ACC_INIT_RAV_B_SHIFT	10
+
+#define VPE_SC_MP_SC8			0x0720
+#define VPE_NLIN_LEFT_MASK		0x07ff
+#define VPE_NLIN_LEFT_SHIFT		0
+#define VPE_NLIN_RIGHT_MASK		0x07ff
+#define VPE_NLIN_RIGHT_SHIFT		12
+
+#define VPE_SC_MP_SC9			0x0724
+#define VPE_LIN_ACC_INC			VPE_SC_MP_SC9
+
+#define VPE_SC_MP_SC10			0x0728
+#define VPE_NLIN_ACC_INIT		VPE_SC_MP_SC10
+
+#define VPE_SC_MP_SC11			0x072c
+#define VPE_NLIN_ACC_INC		VPE_SC_MP_SC11
+
+#define VPE_SC_MP_SC12			0x0730
+#define VPE_COL_ACC_OFFSET_MASK		0x01ffffff
+#define VPE_COL_ACC_OFFSET_SHIFT	0
+
+#define VPE_SC_MP_SC13			0x0734
+#define VPE_SC_FACTOR_RAV_MASK		0x03ff
+#define VPE_SC_FACTOR_RAV_SHIFT		0
+#define VPE_CHROMA_INTP_THR_MASK	0x03ff
+#define VPE_CHROMA_INTP_THR_SHIFT	12
+#define VPE_DELTA_CHROMA_THR_MASK	0x0f
+#define VPE_DELTA_CHROMA_THR_SHIFT	24
+
+#define VPE_SC_MP_SC17			0x0744
+#define VPE_EV_THR_MASK			0x03ff
+#define VPE_EV_THR_SHIFT		12
+#define VPE_DELTA_LUMA_THR_MASK		0x0f
+#define VPE_DELTA_LUMA_THR_SHIFT	24
+#define VPE_DELTA_EV_THR_MASK		0x0f
+#define VPE_DELTA_EV_THR_SHIFT		28
+
+#define VPE_SC_MP_SC18			0x0748
+#define VPE_HS_FACTOR_MASK		0x03ff
+#define VPE_HS_FACTOR_SHIFT		0
+#define VPE_CONF_DEFAULT_MASK		0x01ff
+#define VPE_CONF_DEFAULT_SHIFT		16
+
+#define VPE_SC_MP_SC19			0x074c
+#define VPE_HPF_COEFF0_MASK		0xff
+#define VPE_HPF_COEFF0_SHIFT		0
+#define VPE_HPF_COEFF1_MASK		0xff
+#define VPE_HPF_COEFF1_SHIFT		8
+#define VPE_HPF_COEFF2_MASK		0xff
+#define VPE_HPF_COEFF2_SHIFT		16
+#define VPE_HPF_COEFF3_MASK		0xff
+#define VPE_HPF_COEFF3_SHIFT		23
+
+#define VPE_SC_MP_SC20			0x0750
+#define VPE_HPF_COEFF4_MASK		0xff
+#define VPE_HPF_COEFF4_SHIFT		0
+#define VPE_HPF_COEFF5_MASK		0xff
+#define VPE_HPF_COEFF5_SHIFT		8
+#define VPE_HPF_NORM_SHIFT_MASK		0x07
+#define VPE_HPF_NORM_SHIFT_SHIFT	16
+#define VPE_NL_LIMIT_MASK		0x1ff
+#define VPE_NL_LIMIT_SHIFT		20
+
+#define VPE_SC_MP_SC21			0x0754
+#define VPE_NL_LO_THR_MASK		0x01ff
+#define VPE_NL_LO_THR_SHIFT		0
+#define VPE_NL_LO_SLOPE_MASK		0xff
+#define VPE_NL_LO_SLOPE_SHIFT		16
+
+#define VPE_SC_MP_SC22			0x0758
+#define VPE_NL_HI_THR_MASK		0x01ff
+#define VPE_NL_HI_THR_SHIFT		0
+#define VPE_NL_HI_SLOPE_SH_MASK		0x07
+#define VPE_NL_HI_SLOPE_SH_SHIFT	16
+
+#define VPE_SC_MP_SC23			0x075c
+#define VPE_GRADIENT_THR_MASK		0x07ff
+#define VPE_GRADIENT_THR_SHIFT		0
+#define VPE_GRADIENT_THR_RANGE_MASK	0x0f
+#define VPE_GRADIENT_THR_RANGE_SHIFT	12
+#define VPE_MIN_GY_THR_MASK		0xff
+#define VPE_MIN_GY_THR_SHIFT		16
+#define VPE_MIN_GY_THR_RANGE_MASK	0x0f
+#define VPE_MIN_GY_THR_RANGE_SHIFT	28
+
+#define VPE_SC_MP_SC24			0x0760
+#define VPE_ORG_H_MASK			0x07ff
+#define VPE_ORG_H_SHIFT			0
+#define VPE_ORG_W_MASK			0x07ff
+#define VPE_ORG_W_SHIFT			16
+
+#define VPE_SC_MP_SC25			0x0764
+#define VPE_OFF_H_MASK			0x07ff
+#define VPE_OFF_H_SHIFT			0
+#define VPE_OFF_W_MASK			0x07ff
+#define VPE_OFF_W_SHIFT			16
+
+/* VPE color space converter regs */
+#define VPE_CSC_CSC00			0x5700
+#define VPE_CSC_A0_MASK			0x1fff
+#define VPE_CSC_A0_SHIFT		0
+#define VPE_CSC_B0_MASK			0x1fff
+#define VPE_CSC_B0_SHIFT		16
+
+#define VPE_CSC_CSC01			0x5704
+#define VPE_CSC_C0_MASK			0x1fff
+#define VPE_CSC_C0_SHIFT		0
+#define VPE_CSC_A1_MASK			0x1fff
+#define VPE_CSC_A1_SHIFT		16
+
+#define VPE_CSC_CSC02			0x5708
+#define VPE_CSC_B1_MASK			0x1fff
+#define VPE_CSC_B1_SHIFT		0
+#define VPE_CSC_C1_MASK			0x1fff
+#define VPE_CSC_C1_SHIFT		16
+
+#define VPE_CSC_CSC03			0x570c
+#define VPE_CSC_A2_MASK			0x1fff
+#define VPE_CSC_A2_SHIFT		0
+#define VPE_CSC_B2_MASK			0x1fff
+#define VPE_CSC_B2_SHIFT		16
+
+#define VPE_CSC_CSC04			0x5710
+#define VPE_CSC_C2_MASK			0x1fff
+#define VPE_CSC_C2_SHIFT		0
+#define VPE_CSC_D0_MASK			0x0fff
+#define VPE_CSC_D0_SHIFT		16
+
+#define VPE_CSC_CSC05			0x5714
+#define VPE_CSC_D1_MASK			0x0fff
+#define VPE_CSC_D1_SHIFT		0
+#define VPE_CSC_D2_MASK			0x0fff
+#define VPE_CSC_D2_SHIFT		16
+#define VPE_CSC_BYPASS			(1 << 28)
+
+#endif
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 083bb5a5aae2..1666aabbbb86 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -160,6 +160,10 @@ enum v4l2_colorfx {
  * of controls. Total of 16 controls is reserved for this driver */
 #define V4L2_CID_USER_SI476X_BASE		(V4L2_CID_USER_BASE + 0x1040)
 
+/* The base for the TI VPE driver controls. Total of 16 controls is reserved for
+ * this driver */
+#define V4L2_CID_USER_TI_VPE_BASE		(V4L2_CID_USER_BASE + 0x1050)
+
 /* MPEG-class control IDs */
 /* The MPEG controls are applicable to all codec controls
  * and the 'MPEG' part of the define is historical */
-- 
cgit v1.2.3


From 5837f6dfcb00f764976ddc178933e612702cbf54 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:18 -0400
Subject: NFS: stop using NFS_MOUNT_SECFLAVOUR server flag

Since the parsed sec= flavor is now stored in nfs_server->auth_info,
we no longer need an nfs_server flag to determine if a sec= option was
used.

This flag has not been completely removed because it is still needed for
the (old but still supported) non-text parsed mount options ABI
compatability.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4client.c            | 6 ++++--
 fs/nfs/nfs4namespace.c         | 2 +-
 fs/nfs/nfs4proc.c              | 4 ++--
 fs/nfs/super.c                 | 3 +--
 include/uapi/linux/nfs_mount.h | 2 +-
 5 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 04131c837f27..f6cc77c7d802 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1051,6 +1051,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 {
 	struct nfs_client *parent_client;
 	struct nfs_server *server, *parent_server;
+	bool auth_probe;
 	int error;
 
 	dprintk("--> nfs4_create_referral_server()\n");
@@ -1083,8 +1084,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	if (error < 0)
 		goto error;
 
-	error = nfs4_server_common_setup(server, mntfh,
-			!(parent_server->flags & NFS_MOUNT_SECFLAVOUR));
+	auth_probe = parent_server->auth_info.flavor_len < 1;
+
+	error = nfs4_server_common_setup(server, mntfh, auth_probe);
 	if (error < 0)
 		goto error;
 
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 4ec0eea68fdf..b947054f0ca1 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -390,7 +390,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
 
 	if (client->cl_auth->au_flavor != flavor)
 		flavor = client->cl_auth->au_flavor;
-	else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) {
+	else if (server->auth_info.flavor_len == 0) {
 		rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
 		if ((int)new >= 0)
 			flavor = new;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1463c71b0862..0eb8dc5792da 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2920,7 +2920,7 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 		if (status != -NFS4ERR_WRONGSEC)
 			break;
 		/* Did user force a 'sec=' mount option? */
-		if (server->flags & NFS_MOUNT_SECFLAVOUR)
+		if (server->auth_info.flavor_len > 0)
 			break;
 	default:
 		status = nfs4_do_find_root_sec(server, fhandle, info);
@@ -3180,7 +3180,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 			if (client != *clnt)
 				goto out;
 			/* No security negotiation if the user specified 'sec=' */
-			if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR)
+			if (NFS_SERVER(dir)->auth_info.flavor_len > 0)
 				goto out;
 			client = nfs4_create_sec_client(client, dir, name);
 			if (IS_ERR(client))
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index de1e5e89c93d..3a4f8bf5e5a5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1067,7 +1067,6 @@ static int nfs_parse_security_flavors(char *value,
 		return 0;
 	}
 
-	mnt->flags |= NFS_MOUNT_SECFLAVOUR;
 	mnt->auth_info.flavors[0] = pseudoflavor;
 	mnt->auth_info.flavor_len = 1;
 	return 1;
@@ -2332,7 +2331,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
 		goto Ebusy;
 	if (a->acdirmax != b->acdirmax)
 		goto Ebusy;
-	if (b->flags & NFS_MOUNT_SECFLAVOUR &&
+	if (b->auth_info.flavor_len > 0 &&
 	   clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
 		goto Ebusy;
 	return 1;
diff --git a/include/uapi/linux/nfs_mount.h b/include/uapi/linux/nfs_mount.h
index 576bddd72e04..64b0f22f5c4c 100644
--- a/include/uapi/linux/nfs_mount.h
+++ b/include/uapi/linux/nfs_mount.h
@@ -60,7 +60,7 @@ struct nfs_mount_data {
 #define NFS_MOUNT_BROKEN_SUID	0x0400	/* 4 */
 #define NFS_MOUNT_NOACL		0x0800	/* 4 */
 #define NFS_MOUNT_STRICTLOCK	0x1000	/* reserved for NFSv4 */
-#define NFS_MOUNT_SECFLAVOUR	0x2000	/* 5 */
+#define NFS_MOUNT_SECFLAVOUR	0x2000	/* 5 non-text parsed mount data only */
 #define NFS_MOUNT_NORDIRPLUS	0x4000	/* 5 */
 #define NFS_MOUNT_UNSHARED	0x8000	/* 5 */
 #define NFS_MOUNT_FLAGMASK	0xFFFF
-- 
cgit v1.2.3


From 7d1d65cb84e1cfacba3f54c5934194785259e0d8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 28 Oct 2013 16:43:02 +0100
Subject: net: sched: cls_bpf: add BPF-based classifier

This work contains a lightweight BPF-based traffic classifier that can
serve as a flexible alternative to ematch-based tree classification, i.e.
now that BPF filter engine can also be JITed in the kernel. Naturally, tc
actions and policies are supported as well with cls_bpf. Multiple BPF
programs/filter can be attached for a class, or they can just as well be
written within a single BPF program, that's really up to the user how he
wishes to run/optimize the code, e.g. also for inversion of verdicts etc.
The notion of a BPF program's return/exit codes is being kept as follows:

     0: No match
    -1: Select classid given in "tc filter ..." command
  else: flowid, overwrite the default one

As a minimal usage example with iproute2, we use a 3 band prio root qdisc
on a router with sfq each as leave, and assign ssh and icmp bpf-based
filters to band 1, http traffic to band 2 and the rest to band 3. For the
first two bands we load the bytecode from a file, in the 2nd we load it
inline as an example:

echo 1 > /proc/sys/net/core/bpf_jit_enable

tc qdisc del dev em1 root
tc qdisc add dev em1 root handle 1: prio bands 3 priomap 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

tc qdisc add dev em1 parent 1:1 sfq perturb 16
tc qdisc add dev em1 parent 1:2 sfq perturb 16
tc qdisc add dev em1 parent 1:3 sfq perturb 16

tc filter add dev em1 parent 1: bpf run bytecode-file /etc/tc/ssh.bpf flowid 1:1
tc filter add dev em1 parent 1: bpf run bytecode-file /etc/tc/icmp.bpf flowid 1:1
tc filter add dev em1 parent 1: bpf run bytecode-file /etc/tc/http.bpf flowid 1:2
tc filter add dev em1 parent 1: bpf run bytecode "`bpfc -f tc -i misc.ops`" flowid 1:3

BPF programs can be easily created and passed to tc, either as inline
'bytecode' or 'bytecode-file'. There are a couple of front-ends that can
compile opcodes, for example:

1) People familiar with tcpdump-like filters:

   tcpdump -iem1 -ddd port 22 | tr '\n' ',' > /etc/tc/ssh.bpf

2) People that want to low-level program their filters or use BPF
   extensions that lack support by libpcap's compiler:

   bpfc -f tc -i ssh.ops > /etc/tc/ssh.bpf

   ssh.ops example code:
   ldh [12]
   jne #0x800, drop
   ldb [23]
   jneq #6, drop
   ldh [20]
   jset #0x1fff, drop
   ldxb 4 * ([14] & 0xf)
   ldh [%x + 14]
   jeq #0x16, pass
   ldh [%x + 16]
   jne #0x16, drop
   pass: ret #-1
   drop: ret #0

It was chosen to load bytecode into tc, since the reverse operation,
tc filter list dev em1, is then able to show the exact commands again.
Possible follow-up work could also include a small expression compiler
for iproute2. Tested with the help of bmon. This idea came up during
the Netfilter Workshop 2013 in Copenhagen. Also thanks to feedback from
Eric Dumazet!

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  14 ++
 net/sched/Kconfig            |  10 ++
 net/sched/Makefile           |   1 +
 net/sched/cls_bpf.c          | 385 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 410 insertions(+)
 create mode 100644 net/sched/cls_bpf.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 082eafaf026b..25731dfb3fcc 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -388,6 +388,20 @@ enum {
 
 #define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1)
 
+/* BPF classifier */
+
+enum {
+	TCA_BPF_UNSPEC,
+	TCA_BPF_ACT,
+	TCA_BPF_POLICE,
+	TCA_BPF_CLASSID,
+	TCA_BPF_OPS_LEN,
+	TCA_BPF_OPS,
+	__TCA_BPF_MAX,
+};
+
+#define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
+
 /* Extended Matches */
 
 struct tcf_ematch_tree_hdr {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index c03a32a0418e..ad1f1d819203 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -443,6 +443,16 @@ config NET_CLS_CGROUP
 	  To compile this code as a module, choose M here: the
 	  module will be called cls_cgroup.
 
+config NET_CLS_BPF
+	tristate "BPF-based classifier"
+	select NET_CLS
+	---help---
+	  If you say Y here, you will be able to classify packets based on
+	  programmable BPF (JIT'ed) filters as an alternative to ematches.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called cls_bpf.
+
 config NET_EMATCH
 	bool "Extended Matches"
 	select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index e5f9abe9a5db..35fa47a494ab 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o
 obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o
 obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o
 obj-$(CONFIG_NET_CLS_CGROUP)	+= cls_cgroup.o
+obj-$(CONFIG_NET_CLS_BPF)	+= cls_bpf.o
 obj-$(CONFIG_NET_EMATCH)	+= ematch.o
 obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o
 obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
new file mode 100644
index 000000000000..1002a8226281
--- /dev/null
+++ b/net/sched/cls_bpf.c
@@ -0,0 +1,385 @@
+/*
+ * Berkeley Packet Filter based traffic classifier
+ *
+ * Might be used to classify traffic through flexible, user-defined and
+ * possibly JIT-ed BPF filters for traffic control as an alternative to
+ * ematches.
+ *
+ * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_DESCRIPTION("TC BPF based classifier");
+
+struct cls_bpf_head {
+	struct list_head plist;
+	u32 hgen;
+};
+
+struct cls_bpf_prog {
+	struct sk_filter *filter;
+	struct sock_filter *bpf_ops;
+	struct tcf_exts exts;
+	struct tcf_result res;
+	struct list_head link;
+	u32 handle;
+	u16 bpf_len;
+};
+
+static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
+	[TCA_BPF_CLASSID]	= { .type = NLA_U32 },
+	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 },
+	[TCA_BPF_OPS]		= { .type = NLA_BINARY,
+				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
+};
+
+static const struct tcf_ext_map bpf_ext_map = {
+	.action = TCA_BPF_ACT,
+	.police = TCA_BPF_POLICE,
+};
+
+static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+			    struct tcf_result *res)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog;
+	int ret;
+
+	list_for_each_entry(prog, &head->plist, link) {
+		int filter_res = SK_RUN_FILTER(prog->filter, skb);
+
+		if (filter_res == 0)
+			continue;
+
+		*res = prog->res;
+		if (filter_res != -1)
+			res->classid = filter_res;
+
+		ret = tcf_exts_exec(skb, &prog->exts, res);
+		if (ret < 0)
+			continue;
+
+		return ret;
+	}
+
+	return -1;
+}
+
+static int cls_bpf_init(struct tcf_proto *tp)
+{
+	struct cls_bpf_head *head;
+
+	head = kzalloc(sizeof(*head), GFP_KERNEL);
+	if (head == NULL)
+		return -ENOBUFS;
+
+	INIT_LIST_HEAD(&head->plist);
+	tp->root = head;
+
+	return 0;
+}
+
+static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+{
+	tcf_unbind_filter(tp, &prog->res);
+	tcf_exts_destroy(tp, &prog->exts);
+
+	sk_unattached_filter_destroy(prog->filter);
+
+	kfree(prog->bpf_ops);
+	kfree(prog);
+}
+
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg;
+
+	list_for_each_entry(prog, &head->plist, link) {
+		if (prog == todel) {
+			tcf_tree_lock(tp);
+			list_del(&prog->link);
+			tcf_tree_unlock(tp);
+
+			cls_bpf_delete_prog(tp, prog);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static void cls_bpf_destroy(struct tcf_proto *tp)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog, *tmp;
+
+	list_for_each_entry_safe(prog, tmp, &head->plist, link) {
+		list_del(&prog->link);
+		cls_bpf_delete_prog(tp, prog);
+	}
+
+	kfree(head);
+}
+
+static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog;
+	unsigned long ret = 0UL;
+
+	if (head == NULL)
+		return 0UL;
+
+	list_for_each_entry(prog, &head->plist, link) {
+		if (prog->handle == handle) {
+			ret = (unsigned long) prog;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void cls_bpf_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
+				   struct cls_bpf_prog *prog,
+				   unsigned long base, struct nlattr **tb,
+				   struct nlattr *est)
+{
+	struct sock_filter *bpf_ops, *bpf_old;
+	struct tcf_exts exts;
+	struct sock_fprog tmp;
+	struct sk_filter *fp, *fp_old;
+	u16 bpf_size, bpf_len;
+	u32 classid;
+	int ret;
+
+	if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID])
+		return -EINVAL;
+
+	ret = tcf_exts_validate(net, tp, tb, est, &exts, &bpf_ext_map);
+	if (ret < 0)
+		return ret;
+
+	classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+	bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
+	if (bpf_len > BPF_MAXINSNS || bpf_len == 0) {
+		ret = -EINVAL;
+		goto errout;
+	}
+
+	bpf_size = bpf_len * sizeof(*bpf_ops);
+	bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+	if (bpf_ops == NULL) {
+		ret = -ENOMEM;
+		goto errout;
+	}
+
+	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
+
+	tmp.len = bpf_len;
+	tmp.filter = (struct sock_filter __user *) bpf_ops;
+
+	ret = sk_unattached_filter_create(&fp, &tmp);
+	if (ret)
+		goto errout_free;
+
+	tcf_tree_lock(tp);
+	fp_old = prog->filter;
+	bpf_old = prog->bpf_ops;
+
+	prog->bpf_len = bpf_len;
+	prog->bpf_ops = bpf_ops;
+	prog->filter = fp;
+	prog->res.classid = classid;
+	tcf_tree_unlock(tp);
+
+	tcf_bind_filter(tp, &prog->res, base);
+	tcf_exts_change(tp, &prog->exts, &exts);
+
+	if (fp_old)
+		sk_unattached_filter_destroy(fp_old);
+	if (bpf_old)
+		kfree(bpf_old);
+
+	return 0;
+
+errout_free:
+	kfree(bpf_ops);
+errout:
+	tcf_exts_destroy(tp, &exts);
+	return ret;
+}
+
+static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
+				   struct cls_bpf_head *head)
+{
+	unsigned int i = 0x80000000;
+
+	do {
+		if (++head->hgen == 0x7FFFFFFF)
+			head->hgen = 1;
+	} while (--i > 0 && cls_bpf_get(tp, head->hgen));
+	if (i == 0)
+		pr_err("Insufficient number of handles\n");
+
+	return i;
+}
+
+static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
+			  struct tcf_proto *tp, unsigned long base,
+			  u32 handle, struct nlattr **tca,
+			  unsigned long *arg)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg;
+	struct nlattr *tb[TCA_BPF_MAX + 1];
+	int ret;
+
+	if (tca[TCA_OPTIONS] == NULL)
+		return -EINVAL;
+
+	ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy);
+	if (ret < 0)
+		return ret;
+
+	if (prog != NULL) {
+		if (handle && prog->handle != handle)
+			return -EINVAL;
+		return cls_bpf_modify_existing(net, tp, prog, base, tb,
+					       tca[TCA_RATE]);
+	}
+
+	prog = kzalloc(sizeof(*prog), GFP_KERNEL);
+	if (prog == NULL)
+		return -ENOBUFS;
+
+	if (handle == 0)
+		prog->handle = cls_bpf_grab_new_handle(tp, head);
+	else
+		prog->handle = handle;
+	if (prog->handle == 0) {
+		ret = -EINVAL;
+		goto errout;
+	}
+
+	ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE]);
+	if (ret < 0)
+		goto errout;
+
+	tcf_tree_lock(tp);
+	list_add(&prog->link, &head->plist);
+	tcf_tree_unlock(tp);
+
+	*arg = (unsigned long) prog;
+
+	return 0;
+errout:
+	if (*arg == 0UL && prog)
+		kfree(prog);
+
+	return ret;
+}
+
+static int cls_bpf_dump(struct tcf_proto *tp, unsigned long fh,
+			struct sk_buff *skb, struct tcmsg *tm)
+{
+	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
+	struct nlattr *nest, *nla;
+
+	if (prog == NULL)
+		return skb->len;
+
+	tm->tcm_handle = prog->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
+		goto nla_put_failure;
+	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len))
+		goto nla_put_failure;
+
+	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len *
+			  sizeof(struct sock_filter));
+	if (nla == NULL)
+		goto nla_put_failure;
+
+        memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
+
+	if (tcf_exts_dump(skb, &prog->exts, &bpf_ext_map) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &prog->exts, &bpf_ext_map) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog;
+
+	list_for_each_entry(prog, &head->plist, link) {
+		if (arg->count < arg->skip)
+			goto skip;
+		if (arg->fn(tp, (unsigned long) prog, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+skip:
+		arg->count++;
+	}
+}
+
+static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
+	.kind		=	"bpf",
+	.owner		=	THIS_MODULE,
+	.classify	=	cls_bpf_classify,
+	.init		=	cls_bpf_init,
+	.destroy	=	cls_bpf_destroy,
+	.get		=	cls_bpf_get,
+	.put		=	cls_bpf_put,
+	.change		=	cls_bpf_change,
+	.delete		=	cls_bpf_delete,
+	.walk		=	cls_bpf_walk,
+	.dump		=	cls_bpf_dump,
+};
+
+static int __init cls_bpf_init_mod(void)
+{
+	return register_tcf_proto_ops(&cls_bpf_ops);
+}
+
+static void __exit cls_bpf_exit_mod(void)
+{
+	unregister_tcf_proto_ops(&cls_bpf_ops);
+}
+
+module_init(cls_bpf_init_mod);
+module_exit(cls_bpf_exit_mod);
-- 
cgit v1.2.3


From 9c15bb1d0a8411f9bb3395d21d5309bde7da0c1c Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sun, 22 Sep 2013 16:44:50 +0200
Subject: kvm: Add KVM_GET_EMULATED_CPUID

Add a kvm ioctl which states which system functionality kvm emulates.
The format used is that of CPUID and we return the corresponding CPUID
bits set for which we do emulate functionality.

Make sure ->padding is being passed on clean from userspace so that we
can use it for something in the future, after the ioctl gets cast in
stone.

s/kvm_dev_ioctl_get_supported_cpuid/kvm_dev_ioctl_get_cpuid/ while at
it.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 77 +++++++++++++++++++++++++++++++++++++--
 arch/x86/include/uapi/asm/kvm.h   |  6 +--
 arch/x86/kvm/cpuid.c              | 57 ++++++++++++++++++++++++++---
 arch/x86/kvm/cpuid.h              |  5 ++-
 arch/x86/kvm/x86.c                |  9 +++--
 include/uapi/linux/kvm.h          |  2 +
 6 files changed, 139 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a89a5ee0b940..488964414f04 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1122,9 +1122,9 @@ struct kvm_cpuid2 {
 	struct kvm_cpuid_entry2 entries[0];
 };
 
-#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
-#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
-#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX		BIT(0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC		BIT(1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT		BIT(2)
 
 struct kvm_cpuid_entry2 {
 	__u32 function;
@@ -2684,6 +2684,77 @@ and usually define the validity of a groups of registers. (e.g. one bit
 };
 
 
+4.81 KVM_GET_EMULATED_CPUID
+
+Capability: KVM_CAP_EXT_EMUL_CPUID
+Architectures: x86
+Type: system ioctl
+Parameters: struct kvm_cpuid2 (in/out)
+Returns: 0 on success, -1 on error
+
+struct kvm_cpuid2 {
+	__u32 nent;
+	__u32 flags;
+	struct kvm_cpuid_entry2 entries[0];
+};
+
+The member 'flags' is used for passing flags from userspace.
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX		BIT(0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC		BIT(1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT		BIT(2)
+
+struct kvm_cpuid_entry2 {
+	__u32 function;
+	__u32 index;
+	__u32 flags;
+	__u32 eax;
+	__u32 ebx;
+	__u32 ecx;
+	__u32 edx;
+	__u32 padding[3];
+};
+
+This ioctl returns x86 cpuid features which are emulated by
+kvm.Userspace can use the information returned by this ioctl to query
+which features are emulated by kvm instead of being present natively.
+
+Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2
+structure with the 'nent' field indicating the number of entries in
+the variable-size array 'entries'. If the number of entries is too low
+to describe the cpu capabilities, an error (E2BIG) is returned. If the
+number is too high, the 'nent' field is adjusted and an error (ENOMEM)
+is returned. If the number is just right, the 'nent' field is adjusted
+to the number of valid entries in the 'entries' array, which is then
+filled.
+
+The entries returned are the set CPUID bits of the respective features
+which kvm emulates, as returned by the CPUID instruction, with unknown
+or unsupported feature bits cleared.
+
+Features like x2apic, for example, may not be present in the host cpu
+but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be
+emulated efficiently and thus not included here.
+
+The fields in each entry are defined as follows:
+
+  function: the eax value used to obtain the entry
+  index: the ecx value used to obtain the entry (for entries that are
+         affected by ecx)
+  flags: an OR of zero or more of the following:
+        KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
+           if the index field is valid
+        KVM_CPUID_FLAG_STATEFUL_FUNC:
+           if cpuid for this function returns different values for successive
+           invocations; there will be several entries with the same function,
+           all with this flag set
+        KVM_CPUID_FLAG_STATE_READ_NEXT:
+           for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
+           the first entry to be read by a cpu
+   eax, ebx, ecx, edx: the values returned by the cpuid instruction for
+         this function/index combination
+
+
 6. Capabilities that can be enabled
 -----------------------------------
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5d9a3033b3d7..d3a87780c70b 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -211,9 +211,9 @@ struct kvm_cpuid_entry2 {
 	__u32 padding[3];
 };
 
-#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
-#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
-#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX		BIT(0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC		BIT(1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT		BIT(2)
 
 /* for KVM_SET_CPUID2 */
 struct kvm_cpuid2 {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0a1e3b8b964d..78a4439bfdc5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -219,8 +219,14 @@ static bool supported_xcr0_bit(unsigned bit)
 
 #define F(x) bit(X86_FEATURE_##x)
 
-static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			 u32 index, int *nent, int maxnent)
+static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
+				   u32 func, u32 index, int *nent, int maxnent)
+{
+	return 0;
+}
+
+static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+				 u32 index, int *nent, int maxnent)
 {
 	int r;
 	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
@@ -515,6 +521,15 @@ out:
 	return r;
 }
 
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
+			u32 idx, int *nent, int maxnent, unsigned int type)
+{
+	if (type == KVM_GET_EMULATED_CPUID)
+		return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
+
+	return __do_cpuid_ent(entry, func, idx, nent, maxnent);
+}
+
 #undef F
 
 struct kvm_cpuid_param {
@@ -529,8 +544,34 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
 	return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
 }
 
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				      struct kvm_cpuid_entry2 __user *entries)
+static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
+				 __u32 num_entries, unsigned int ioctl_type)
+{
+	int i;
+
+	if (ioctl_type != KVM_GET_EMULATED_CPUID)
+		return false;
+
+	/*
+	 * We want to make sure that ->padding is being passed clean from
+	 * userspace in case we want to use it for something in the future.
+	 *
+	 * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
+	 * have to give ourselves satisfied only with the emulated side. /me
+	 * sheds a tear.
+	 */
+	for (i = 0; i < num_entries; i++) {
+		if (entries[i].padding[0] ||
+		    entries[i].padding[1] ||
+		    entries[i].padding[2])
+			return true;
+	}
+	return false;
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+			    struct kvm_cpuid_entry2 __user *entries,
+			    unsigned int type)
 {
 	struct kvm_cpuid_entry2 *cpuid_entries;
 	int limit, nent = 0, r = -E2BIG, i;
@@ -547,6 +588,10 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 		goto out;
 	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
 		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+
+	if (sanity_check_entries(entries, cpuid->nent, type))
+		return -EINVAL;
+
 	r = -ENOMEM;
 	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
 	if (!cpuid_entries)
@@ -560,7 +605,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 			continue;
 
 		r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
-				&nent, cpuid->nent);
+				&nent, cpuid->nent, type);
 
 		if (r)
 			goto out_free;
@@ -571,7 +616,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 		limit = cpuid_entries[nent - 1].eax;
 		for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
 			r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
-				     &nent, cpuid->nent);
+				     &nent, cpuid->nent, type);
 
 		if (r)
 			goto out_free;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index b7fd07984888..f1e4895174b2 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -6,8 +6,9 @@
 void kvm_update_cpuid(struct kvm_vcpu *vcpu);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 					      u32 function, u32 index);
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				      struct kvm_cpuid_entry2 __user *entries);
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+			    struct kvm_cpuid_entry2 __user *entries,
+			    unsigned int type);
 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 			     struct kvm_cpuid *cpuid,
 			     struct kvm_cpuid_entry __user *entries);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index edf2a07df3a3..3d1e3ee8b39e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2564,6 +2564,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
 	case KVM_CAP_SET_TSS_ADDR:
 	case KVM_CAP_EXT_CPUID:
+	case KVM_CAP_EXT_EMUL_CPUID:
 	case KVM_CAP_CLOCKSOURCE:
 	case KVM_CAP_PIT:
 	case KVM_CAP_NOP_IO_DELAY:
@@ -2673,15 +2674,17 @@ long kvm_arch_dev_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
-	case KVM_GET_SUPPORTED_CPUID: {
+	case KVM_GET_SUPPORTED_CPUID:
+	case KVM_GET_EMULATED_CPUID: {
 		struct kvm_cpuid2 __user *cpuid_arg = argp;
 		struct kvm_cpuid2 cpuid;
 
 		r = -EFAULT;
 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
 			goto out;
-		r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
-						      cpuid_arg->entries);
+
+		r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
+					    ioctl);
 		if (r)
 			goto out;
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e32e776f20c0..32c60b98ddf8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -541,6 +541,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_TRACE_ENABLE          __KVM_DEPRECATED_MAIN_W_0x06
 #define KVM_TRACE_PAUSE           __KVM_DEPRECATED_MAIN_0x07
 #define KVM_TRACE_DISABLE         __KVM_DEPRECATED_MAIN_0x08
+#define KVM_GET_EMULATED_CPUID	  _IOWR(KVMIO, 0x09, struct kvm_cpuid2)
 
 /*
  * Extension capability list.
@@ -668,6 +669,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_ARM_EL1_32BIT 93
 #define KVM_CAP_SPAPR_MULTITCE 94
+#define KVM_CAP_EXT_EMUL_CPUID 95
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From ec53500fae421e07c5d035918ca454a429732ef4 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 30 Oct 2013 11:02:17 -0600
Subject: kvm: Add VFIO device

So far we've succeeded at making KVM and VFIO mostly unaware of each
other, but areas are cropping up where a connection beyond eventfds
and irqfds needs to be made.  This patch introduces a KVM-VFIO device
that is meant to be a gateway for such interaction.  The user creates
the device and can add and remove VFIO groups to it via file
descriptors.  When a group is added, KVM verifies the group is valid
and gets a reference to it via the VFIO external user interface.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/devices/vfio.txt |  22 +++
 arch/x86/kvm/Kconfig                       |   1 +
 arch/x86/kvm/Makefile                      |   2 +-
 include/linux/kvm_host.h                   |   1 +
 include/uapi/linux/kvm.h                   |   4 +
 virt/kvm/Kconfig                           |   3 +
 virt/kvm/kvm_main.c                        |   5 +
 virt/kvm/vfio.c                            | 220 +++++++++++++++++++++++++++++
 8 files changed, 257 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/virtual/kvm/devices/vfio.txt
 create mode 100644 virt/kvm/vfio.c

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt
new file mode 100644
index 000000000000..ef51740c67ca
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/vfio.txt
@@ -0,0 +1,22 @@
+VFIO virtual device
+===================
+
+Device types supported:
+  KVM_DEV_TYPE_VFIO
+
+Only one VFIO instance may be created per VM.  The created device
+tracks VFIO groups in use by the VM and features of those groups
+important to the correctness and acceleration of the VM.  As groups
+are enabled and disabled for use by the VM, KVM should be updated
+about their presence.  When registered with KVM, a reference to the
+VFIO-group is held by KVM.
+
+Groups:
+  KVM_DEV_VFIO_GROUP
+
+KVM_DEV_VFIO_GROUP attributes:
+  KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking
+  KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking
+
+For each, kvm_device_attr.addr points to an int32_t file descriptor
+for the VFIO group.
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a47a3e54b964..b89c5db2b832 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -38,6 +38,7 @@ config KVM
 	select PERF_EVENTS
 	select HAVE_KVM_MSI
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
+	select KVM_VFIO
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index bf4fb04d0112..25d22b2d6509 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -9,7 +9,7 @@ KVM := ../../../virt/kvm
 
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/ioapic.o \
 				$(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
-				$(KVM)/eventfd.o $(KVM)/irqchip.o
+				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(KVM)/assigned-dev.o $(KVM)/iommu.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c9d4236ab442..7beddbd38ac7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1058,6 +1058,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp);
 
 extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
+extern struct kvm_device_ops kvm_vfio_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 32c60b98ddf8..509cfbfe9658 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -845,6 +845,10 @@ struct kvm_device_attr {
 #define KVM_DEV_TYPE_FSL_MPIC_20	1
 #define KVM_DEV_TYPE_FSL_MPIC_42	2
 #define KVM_DEV_TYPE_XICS		3
+#define KVM_DEV_TYPE_VFIO		4
+#define  KVM_DEV_VFIO_GROUP			1
+#define   KVM_DEV_VFIO_GROUP_ADD			1
+#define   KVM_DEV_VFIO_GROUP_DEL			2
 
 /*
  * ioctls for VM fds
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 779262f59e25..fbe1a48bd629 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -27,3 +27,6 @@ config HAVE_KVM_MSI
 
 config HAVE_KVM_CPU_RELAX_INTERCEPT
        bool
+
+config KVM_VFIO
+       bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9ca014da134d..82c4047aa0e3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2270,6 +2270,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	case KVM_DEV_TYPE_XICS:
 		ops = &kvm_xics_ops;
 		break;
+#endif
+#ifdef CONFIG_KVM_VFIO
+	case KVM_DEV_TYPE_VFIO:
+		ops = &kvm_vfio_ops;
+		break;
 #endif
 	default:
 		return -ENODEV;
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
new file mode 100644
index 000000000000..597c258245ea
--- /dev/null
+++ b/virt/kvm/vfio.c
@@ -0,0 +1,220 @@
+/*
+ * VFIO-KVM bridge pseudo device
+ *
+ * Copyright (C) 2013 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+struct kvm_vfio_group {
+	struct list_head node;
+	struct vfio_group *vfio_group;
+};
+
+struct kvm_vfio {
+	struct list_head group_list;
+	struct mutex lock;
+};
+
+static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep)
+{
+	struct vfio_group *vfio_group;
+	struct vfio_group *(*fn)(struct file *);
+
+	fn = symbol_get(vfio_group_get_external_user);
+	if (!fn)
+		return ERR_PTR(-EINVAL);
+
+	vfio_group = fn(filep);
+
+	symbol_put(vfio_group_get_external_user);
+
+	return vfio_group;
+}
+
+static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group)
+{
+	void (*fn)(struct vfio_group *);
+
+	fn = symbol_get(vfio_group_put_external_user);
+	if (!fn)
+		return;
+
+	fn(vfio_group);
+
+	symbol_put(vfio_group_put_external_user);
+}
+
+static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
+{
+	struct kvm_vfio *kv = dev->private;
+	struct vfio_group *vfio_group;
+	struct kvm_vfio_group *kvg;
+	void __user *argp = (void __user *)arg;
+	struct fd f;
+	int32_t fd;
+	int ret;
+
+	switch (attr) {
+	case KVM_DEV_VFIO_GROUP_ADD:
+		if (get_user(fd, (int32_t __user *)argp))
+			return -EFAULT;
+
+		f = fdget(fd);
+		if (!f.file)
+			return -EBADF;
+
+		vfio_group = kvm_vfio_group_get_external_user(f.file);
+		fdput(f);
+
+		if (IS_ERR(vfio_group))
+			return PTR_ERR(vfio_group);
+
+		mutex_lock(&kv->lock);
+
+		list_for_each_entry(kvg, &kv->group_list, node) {
+			if (kvg->vfio_group == vfio_group) {
+				mutex_unlock(&kv->lock);
+				kvm_vfio_group_put_external_user(vfio_group);
+				return -EEXIST;
+			}
+		}
+
+		kvg = kzalloc(sizeof(*kvg), GFP_KERNEL);
+		if (!kvg) {
+			mutex_unlock(&kv->lock);
+			kvm_vfio_group_put_external_user(vfio_group);
+			return -ENOMEM;
+		}
+
+		list_add_tail(&kvg->node, &kv->group_list);
+		kvg->vfio_group = vfio_group;
+
+		mutex_unlock(&kv->lock);
+
+		return 0;
+
+	case KVM_DEV_VFIO_GROUP_DEL:
+		if (get_user(fd, (int32_t __user *)argp))
+			return -EFAULT;
+
+		f = fdget(fd);
+		if (!f.file)
+			return -EBADF;
+
+		vfio_group = kvm_vfio_group_get_external_user(f.file);
+		fdput(f);
+
+		if (IS_ERR(vfio_group))
+			return PTR_ERR(vfio_group);
+
+		ret = -ENOENT;
+
+		mutex_lock(&kv->lock);
+
+		list_for_each_entry(kvg, &kv->group_list, node) {
+			if (kvg->vfio_group != vfio_group)
+				continue;
+
+			list_del(&kvg->node);
+			kvm_vfio_group_put_external_user(kvg->vfio_group);
+			kfree(kvg);
+			ret = 0;
+			break;
+		}
+
+		mutex_unlock(&kv->lock);
+
+		kvm_vfio_group_put_external_user(vfio_group);
+
+		return ret;
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_vfio_set_attr(struct kvm_device *dev,
+			     struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_VFIO_GROUP:
+		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_vfio_has_attr(struct kvm_device *dev,
+			     struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_VFIO_GROUP:
+		switch (attr->attr) {
+		case KVM_DEV_VFIO_GROUP_ADD:
+		case KVM_DEV_VFIO_GROUP_DEL:
+			return 0;
+		}
+
+		break;
+	}
+
+	return -ENXIO;
+}
+
+static void kvm_vfio_destroy(struct kvm_device *dev)
+{
+	struct kvm_vfio *kv = dev->private;
+	struct kvm_vfio_group *kvg, *tmp;
+
+	list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
+		kvm_vfio_group_put_external_user(kvg->vfio_group);
+		list_del(&kvg->node);
+		kfree(kvg);
+	}
+
+	kfree(kv);
+	kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */
+}
+
+static int kvm_vfio_create(struct kvm_device *dev, u32 type)
+{
+	struct kvm_device *tmp;
+	struct kvm_vfio *kv;
+
+	/* Only one VFIO "device" per VM */
+	list_for_each_entry(tmp, &dev->kvm->devices, vm_node)
+		if (tmp->ops == &kvm_vfio_ops)
+			return -EBUSY;
+
+	kv = kzalloc(sizeof(*kv), GFP_KERNEL);
+	if (!kv)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&kv->group_list);
+	mutex_init(&kv->lock);
+
+	dev->private = kv;
+
+	return 0;
+}
+
+struct kvm_device_ops kvm_vfio_ops = {
+	.name = "kvm-vfio",
+	.create = kvm_vfio_create,
+	.destroy = kvm_vfio_destroy,
+	.set_attr = kvm_vfio_set_attr,
+	.has_attr = kvm_vfio_has_attr,
+};
-- 
cgit v1.2.3


From 5eb26b156e29eadcc21f73fb5d14497f0db24b86 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jrajahalme@nicira.com>
Date: Wed, 23 Oct 2013 01:44:59 -0700
Subject: openvswitch: TCP flags matching support.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

    tcp_flags=flags/mask
        Bitwise  match on TCP flags.  The flags and mask are 16-bit num‐
        bers written in decimal or in hexadecimal prefixed by 0x.   Each
        1-bit  in  mask requires that the corresponding bit in port must
        match.  Each 0-bit in mask causes the corresponding  bit  to  be
        ignored.

        TCP  protocol  currently  defines  9 flag bits, and additional 3
        bits are reserved (must be transmitted as zero), see  RFCs  793,
        3168, and 3540.  The flag bits are, numbering from the least
        significant bit:

        0: FIN No more data from sender.

        1: SYN Synchronize sequence numbers.

        2: RST Reset the connection.

        3: PSH Push function.

        4: ACK Acknowledgement field significant.

        5: URG Urgent pointer field significant.

        6: ECE ECN Echo.

        7: CWR Congestion Windows Reduced.

        8: NS  Nonce Sum.

        9-11:  Reserved.

        12-15: Not matchable, must be zero.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 include/uapi/linux/openvswitch.h |  1 +
 net/openvswitch/flow.c           |  2 ++
 net/openvswitch/flow.h           |  2 ++
 net/openvswitch/flow_netlink.c   | 31 +++++++++++++++++++++++++++++--
 4 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 2cc4644f68ef..d120f9fe0017 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -271,6 +271,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_SKB_MARK,  /* u32 skb mark */
 	OVS_KEY_ATTR_TUNNEL,    /* Nested set of ovs_tunnel attributes */
 	OVS_KEY_ATTR_SCTP,      /* struct ovs_key_sctp */
+	OVS_KEY_ATTR_TCP_FLAGS,	/* be16 TCP flags. */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index b73c7680a3d2..b409f5279601 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -428,6 +428,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 				struct tcphdr *tcp = tcp_hdr(skb);
 				key->ipv4.tp.src = tcp->source;
 				key->ipv4.tp.dst = tcp->dest;
+				key->ipv4.tp.flags = TCP_FLAGS_BE16(tcp);
 			}
 		} else if (key->ip.proto == IPPROTO_UDP) {
 			if (udphdr_ok(skb)) {
@@ -496,6 +497,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 				struct tcphdr *tcp = tcp_hdr(skb);
 				key->ipv6.tp.src = tcp->source;
 				key->ipv6.tp.dst = tcp->dest;
+				key->ipv6.tp.flags = TCP_FLAGS_BE16(tcp);
 			}
 		} else if (key->ip.proto == NEXTHDR_UDP) {
 			if (udphdr_ok(skb)) {
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 204e0ccd116d..1510f51dbf74 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -93,6 +93,7 @@ struct sw_flow_key {
 				struct {
 					__be16 src;		/* TCP/UDP/SCTP source port. */
 					__be16 dst;		/* TCP/UDP/SCTP destination port. */
+					__be16 flags;		/* TCP flags. */
 				} tp;
 				struct {
 					u8 sha[ETH_ALEN];	/* ARP source hardware address. */
@@ -109,6 +110,7 @@ struct sw_flow_key {
 			struct {
 				__be16 src;		/* TCP/UDP/SCTP source port. */
 				__be16 dst;		/* TCP/UDP/SCTP destination port. */
+				__be16 flags;		/* TCP flags. */
 			} tp;
 			struct {
 				struct in6_addr target;	/* ND target address. */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index e04649c56a96..2bc1bc1aca3b 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -114,6 +114,7 @@ static bool match_validate(const struct sw_flow_match *match,
 	mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
 			| (1 << OVS_KEY_ATTR_IPV6)
 			| (1 << OVS_KEY_ATTR_TCP)
+			| (1 << OVS_KEY_ATTR_TCP_FLAGS)
 			| (1 << OVS_KEY_ATTR_UDP)
 			| (1 << OVS_KEY_ATTR_SCTP)
 			| (1 << OVS_KEY_ATTR_ICMP)
@@ -154,8 +155,11 @@ static bool match_validate(const struct sw_flow_match *match,
 
 			if (match->key->ip.proto == IPPROTO_TCP) {
 				key_expected |= 1 << OVS_KEY_ATTR_TCP;
-				if (match->mask && (match->mask->key.ip.proto == 0xff))
+				key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+				if (match->mask && (match->mask->key.ip.proto == 0xff)) {
 					mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+					mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+				}
 			}
 
 			if (match->key->ip.proto == IPPROTO_ICMP) {
@@ -186,8 +190,11 @@ static bool match_validate(const struct sw_flow_match *match,
 
 			if (match->key->ip.proto == IPPROTO_TCP) {
 				key_expected |= 1 << OVS_KEY_ATTR_TCP;
-				if (match->mask && (match->mask->key.ip.proto == 0xff))
+				key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+				if (match->mask && (match->mask->key.ip.proto == 0xff)) {
 					mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+					mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+				}
 			}
 
 			if (match->key->ip.proto == IPPROTO_ICMPV6) {
@@ -235,6 +242,7 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
 	[OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
 	[OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
+	[OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16),
 	[OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
 	[OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
 	[OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
@@ -634,6 +642,19 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  u64 attrs,
 		attrs &= ~(1 << OVS_KEY_ATTR_TCP);
 	}
 
+	if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) {
+		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+			SW_FLOW_KEY_PUT(match, ipv4.tp.flags,
+					nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+					is_mask);
+		} else {
+			SW_FLOW_KEY_PUT(match, ipv6.tp.flags,
+					nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+					is_mask);
+		}
+		attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS);
+	}
+
 	if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
 		const struct ovs_key_udp *udp_key;
 
@@ -1004,9 +1025,15 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
 			if (swkey->eth.type == htons(ETH_P_IP)) {
 				tcp_key->tcp_src = output->ipv4.tp.src;
 				tcp_key->tcp_dst = output->ipv4.tp.dst;
+				if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
+						 output->ipv4.tp.flags))
+					goto nla_put_failure;
 			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
 				tcp_key->tcp_src = output->ipv6.tp.src;
 				tcp_key->tcp_dst = output->ipv6.tp.dst;
+				if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
+						 output->ipv6.tp.flags))
+					goto nla_put_failure;
 			}
 		} else if (swkey->ip.proto == IPPROTO_UDP) {
 			struct ovs_key_udp *udp_key;
-- 
cgit v1.2.3


From f421436a591d34fa5279b54a96ac07d70250cc8d Mon Sep 17 00:00:00 2001
From: Arvid Brodin <Arvid.Brodin@xdin.com>
Date: Wed, 30 Oct 2013 21:10:47 +0100
Subject: net/hsr: Add support for the High-availability Seamless Redundancy
 protocol (HSRv0)

High-availability Seamless Redundancy ("HSR") provides instant failover
redundancy for Ethernet networks. It requires a special network topology where
all nodes are connected in a ring (each node having two physical network
interfaces). It is suited for applications that demand high availability and
very short reaction time.

HSR acts on the Ethernet layer, using a registered Ethernet protocol type to
send special HSR frames in both directions over the ring. The driver creates
virtual network interfaces that can be used just like any ordinary Linux
network interface, for IP/TCP/UDP traffic etc. All nodes in the network ring
must be HSR capable.

This code is a "best effort" to comply with the HSR standard as described in
IEC 62439-3:2010 (HSRv0).

Signed-off-by: Arvid Brodin <arvid.brodin@xdin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/hsr_netlink.h |  50 ++++
 include/uapi/linux/if_ether.h    |   1 +
 include/uapi/linux/if_link.h     |  13 +
 net/Kconfig                      |   1 +
 net/Makefile                     |   1 +
 net/hsr/Kconfig                  |  27 ++
 net/hsr/Makefile                 |   7 +
 net/hsr/hsr_device.c             | 596 +++++++++++++++++++++++++++++++++++++++
 net/hsr/hsr_device.h             |  29 ++
 net/hsr/hsr_framereg.c           | 503 +++++++++++++++++++++++++++++++++
 net/hsr/hsr_framereg.h           |  53 ++++
 net/hsr/hsr_main.c               | 469 ++++++++++++++++++++++++++++++
 net/hsr/hsr_main.h               | 166 +++++++++++
 net/hsr/hsr_netlink.c            | 457 ++++++++++++++++++++++++++++++
 net/hsr/hsr_netlink.h            |  30 ++
 15 files changed, 2403 insertions(+)
 create mode 100644 include/uapi/linux/hsr_netlink.h
 create mode 100644 net/hsr/Kconfig
 create mode 100644 net/hsr/Makefile
 create mode 100644 net/hsr/hsr_device.c
 create mode 100644 net/hsr/hsr_device.h
 create mode 100644 net/hsr/hsr_framereg.c
 create mode 100644 net/hsr/hsr_framereg.h
 create mode 100644 net/hsr/hsr_main.c
 create mode 100644 net/hsr/hsr_main.h
 create mode 100644 net/hsr/hsr_netlink.c
 create mode 100644 net/hsr/hsr_netlink.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/hsr_netlink.h b/include/uapi/linux/hsr_netlink.h
new file mode 100644
index 000000000000..2475cb8a53af
--- /dev/null
+++ b/include/uapi/linux/hsr_netlink.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ *	2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ */
+
+#ifndef __UAPI_HSR_NETLINK_H
+#define __UAPI_HSR_NETLINK_H
+
+/* Generic Netlink HSR family definition
+ */
+
+/* attributes */
+enum {
+	HSR_A_UNSPEC,
+	HSR_A_NODE_ADDR,
+	HSR_A_IFINDEX,
+	HSR_A_IF1_AGE,
+	HSR_A_IF2_AGE,
+	HSR_A_NODE_ADDR_B,
+	HSR_A_IF1_SEQ,
+	HSR_A_IF2_SEQ,
+	HSR_A_IF1_IFINDEX,
+	HSR_A_IF2_IFINDEX,
+	HSR_A_ADDR_B_IFINDEX,
+	__HSR_A_MAX,
+};
+#define HSR_A_MAX (__HSR_A_MAX - 1)
+
+
+/* commands */
+enum {
+	HSR_C_UNSPEC,
+	HSR_C_RING_ERROR,
+	HSR_C_NODE_DOWN,
+	HSR_C_GET_NODE_STATUS,
+	HSR_C_SET_NODE_STATUS,
+	HSR_C_GET_NODE_LIST,
+	HSR_C_SET_NODE_LIST,
+	__HSR_C_MAX,
+};
+#define HSR_C_MAX (__HSR_C_MAX - 1)
+
+#endif /* __UAPI_HSR_NETLINK_H */
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index ade07f1c491a..2ce0f6a78fa5 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -85,6 +85,7 @@
 #define ETH_P_8021AH	0x88E7          /* 802.1ah Backbone Service Tag */
 #define ETH_P_MVRP	0x88F5          /* 802.1Q MVRP                  */
 #define ETH_P_1588	0x88F7		/* IEEE 1588 Timesync */
+#define ETH_P_PRP	0x88FB		/* IEC 62439-3 PRP/HSRv0	*/
 #define ETH_P_FCOE	0x8906		/* Fibre Channel over Ethernet  */
 #define ETH_P_TDLS	0x890D          /* TDLS */
 #define ETH_P_FIP	0x8914		/* FCoE Initialization Protocol */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8a1e346243b7..b78566f59aba 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -481,4 +481,17 @@ enum {
 
 #define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1)
 
+
+/* HSR section */
+
+enum {
+	IFLA_HSR_UNSPEC,
+	IFLA_HSR_SLAVE1,
+	IFLA_HSR_SLAVE2,
+	IFLA_HSR_MULTICAST_SPEC,
+	__IFLA_HSR_MAX,
+};
+
+#define IFLA_HSR_MAX (__IFLA_HSR_MAX - 1)
+
 #endif /* _UAPI_LINUX_IF_LINK_H */
diff --git a/net/Kconfig b/net/Kconfig
index b50dacc072f0..0715db64a5c3 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -220,6 +220,7 @@ source "net/openvswitch/Kconfig"
 source "net/vmw_vsock/Kconfig"
 source "net/netlink/Kconfig"
 source "net/mpls/Kconfig"
+source "net/hsr/Kconfig"
 
 config RPS
 	boolean
diff --git a/net/Makefile b/net/Makefile
index 9492e8cb64e9..8fa2f91517f1 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -71,3 +71,4 @@ obj-$(CONFIG_NFC)		+= nfc/
 obj-$(CONFIG_OPENVSWITCH)	+= openvswitch/
 obj-$(CONFIG_VSOCKETS)	+= vmw_vsock/
 obj-$(CONFIG_NET_MPLS_GSO)	+= mpls/
+obj-$(CONFIG_HSR)		+= hsr/
diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig
new file mode 100644
index 000000000000..0d3d709052ca
--- /dev/null
+++ b/net/hsr/Kconfig
@@ -0,0 +1,27 @@
+#
+# IEC 62439-3 High-availability Seamless Redundancy
+#
+
+config HSR
+	tristate "High-availability Seamless Redundancy (HSR)"
+	---help---
+	  If you say Y here, then your Linux box will be able to act as a
+	  DANH ("Doubly attached node implementing HSR"). For this to work,
+	  your Linux box needs (at least) two physical Ethernet interfaces,
+	  and it must be connected as a node in a ring network together with
+	  other HSR capable nodes.
+
+	  All Ethernet frames sent over the hsr device will be sent in both
+	  directions on the ring (over both slave ports), giving a redundant,
+	  instant fail-over network. Each HSR node in the ring acts like a
+	  bridge for HSR frames, but filters frames that have been forwarded
+	  earlier.
+
+	  This code is a "best effort" to comply with the HSR standard as
+	  described in IEC 62439-3:2010 (HSRv0), but no compliancy tests have
+	  been made.
+
+	  You need to perform any and all necessary tests yourself before
+	  relying on this code in a safety critical system!
+
+	  If unsure, say N.
diff --git a/net/hsr/Makefile b/net/hsr/Makefile
new file mode 100644
index 000000000000..b68359f181cc
--- /dev/null
+++ b/net/hsr/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for HSR
+#
+
+obj-$(CONFIG_HSR)	+= hsr.o
+
+hsr-y			:= hsr_main.o hsr_framereg.o hsr_device.o hsr_netlink.o
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
new file mode 100644
index 000000000000..cac505f166d5
--- /dev/null
+++ b/net/hsr/hsr_device.c
@@ -0,0 +1,596 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ *	2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ *
+ * This file contains device methods for creating, using and destroying
+ * virtual HSR devices.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include "hsr_device.h"
+#include "hsr_framereg.h"
+#include "hsr_main.h"
+
+
+static bool is_admin_up(struct net_device *dev)
+{
+	return dev && (dev->flags & IFF_UP);
+}
+
+static bool is_slave_up(struct net_device *dev)
+{
+	return dev && is_admin_up(dev) && netif_oper_up(dev);
+}
+
+static void __hsr_set_operstate(struct net_device *dev, int transition)
+{
+	write_lock_bh(&dev_base_lock);
+	if (dev->operstate != transition) {
+		dev->operstate = transition;
+		write_unlock_bh(&dev_base_lock);
+		netdev_state_change(dev);
+	} else {
+		write_unlock_bh(&dev_base_lock);
+	}
+}
+
+void hsr_set_operstate(struct net_device *hsr_dev, struct net_device *slave1,
+		       struct net_device *slave2)
+{
+	if (!is_admin_up(hsr_dev)) {
+		__hsr_set_operstate(hsr_dev, IF_OPER_DOWN);
+		return;
+	}
+
+	if (is_slave_up(slave1) || is_slave_up(slave2))
+		__hsr_set_operstate(hsr_dev, IF_OPER_UP);
+	else
+		__hsr_set_operstate(hsr_dev, IF_OPER_LOWERLAYERDOWN);
+}
+
+void hsr_set_carrier(struct net_device *hsr_dev, struct net_device *slave1,
+		     struct net_device *slave2)
+{
+	if (is_slave_up(slave1) || is_slave_up(slave2))
+		netif_carrier_on(hsr_dev);
+	else
+		netif_carrier_off(hsr_dev);
+}
+
+
+void hsr_check_announce(struct net_device *hsr_dev, int old_operstate)
+{
+	struct hsr_priv *hsr_priv;
+
+	hsr_priv = netdev_priv(hsr_dev);
+
+	if ((hsr_dev->operstate == IF_OPER_UP) && (old_operstate != IF_OPER_UP)) {
+		/* Went up */
+		hsr_priv->announce_count = 0;
+		hsr_priv->announce_timer.expires = jiffies +
+				msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+		add_timer(&hsr_priv->announce_timer);
+	}
+
+	if ((hsr_dev->operstate != IF_OPER_UP) && (old_operstate == IF_OPER_UP))
+		/* Went down */
+		del_timer(&hsr_priv->announce_timer);
+}
+
+
+int hsr_get_max_mtu(struct hsr_priv *hsr_priv)
+{
+	int mtu_max;
+
+	if (hsr_priv->slave[0] && hsr_priv->slave[1])
+		mtu_max = min(hsr_priv->slave[0]->mtu, hsr_priv->slave[1]->mtu);
+	else if (hsr_priv->slave[0])
+		mtu_max = hsr_priv->slave[0]->mtu;
+	else if (hsr_priv->slave[1])
+		mtu_max = hsr_priv->slave[1]->mtu;
+	else
+		mtu_max = HSR_TAGLEN;
+
+	return mtu_max - HSR_TAGLEN;
+}
+
+static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct hsr_priv *hsr_priv;
+
+	hsr_priv = netdev_priv(dev);
+
+	if (new_mtu > hsr_get_max_mtu(hsr_priv)) {
+		netdev_info(hsr_priv->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n",
+			    HSR_TAGLEN);
+		return -EINVAL;
+	}
+
+	dev->mtu = new_mtu;
+
+	return 0;
+}
+
+static int hsr_dev_open(struct net_device *dev)
+{
+	struct hsr_priv *hsr_priv;
+	int i;
+	char *slave_name;
+
+	hsr_priv = netdev_priv(dev);
+
+	for (i = 0; i < HSR_MAX_SLAVE; i++) {
+		if (hsr_priv->slave[i])
+			slave_name = hsr_priv->slave[i]->name;
+		else
+			slave_name = "null";
+
+		if (!is_slave_up(hsr_priv->slave[i]))
+			netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a working HSR network\n",
+				    'A' + i, slave_name);
+	}
+
+	return 0;
+}
+
+static int hsr_dev_close(struct net_device *dev)
+{
+	/* Nothing to do here. We could try to restore the state of the slaves
+	 * to what they were before being changed by the hsr master dev's state,
+	 * but they might have been changed manually in the mean time too, so
+	 * taking them up or down here might be confusing and is probably not a
+	 * good idea.
+	 */
+	return 0;
+}
+
+
+static void hsr_fill_tag(struct hsr_ethhdr *hsr_ethhdr, struct hsr_priv *hsr_priv)
+{
+	unsigned long irqflags;
+
+	/* IEC 62439-1:2010, p 48, says the 4-bit "path" field can take values
+	 * between 0001-1001 ("ring identifier", for regular HSR frames),
+	 * or 1111 ("HSR management", supervision frames). Unfortunately, the
+	 * spec writers forgot to explain what a "ring identifier" is, or
+	 * how it is used. So we just set this to 0001 for regular frames,
+	 * and 1111 for supervision frames.
+	 */
+	set_hsr_tag_path(&hsr_ethhdr->hsr_tag, 0x1);
+
+	/* IEC 62439-1:2010, p 12: "The link service data unit in an Ethernet
+	 * frame is the content of the frame located between the Length/Type
+	 * field and the Frame Check Sequence."
+	 *
+	 * IEC 62439-3, p 48, specifies the "original LPDU" to include the
+	 * original "LT" field (what "LT" means is not explained anywhere as
+	 * far as I can see - perhaps "Length/Type"?). So LSDU_size might
+	 * equal original length + 2.
+	 *   Also, the fact that this field is not used anywhere (might be used
+	 * by a RedBox connecting HSR and PRP nets?) means I cannot test its
+	 * correctness. Instead of guessing, I set this to 0 here, to make any
+	 * problems immediately apparent. Anyone using this driver with PRP/HSR
+	 * RedBoxes might need to fix this...
+	 */
+	set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, 0);
+
+	spin_lock_irqsave(&hsr_priv->seqnr_lock, irqflags);
+	hsr_ethhdr->hsr_tag.sequence_nr = htons(hsr_priv->sequence_nr);
+	hsr_priv->sequence_nr++;
+	spin_unlock_irqrestore(&hsr_priv->seqnr_lock, irqflags);
+
+	hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto;
+
+	hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP);
+}
+
+static int slave_xmit(struct sk_buff *skb, struct hsr_priv *hsr_priv,
+		      enum hsr_dev_idx dev_idx)
+{
+	struct hsr_ethhdr *hsr_ethhdr;
+
+	hsr_ethhdr = (struct hsr_ethhdr *) skb->data;
+
+	skb->dev = hsr_priv->slave[dev_idx];
+
+	hsr_addr_subst_dest(hsr_priv, &hsr_ethhdr->ethhdr, dev_idx);
+
+	/* Address substitution (IEC62439-3 pp 26, 50): replace mac
+	 * address of outgoing frame with that of the outgoing slave's.
+	 */
+	memcpy(hsr_ethhdr->ethhdr.h_source, skb->dev->dev_addr, ETH_ALEN);
+
+	return dev_queue_xmit(skb);
+}
+
+
+static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct hsr_priv *hsr_priv;
+	struct hsr_ethhdr *hsr_ethhdr;
+	struct sk_buff *skb2;
+	int res1, res2;
+
+	hsr_priv = netdev_priv(dev);
+	hsr_ethhdr = (struct hsr_ethhdr *) skb->data;
+
+	if ((skb->protocol != htons(ETH_P_PRP)) ||
+	    (hsr_ethhdr->ethhdr.h_proto != htons(ETH_P_PRP))) {
+		hsr_fill_tag(hsr_ethhdr, hsr_priv);
+		skb->protocol = htons(ETH_P_PRP);
+	}
+
+	skb2 = pskb_copy(skb, GFP_ATOMIC);
+
+	res1 = NET_XMIT_DROP;
+	if (likely(hsr_priv->slave[HSR_DEV_SLAVE_A]))
+		res1 = slave_xmit(skb, hsr_priv, HSR_DEV_SLAVE_A);
+
+	res2 = NET_XMIT_DROP;
+	if (likely(skb2 && hsr_priv->slave[HSR_DEV_SLAVE_B]))
+		res2 = slave_xmit(skb2, hsr_priv, HSR_DEV_SLAVE_B);
+
+	if (likely(res1 == NET_XMIT_SUCCESS || res1 == NET_XMIT_CN ||
+		   res2 == NET_XMIT_SUCCESS || res2 == NET_XMIT_CN)) {
+		hsr_priv->dev->stats.tx_packets++;
+		hsr_priv->dev->stats.tx_bytes += skb->len;
+	} else {
+		hsr_priv->dev->stats.tx_dropped++;
+	}
+
+	return NETDEV_TX_OK;
+}
+
+
+static int hsr_header_create(struct sk_buff *skb, struct net_device *dev,
+			     unsigned short type, const void *daddr,
+			     const void *saddr, unsigned int len)
+{
+	int res;
+
+	/* Make room for the HSR tag now. We will fill it in later (in
+	 * hsr_dev_xmit)
+	 */
+	if (skb_headroom(skb) < HSR_TAGLEN + ETH_HLEN)
+		return -ENOBUFS;
+	skb_push(skb, HSR_TAGLEN);
+
+	/* To allow VLAN/HSR combos we should probably use
+	 * res = dev_hard_header(skb, dev, type, daddr, saddr, len + HSR_TAGLEN);
+	 * here instead. It would require other changes too, though - e.g.
+	 * separate headers for each slave etc...
+	 */
+	res = eth_header(skb, dev, type, daddr, saddr, len + HSR_TAGLEN);
+	if (res <= 0)
+		return res;
+	skb_reset_mac_header(skb);
+
+	return res + HSR_TAGLEN;
+}
+
+
+static const struct header_ops hsr_header_ops = {
+	.create	 = hsr_header_create,
+	.parse	 = eth_header_parse,
+};
+
+
+/* HSR:2010 supervision frames should be padded so that the whole frame,
+ * including headers and FCS, is 64 bytes (without VLAN).
+ */
+static int hsr_pad(int size)
+{
+	const int min_size = ETH_ZLEN - HSR_TAGLEN - ETH_HLEN;
+
+	if (size >= min_size)
+		return size;
+	return min_size;
+}
+
+static void send_hsr_supervision_frame(struct net_device *hsr_dev, u8 type)
+{
+	struct hsr_priv *hsr_priv;
+	struct sk_buff *skb;
+	int hlen, tlen;
+	struct hsr_sup_tag *hsr_stag;
+	struct hsr_sup_payload *hsr_sp;
+	unsigned long irqflags;
+
+	hlen = LL_RESERVED_SPACE(hsr_dev);
+	tlen = hsr_dev->needed_tailroom;
+	skb = alloc_skb(hsr_pad(sizeof(struct hsr_sup_payload)) + hlen + tlen,
+			GFP_ATOMIC);
+
+	if (skb == NULL)
+		return;
+
+	hsr_priv = netdev_priv(hsr_dev);
+
+	skb_reserve(skb, hlen);
+
+	skb->dev = hsr_dev;
+	skb->protocol = htons(ETH_P_PRP);
+	skb->priority = TC_PRIO_CONTROL;
+
+	if (dev_hard_header(skb, skb->dev, ETH_P_PRP,
+			    hsr_priv->sup_multicast_addr,
+			    skb->dev->dev_addr, skb->len) < 0)
+		goto out;
+
+	skb_pull(skb, sizeof(struct ethhdr));
+	hsr_stag = (typeof(hsr_stag)) skb->data;
+
+	set_hsr_stag_path(hsr_stag, 0xf);
+	set_hsr_stag_HSR_Ver(hsr_stag, 0);
+
+	spin_lock_irqsave(&hsr_priv->seqnr_lock, irqflags);
+	hsr_stag->sequence_nr = htons(hsr_priv->sequence_nr);
+	hsr_priv->sequence_nr++;
+	spin_unlock_irqrestore(&hsr_priv->seqnr_lock, irqflags);
+
+	hsr_stag->HSR_TLV_Type = type;
+	hsr_stag->HSR_TLV_Length = 12;
+
+	skb_push(skb, sizeof(struct ethhdr));
+
+	/* Payload: MacAddressA */
+	hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(*hsr_sp));
+	memcpy(hsr_sp->MacAddressA, hsr_dev->dev_addr, ETH_ALEN);
+
+	dev_queue_xmit(skb);
+	return;
+
+out:
+	kfree_skb(skb);
+}
+
+
+/* Announce (supervision frame) timer function
+ */
+static void hsr_announce(unsigned long data)
+{
+	struct hsr_priv *hsr_priv;
+
+	hsr_priv = (struct hsr_priv *) data;
+
+	if (hsr_priv->announce_count < 3) {
+		send_hsr_supervision_frame(hsr_priv->dev, HSR_TLV_ANNOUNCE);
+		hsr_priv->announce_count++;
+	} else {
+		send_hsr_supervision_frame(hsr_priv->dev, HSR_TLV_LIFE_CHECK);
+	}
+
+	if (hsr_priv->announce_count < 3)
+		hsr_priv->announce_timer.expires = jiffies +
+				msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+	else
+		hsr_priv->announce_timer.expires = jiffies +
+				msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+
+	if (is_admin_up(hsr_priv->dev))
+		add_timer(&hsr_priv->announce_timer);
+}
+
+
+static void restore_slaves(struct net_device *hsr_dev)
+{
+	struct hsr_priv *hsr_priv;
+	int i;
+	int res;
+
+	hsr_priv = netdev_priv(hsr_dev);
+
+	rtnl_lock();
+
+	/* Restore promiscuity */
+	for (i = 0; i < HSR_MAX_SLAVE; i++) {
+		if (!hsr_priv->slave[i])
+			continue;
+		res = dev_set_promiscuity(hsr_priv->slave[i], -1);
+		if (res)
+			netdev_info(hsr_dev,
+				    "Cannot restore slave promiscuity (%s, %d)\n",
+				    hsr_priv->slave[i]->name, res);
+	}
+
+	rtnl_unlock();
+}
+
+static void reclaim_hsr_dev(struct rcu_head *rh)
+{
+	struct hsr_priv *hsr_priv;
+
+	hsr_priv = container_of(rh, struct hsr_priv, rcu_head);
+	free_netdev(hsr_priv->dev);
+}
+
+
+/* According to comments in the declaration of struct net_device, this function
+ * is "Called from unregister, can be used to call free_netdev". Ok then...
+ */
+static void hsr_dev_destroy(struct net_device *hsr_dev)
+{
+	struct hsr_priv *hsr_priv;
+
+	hsr_priv = netdev_priv(hsr_dev);
+
+	del_timer(&hsr_priv->announce_timer);
+	unregister_hsr_master(hsr_priv);    /* calls list_del_rcu on hsr_priv */
+	restore_slaves(hsr_dev);
+	call_rcu(&hsr_priv->rcu_head, reclaim_hsr_dev);   /* reclaim hsr_priv */
+}
+
+static const struct net_device_ops hsr_device_ops = {
+	.ndo_change_mtu = hsr_dev_change_mtu,
+	.ndo_open = hsr_dev_open,
+	.ndo_stop = hsr_dev_close,
+	.ndo_start_xmit = hsr_dev_xmit,
+};
+
+
+void hsr_dev_setup(struct net_device *dev)
+{
+	random_ether_addr(dev->dev_addr);
+
+	ether_setup(dev);
+	dev->header_ops		 = &hsr_header_ops;
+	dev->netdev_ops		 = &hsr_device_ops;
+	dev->tx_queue_len	 = 0;
+
+	dev->destructor = hsr_dev_destroy;
+}
+
+
+/* Return true if dev is a HSR master; return false otherwise.
+ */
+bool is_hsr_master(struct net_device *dev)
+{
+	return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit);
+}
+
+static int check_slave_ok(struct net_device *dev)
+{
+	/* Don't allow HSR on non-ethernet like devices */
+	if ((dev->flags & IFF_LOOPBACK) || (dev->type != ARPHRD_ETHER) ||
+	    (dev->addr_len != ETH_ALEN)) {
+		netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n");
+		return -EINVAL;
+	}
+
+	/* Don't allow enslaving hsr devices */
+	if (is_hsr_master(dev)) {
+		netdev_info(dev, "Cannot create trees of HSR devices.\n");
+		return -EINVAL;
+	}
+
+	if (is_hsr_slave(dev)) {
+		netdev_info(dev, "This device is already a HSR slave.\n");
+		return -EINVAL;
+	}
+
+	if (dev->priv_flags & IFF_802_1Q_VLAN) {
+		netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
+		return -EINVAL;
+	}
+
+	/* HSR over bonded devices has not been tested, but I'm not sure it
+	 * won't work...
+	 */
+
+	return 0;
+}
+
+
+/* Default multicast address for HSR Supervision frames */
+static const unsigned char def_multicast_addr[ETH_ALEN] = {
+	0x01, 0x15, 0x4e, 0x00, 0x01, 0x00
+};
+
+int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
+		     unsigned char multicast_spec)
+{
+	struct hsr_priv *hsr_priv;
+	int i;
+	int res;
+
+	hsr_priv = netdev_priv(hsr_dev);
+	hsr_priv->dev = hsr_dev;
+	INIT_LIST_HEAD(&hsr_priv->node_db);
+	INIT_LIST_HEAD(&hsr_priv->self_node_db);
+	for (i = 0; i < HSR_MAX_SLAVE; i++)
+		hsr_priv->slave[i] = slave[i];
+
+	spin_lock_init(&hsr_priv->seqnr_lock);
+	/* Overflow soon to find bugs easier: */
+	hsr_priv->sequence_nr = USHRT_MAX - 1024;
+
+	init_timer(&hsr_priv->announce_timer);
+	hsr_priv->announce_timer.function = hsr_announce;
+	hsr_priv->announce_timer.data = (unsigned long) hsr_priv;
+
+	memcpy(hsr_priv->sup_multicast_addr, def_multicast_addr, ETH_ALEN);
+	hsr_priv->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec;
+
+/* FIXME: should I modify the value of these?
+ *
+ * - hsr_dev->flags - i.e.
+ *			IFF_MASTER/SLAVE?
+ * - hsr_dev->priv_flags - i.e.
+ *			IFF_EBRIDGE?
+ *			IFF_TX_SKB_SHARING?
+ *			IFF_HSR_MASTER/SLAVE?
+ */
+
+	for (i = 0; i < HSR_MAX_SLAVE; i++) {
+		res = check_slave_ok(slave[i]);
+		if (res)
+			return res;
+	}
+
+	hsr_dev->features = slave[0]->features & slave[1]->features;
+	/* Prevent recursive tx locking */
+	hsr_dev->features |= NETIF_F_LLTX;
+	/* VLAN on top of HSR needs testing and probably some work on
+	 * hsr_header_create() etc.
+	 */
+	hsr_dev->features |= NETIF_F_VLAN_CHALLENGED;
+
+	/* Set hsr_dev's MAC address to that of mac_slave1 */
+	memcpy(hsr_dev->dev_addr, hsr_priv->slave[0]->dev_addr, ETH_ALEN);
+
+	/* Set required header length */
+	for (i = 0; i < HSR_MAX_SLAVE; i++) {
+		if (slave[i]->hard_header_len + HSR_TAGLEN >
+						hsr_dev->hard_header_len)
+			hsr_dev->hard_header_len =
+					slave[i]->hard_header_len + HSR_TAGLEN;
+	}
+
+	/* MTU */
+	for (i = 0; i < HSR_MAX_SLAVE; i++)
+		if (slave[i]->mtu - HSR_TAGLEN < hsr_dev->mtu)
+			hsr_dev->mtu = slave[i]->mtu - HSR_TAGLEN;
+
+	/* Make sure the 1st call to netif_carrier_on() gets through */
+	netif_carrier_off(hsr_dev);
+
+	/* Promiscuity */
+	for (i = 0; i < HSR_MAX_SLAVE; i++) {
+		res = dev_set_promiscuity(slave[i], 1);
+		if (res) {
+			netdev_info(hsr_dev, "Cannot set slave promiscuity (%s, %d)\n",
+				    slave[i]->name, res);
+			goto fail;
+		}
+	}
+
+	/* Make sure we recognize frames from ourselves in hsr_rcv() */
+	res = hsr_create_self_node(&hsr_priv->self_node_db,
+					hsr_dev->dev_addr,
+					hsr_priv->slave[1]->dev_addr);
+	if (res < 0)
+		goto fail;
+
+	res = register_netdevice(hsr_dev);
+	if (res)
+		goto fail;
+
+	register_hsr_master(hsr_priv);
+
+	return 0;
+
+fail:
+	restore_slaves(hsr_dev);
+	return res;
+}
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
new file mode 100644
index 000000000000..2c7148e73914
--- /dev/null
+++ b/net/hsr/hsr_device.h
@@ -0,0 +1,29 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ *	2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ */
+
+#ifndef __HSR_DEVICE_H
+#define __HSR_DEVICE_H
+
+#include <linux/netdevice.h>
+#include "hsr_main.h"
+
+void hsr_dev_setup(struct net_device *dev);
+int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
+		     unsigned char multicast_spec);
+void hsr_set_operstate(struct net_device *hsr_dev, struct net_device *slave1,
+		       struct net_device *slave2);
+void hsr_set_carrier(struct net_device *hsr_dev, struct net_device *slave1,
+		     struct net_device *slave2);
+void hsr_check_announce(struct net_device *hsr_dev, int old_operstate);
+bool is_hsr_master(struct net_device *dev);
+int hsr_get_max_mtu(struct hsr_priv *hsr_priv);
+
+#endif /* __HSR_DEVICE_H */
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
new file mode 100644
index 000000000000..003f5bb3acd2
--- /dev/null
+++ b/net/hsr/hsr_framereg.c
@@ -0,0 +1,503 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ *	2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ *
+ * The HSR spec says never to forward the same frame twice on the same
+ * interface. A frame is identified by its source MAC address and its HSR
+ * sequence number. This code keeps track of senders and their sequence numbers
+ * to allow filtering of duplicate frames, and to detect HSR ring errors.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+#include "hsr_netlink.h"
+
+
+struct node_entry {
+	struct list_head mac_list;
+	unsigned char	MacAddressA[ETH_ALEN];
+	unsigned char	MacAddressB[ETH_ALEN];
+	enum hsr_dev_idx   AddrB_if;	/* The local slave through which AddrB
+					 * frames are received from this node
+					 */
+	unsigned long	time_in[HSR_MAX_SLAVE];
+	bool		time_in_stale[HSR_MAX_SLAVE];
+	u16		seq_out[HSR_MAX_DEV];
+	struct rcu_head rcu_head;
+};
+
+/*	TODO: use hash lists for mac addresses (linux/jhash.h)?    */
+
+
+
+/* Search for mac entry. Caller must hold rcu read lock.
+ */
+static struct node_entry *find_node_by_AddrA(struct list_head *node_db,
+					     const unsigned char addr[ETH_ALEN])
+{
+	struct node_entry *node;
+
+	list_for_each_entry_rcu(node, node_db, mac_list) {
+		if (ether_addr_equal(node->MacAddressA, addr))
+			return node;
+	}
+
+	return NULL;
+}
+
+
+/* Search for mac entry. Caller must hold rcu read lock.
+ */
+static struct node_entry *find_node_by_AddrB(struct list_head *node_db,
+					     const unsigned char addr[ETH_ALEN])
+{
+	struct node_entry *node;
+
+	list_for_each_entry_rcu(node, node_db, mac_list) {
+		if (ether_addr_equal(node->MacAddressB, addr))
+			return node;
+	}
+
+	return NULL;
+}
+
+
+/* Search for mac entry. Caller must hold rcu read lock.
+ */
+struct node_entry *hsr_find_node(struct list_head *node_db, struct sk_buff *skb)
+{
+	struct node_entry *node;
+	struct ethhdr *ethhdr;
+
+	if (!skb_mac_header_was_set(skb))
+		return NULL;
+
+	ethhdr = (struct ethhdr *) skb_mac_header(skb);
+
+	list_for_each_entry_rcu(node, node_db, mac_list) {
+		if (ether_addr_equal(node->MacAddressA, ethhdr->h_source))
+			return node;
+		if (ether_addr_equal(node->MacAddressB, ethhdr->h_source))
+			return node;
+	}
+
+	return NULL;
+}
+
+
+/* Helper for device init; the self_node_db is used in hsr_rcv() to recognize
+ * frames from self that's been looped over the HSR ring.
+ */
+int hsr_create_self_node(struct list_head *self_node_db,
+			 unsigned char addr_a[ETH_ALEN],
+			 unsigned char addr_b[ETH_ALEN])
+{
+	struct node_entry *node, *oldnode;
+
+	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	memcpy(node->MacAddressA, addr_a, ETH_ALEN);
+	memcpy(node->MacAddressB, addr_b, ETH_ALEN);
+
+	rcu_read_lock();
+	oldnode = list_first_or_null_rcu(self_node_db,
+						struct node_entry, mac_list);
+	if (oldnode) {
+		list_replace_rcu(&oldnode->mac_list, &node->mac_list);
+		rcu_read_unlock();
+		synchronize_rcu();
+		kfree(oldnode);
+	} else {
+		rcu_read_unlock();
+		list_add_tail_rcu(&node->mac_list, self_node_db);
+	}
+
+	return 0;
+}
+
+static void node_entry_reclaim(struct rcu_head *rh)
+{
+	kfree(container_of(rh, struct node_entry, rcu_head));
+}
+
+
+/* Add/merge node to the database of nodes. 'skb' must contain an HSR
+ * supervision frame.
+ * - If the supervision header's MacAddressA field is not yet in the database,
+ * this frame is from an hitherto unknown node - add it to the database.
+ * - If the sender's MAC address is not the same as its MacAddressA address,
+ * the node is using PICS_SUBS (address substitution). Record the sender's
+ * address as the node's MacAddressB.
+ *
+ * This function needs to work even if the sender node has changed one of its
+ * slaves' MAC addresses. In this case, there are four different cases described
+ * by (Addr-changed, received-from) pairs as follows. Note that changing the
+ * SlaveA address is equal to changing the node's own address:
+ *
+ * - (AddrB, SlaveB): The new AddrB will be recorded by PICS_SUBS code since
+ *		      node == NULL.
+ * - (AddrB, SlaveA): Will work as usual (the AddrB change won't be detected
+ *		      from this frame).
+ *
+ * - (AddrA, SlaveB): The old node will be found. We need to detect this and
+ *		      remove the node.
+ * - (AddrA, SlaveA): A new node will be registered (non-PICS_SUBS at first).
+ *		      The old one will be pruned after HSR_NODE_FORGET_TIME.
+ *
+ * We also need to detect if the sender's SlaveA and SlaveB cables have been
+ * swapped.
+ */
+struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv,
+				  struct node_entry *node,
+				  struct sk_buff *skb,
+				  enum hsr_dev_idx dev_idx)
+{
+	struct hsr_sup_payload *hsr_sp;
+	struct hsr_ethhdr_sp *hsr_ethsup;
+	int i;
+	unsigned long now;
+
+	hsr_ethsup = (struct hsr_ethhdr_sp *) skb_mac_header(skb);
+	hsr_sp = (struct hsr_sup_payload *) skb->data;
+
+	if (node && !ether_addr_equal(node->MacAddressA, hsr_sp->MacAddressA)) {
+		/* Node has changed its AddrA, frame was received from SlaveB */
+		list_del_rcu(&node->mac_list);
+		call_rcu(&node->rcu_head, node_entry_reclaim);
+		node = NULL;
+	}
+
+	if (node && (dev_idx == node->AddrB_if) &&
+	    !ether_addr_equal(node->MacAddressB, hsr_ethsup->ethhdr.h_source)) {
+		/* Cables have been swapped */
+		list_del_rcu(&node->mac_list);
+		call_rcu(&node->rcu_head, node_entry_reclaim);
+		node = NULL;
+	}
+
+	if (node && (dev_idx != node->AddrB_if) &&
+	    (node->AddrB_if != HSR_DEV_NONE) &&
+	    !ether_addr_equal(node->MacAddressA, hsr_ethsup->ethhdr.h_source)) {
+		/* Cables have been swapped */
+		list_del_rcu(&node->mac_list);
+		call_rcu(&node->rcu_head, node_entry_reclaim);
+		node = NULL;
+	}
+
+	if (node)
+		return node;
+
+	node = find_node_by_AddrA(&hsr_priv->node_db, hsr_sp->MacAddressA);
+	if (node) {
+		/* Node is known, but frame was received from an unknown
+		 * address. Node is PICS_SUBS capable; merge its AddrB.
+		 */
+		memcpy(node->MacAddressB, hsr_ethsup->ethhdr.h_source, ETH_ALEN);
+		node->AddrB_if = dev_idx;
+		return node;
+	}
+
+	node = kzalloc(sizeof(*node), GFP_ATOMIC);
+	if (!node)
+		return NULL;
+
+	memcpy(node->MacAddressA, hsr_sp->MacAddressA, ETH_ALEN);
+	memcpy(node->MacAddressB, hsr_ethsup->ethhdr.h_source, ETH_ALEN);
+	if (!ether_addr_equal(hsr_sp->MacAddressA, hsr_ethsup->ethhdr.h_source))
+		node->AddrB_if = dev_idx;
+	else
+		node->AddrB_if = HSR_DEV_NONE;
+
+	/* We are only interested in time diffs here, so use current jiffies
+	 * as initialization. (0 could trigger an spurious ring error warning).
+	 */
+	now = jiffies;
+	for (i = 0; i < HSR_MAX_SLAVE; i++)
+		node->time_in[i] = now;
+	for (i = 0; i < HSR_MAX_DEV; i++)
+		node->seq_out[i] = ntohs(hsr_ethsup->hsr_sup.sequence_nr) - 1;
+
+	list_add_tail_rcu(&node->mac_list, &hsr_priv->node_db);
+
+	return node;
+}
+
+
+/* 'skb' is a frame meant for this host, that is to be passed to upper layers.
+ *
+ * If the frame was sent by a node's B interface, replace the sender
+ * address with that node's "official" address (MacAddressA) so that upper
+ * layers recognize where it came from.
+ */
+void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb)
+{
+	struct ethhdr *ethhdr;
+	struct node_entry *node;
+
+	if (!skb_mac_header_was_set(skb)) {
+		WARN_ONCE(1, "%s: Mac header not set\n", __func__);
+		return;
+	}
+	ethhdr = (struct ethhdr *) skb_mac_header(skb);
+
+	rcu_read_lock();
+	node = find_node_by_AddrB(&hsr_priv->node_db, ethhdr->h_source);
+	if (node)
+		memcpy(ethhdr->h_source, node->MacAddressA, ETH_ALEN);
+	rcu_read_unlock();
+}
+
+
+/* 'skb' is a frame meant for another host.
+ * 'hsr_dev_idx' is the HSR index of the outgoing device
+ *
+ * Substitute the target (dest) MAC address if necessary, so the it matches the
+ * recipient interface MAC address, regardless of whether that is the
+ * recipient's A or B interface.
+ * This is needed to keep the packets flowing through switches that learn on
+ * which "side" the different interfaces are.
+ */
+void hsr_addr_subst_dest(struct hsr_priv *hsr_priv, struct ethhdr *ethhdr,
+			 enum hsr_dev_idx dev_idx)
+{
+	struct node_entry *node;
+
+	rcu_read_lock();
+	node = find_node_by_AddrA(&hsr_priv->node_db, ethhdr->h_dest);
+	if (node && (node->AddrB_if == dev_idx))
+		memcpy(ethhdr->h_dest, node->MacAddressB, ETH_ALEN);
+	rcu_read_unlock();
+}
+
+
+/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b,
+ * false otherwise.
+ */
+static bool seq_nr_after(u16 a, u16 b)
+{
+	/* Remove inconsistency where
+	 * seq_nr_after(a, b) == seq_nr_before(a, b) */
+	if ((int) b - a == 32768)
+		return false;
+
+	return (((s16) (b - a)) < 0);
+}
+#define seq_nr_before(a, b)		seq_nr_after((b), (a))
+#define seq_nr_after_or_eq(a, b)	(!seq_nr_before((a), (b)))
+#define seq_nr_before_or_eq(a, b)	(!seq_nr_after((a), (b)))
+
+
+void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx)
+{
+	if ((dev_idx < 0) || (dev_idx >= HSR_MAX_DEV)) {
+		WARN_ONCE(1, "%s: Invalid dev_idx (%d)\n", __func__, dev_idx);
+		return;
+	}
+	node->time_in[dev_idx] = jiffies;
+	node->time_in_stale[dev_idx] = false;
+}
+
+
+/* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid
+ * ethhdr->h_source address and skb->mac_header set.
+ *
+ * Return:
+ *	 1 if frame can be shown to have been sent recently on this interface,
+ *	 0 otherwise, or
+ *	 negative error code on error
+ */
+int hsr_register_frame_out(struct node_entry *node, enum hsr_dev_idx dev_idx,
+			   struct sk_buff *skb)
+{
+	struct hsr_ethhdr *hsr_ethhdr;
+	u16 sequence_nr;
+
+	if ((dev_idx < 0) || (dev_idx >= HSR_MAX_DEV)) {
+		WARN_ONCE(1, "%s: Invalid dev_idx (%d)\n", __func__, dev_idx);
+		return -EINVAL;
+	}
+	if (!skb_mac_header_was_set(skb)) {
+		WARN_ONCE(1, "%s: Mac header not set\n", __func__);
+		return -EINVAL;
+	}
+	hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb);
+
+	sequence_nr = ntohs(hsr_ethhdr->hsr_tag.sequence_nr);
+	if (seq_nr_before_or_eq(sequence_nr, node->seq_out[dev_idx]))
+		return 1;
+
+	node->seq_out[dev_idx] = sequence_nr;
+	return 0;
+}
+
+
+
+static bool is_late(struct node_entry *node, enum hsr_dev_idx dev_idx)
+{
+	enum hsr_dev_idx other;
+
+	if (node->time_in_stale[dev_idx])
+		return true;
+
+	if (dev_idx == HSR_DEV_SLAVE_A)
+		other = HSR_DEV_SLAVE_B;
+	else
+		other = HSR_DEV_SLAVE_A;
+
+	if (node->time_in_stale[other])
+		return false;
+
+	if (time_after(node->time_in[other], node->time_in[dev_idx] +
+		       msecs_to_jiffies(MAX_SLAVE_DIFF)))
+		return true;
+
+	return false;
+}
+
+
+/* Remove stale sequence_nr records. Called by timer every
+ * HSR_LIFE_CHECK_INTERVAL (two seconds or so).
+ */
+void hsr_prune_nodes(struct hsr_priv *hsr_priv)
+{
+	struct node_entry *node;
+	unsigned long timestamp;
+	unsigned long time_a, time_b;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(node, &hsr_priv->node_db, mac_list) {
+		/* Shorthand */
+		time_a = node->time_in[HSR_DEV_SLAVE_A];
+		time_b = node->time_in[HSR_DEV_SLAVE_B];
+
+		/* Check for timestamps old enough to risk wrap-around */
+		if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET/2))
+			node->time_in_stale[HSR_DEV_SLAVE_A] = true;
+		if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET/2))
+			node->time_in_stale[HSR_DEV_SLAVE_B] = true;
+
+		/* Get age of newest frame from node.
+		 * At least one time_in is OK here; nodes get pruned long
+		 * before both time_ins can get stale
+		 */
+		timestamp = time_a;
+		if (node->time_in_stale[HSR_DEV_SLAVE_A] ||
+		    (!node->time_in_stale[HSR_DEV_SLAVE_B] &&
+		    time_after(time_b, time_a)))
+			timestamp = time_b;
+
+		/* Warn of ring error only as long as we get frames at all */
+		if (time_is_after_jiffies(timestamp +
+					msecs_to_jiffies(1.5*MAX_SLAVE_DIFF))) {
+
+			if (is_late(node, HSR_DEV_SLAVE_A))
+				hsr_nl_ringerror(hsr_priv, node->MacAddressA,
+						 HSR_DEV_SLAVE_A);
+			else if (is_late(node, HSR_DEV_SLAVE_B))
+				hsr_nl_ringerror(hsr_priv, node->MacAddressA,
+						 HSR_DEV_SLAVE_B);
+		}
+
+		/* Prune old entries */
+		if (time_is_before_jiffies(timestamp +
+					msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
+			hsr_nl_nodedown(hsr_priv, node->MacAddressA);
+			list_del_rcu(&node->mac_list);
+			/* Note that we need to free this entry later: */
+			call_rcu(&node->rcu_head, node_entry_reclaim);
+		}
+	}
+	rcu_read_unlock();
+}
+
+
+void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos,
+			unsigned char addr[ETH_ALEN])
+{
+	struct node_entry *node;
+
+	if (!_pos) {
+		node = list_first_or_null_rcu(&hsr_priv->node_db,
+						struct node_entry, mac_list);
+		if (node)
+			memcpy(addr, node->MacAddressA, ETH_ALEN);
+		return node;
+	}
+
+	node = _pos;
+	list_for_each_entry_continue_rcu(node, &hsr_priv->node_db, mac_list) {
+		memcpy(addr, node->MacAddressA, ETH_ALEN);
+		return node;
+	}
+
+	return NULL;
+}
+
+
+int hsr_get_node_data(struct hsr_priv *hsr_priv,
+		      const unsigned char *addr,
+		      unsigned char addr_b[ETH_ALEN],
+		      unsigned int *addr_b_ifindex,
+		      int *if1_age,
+		      u16 *if1_seq,
+		      int *if2_age,
+		      u16 *if2_seq)
+{
+	struct node_entry *node;
+	unsigned long tdiff;
+
+
+	rcu_read_lock();
+	node = find_node_by_AddrA(&hsr_priv->node_db, addr);
+	if (!node) {
+		rcu_read_unlock();
+		return -ENOENT;	/* No such entry */
+	}
+
+	memcpy(addr_b, node->MacAddressB, ETH_ALEN);
+
+	tdiff = jiffies - node->time_in[HSR_DEV_SLAVE_A];
+	if (node->time_in_stale[HSR_DEV_SLAVE_A])
+		*if1_age = INT_MAX;
+#if HZ <= MSEC_PER_SEC
+	else if (tdiff > msecs_to_jiffies(INT_MAX))
+		*if1_age = INT_MAX;
+#endif
+	else
+		*if1_age = jiffies_to_msecs(tdiff);
+
+	tdiff = jiffies - node->time_in[HSR_DEV_SLAVE_B];
+	if (node->time_in_stale[HSR_DEV_SLAVE_B])
+		*if2_age = INT_MAX;
+#if HZ <= MSEC_PER_SEC
+	else if (tdiff > msecs_to_jiffies(INT_MAX))
+		*if2_age = INT_MAX;
+#endif
+	else
+		*if2_age = jiffies_to_msecs(tdiff);
+
+	/* Present sequence numbers as if they were incoming on interface */
+	*if1_seq = node->seq_out[HSR_DEV_SLAVE_B];
+	*if2_seq = node->seq_out[HSR_DEV_SLAVE_A];
+
+	if ((node->AddrB_if != HSR_DEV_NONE) && hsr_priv->slave[node->AddrB_if])
+		*addr_b_ifindex = hsr_priv->slave[node->AddrB_if]->ifindex;
+	else
+		*addr_b_ifindex = -1;
+
+	rcu_read_unlock();
+
+	return 0;
+}
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
new file mode 100644
index 000000000000..e6c4022030ad
--- /dev/null
+++ b/net/hsr/hsr_framereg.h
@@ -0,0 +1,53 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ *	2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ */
+
+#ifndef _HSR_FRAMEREG_H
+#define _HSR_FRAMEREG_H
+
+#include "hsr_main.h"
+
+struct node_entry;
+
+struct node_entry *hsr_find_node(struct list_head *node_db, struct sk_buff *skb);
+
+struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv,
+				  struct node_entry *node,
+				  struct sk_buff *skb,
+				  enum hsr_dev_idx dev_idx);
+
+void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb);
+void hsr_addr_subst_dest(struct hsr_priv *hsr_priv, struct ethhdr *ethhdr,
+			 enum hsr_dev_idx dev_idx);
+
+void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx);
+
+int hsr_register_frame_out(struct node_entry *node, enum hsr_dev_idx dev_idx,
+			   struct sk_buff *skb);
+
+void hsr_prune_nodes(struct hsr_priv *hsr_priv);
+
+int hsr_create_self_node(struct list_head *self_node_db,
+			 unsigned char addr_a[ETH_ALEN],
+			 unsigned char addr_b[ETH_ALEN]);
+
+void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos,
+			unsigned char addr[ETH_ALEN]);
+
+int hsr_get_node_data(struct hsr_priv *hsr_priv,
+		      const unsigned char *addr,
+		      unsigned char addr_b[ETH_ALEN],
+		      unsigned int *addr_b_ifindex,
+		      int *if1_age,
+		      u16 *if1_seq,
+		      int *if2_age,
+		      u16 *if2_seq);
+
+#endif /* _HSR_FRAMEREG_H */
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
new file mode 100644
index 000000000000..af68dd83a4e3
--- /dev/null
+++ b/net/hsr/hsr_main.c
@@ -0,0 +1,469 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ *	2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ *
+ * In addition to routines for registering and unregistering HSR support, this
+ * file also contains the receive routine that handles all incoming frames with
+ * Ethertype (protocol) ETH_P_PRP (HSRv0), and network device event handling.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/timer.h>
+#include <linux/etherdevice.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_netlink.h"
+#include "hsr_framereg.h"
+
+
+/* List of all registered virtual HSR devices */
+static LIST_HEAD(hsr_list);
+
+void register_hsr_master(struct hsr_priv *hsr_priv)
+{
+	list_add_tail_rcu(&hsr_priv->hsr_list, &hsr_list);
+}
+
+void unregister_hsr_master(struct hsr_priv *hsr_priv)
+{
+	struct hsr_priv *hsr_priv_it;
+
+	list_for_each_entry(hsr_priv_it, &hsr_list, hsr_list)
+		if (hsr_priv_it == hsr_priv) {
+			list_del_rcu(&hsr_priv_it->hsr_list);
+			return;
+		}
+}
+
+bool is_hsr_slave(struct net_device *dev)
+{
+	struct hsr_priv *hsr_priv_it;
+
+	list_for_each_entry_rcu(hsr_priv_it, &hsr_list, hsr_list) {
+		if (dev == hsr_priv_it->slave[0])
+			return true;
+		if (dev == hsr_priv_it->slave[1])
+			return true;
+	}
+
+	return false;
+}
+
+
+/* If dev is a HSR slave device, return the virtual master device. Return NULL
+ * otherwise.
+ */
+static struct hsr_priv *get_hsr_master(struct net_device *dev)
+{
+	struct hsr_priv *hsr_priv;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(hsr_priv, &hsr_list, hsr_list)
+		if ((dev == hsr_priv->slave[0]) ||
+		    (dev == hsr_priv->slave[1])) {
+			rcu_read_unlock();
+			return hsr_priv;
+		}
+
+	rcu_read_unlock();
+	return NULL;
+}
+
+
+/* If dev is a HSR slave device, return the other slave device. Return NULL
+ * otherwise.
+ */
+static struct net_device *get_other_slave(struct hsr_priv *hsr_priv,
+					  struct net_device *dev)
+{
+	if (dev == hsr_priv->slave[0])
+		return hsr_priv->slave[1];
+	if (dev == hsr_priv->slave[1])
+		return hsr_priv->slave[0];
+
+	return NULL;
+}
+
+
+static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
+			     void *ptr)
+{
+	struct net_device *slave, *other_slave;
+	struct hsr_priv *hsr_priv;
+	int old_operstate;
+	int mtu_max;
+	int res;
+	struct net_device *dev;
+
+	dev = netdev_notifier_info_to_dev(ptr);
+
+	hsr_priv = get_hsr_master(dev);
+	if (hsr_priv) {
+		/* dev is a slave device */
+		slave = dev;
+		other_slave = get_other_slave(hsr_priv, slave);
+	} else {
+		if (!is_hsr_master(dev))
+			return NOTIFY_DONE;
+		hsr_priv = netdev_priv(dev);
+		slave = hsr_priv->slave[0];
+		other_slave = hsr_priv->slave[1];
+	}
+
+	switch (event) {
+	case NETDEV_UP:		/* Administrative state DOWN */
+	case NETDEV_DOWN:	/* Administrative state UP */
+	case NETDEV_CHANGE:	/* Link (carrier) state changes */
+		old_operstate = hsr_priv->dev->operstate;
+		hsr_set_carrier(hsr_priv->dev, slave, other_slave);
+		/* netif_stacked_transfer_operstate() cannot be used here since
+		 * it doesn't set IF_OPER_LOWERLAYERDOWN (?)
+		 */
+		hsr_set_operstate(hsr_priv->dev, slave, other_slave);
+		hsr_check_announce(hsr_priv->dev, old_operstate);
+		break;
+	case NETDEV_CHANGEADDR:
+
+		/* This should not happen since there's no ndo_set_mac_address()
+		 * for HSR devices - i.e. not supported.
+		 */
+		if (dev == hsr_priv->dev)
+			break;
+
+		if (dev == hsr_priv->slave[0])
+			memcpy(hsr_priv->dev->dev_addr,
+			       hsr_priv->slave[0]->dev_addr, ETH_ALEN);
+
+		/* Make sure we recognize frames from ourselves in hsr_rcv() */
+		res = hsr_create_self_node(&hsr_priv->self_node_db,
+					   hsr_priv->dev->dev_addr,
+					   hsr_priv->slave[1] ?
+						hsr_priv->slave[1]->dev_addr :
+						hsr_priv->dev->dev_addr);
+		if (res)
+			netdev_warn(hsr_priv->dev,
+				    "Could not update HSR node address.\n");
+
+		if (dev == hsr_priv->slave[0])
+			call_netdevice_notifiers(NETDEV_CHANGEADDR, hsr_priv->dev);
+		break;
+	case NETDEV_CHANGEMTU:
+		if (dev == hsr_priv->dev)
+			break; /* Handled in ndo_change_mtu() */
+		mtu_max = hsr_get_max_mtu(hsr_priv);
+		if (hsr_priv->dev->mtu > mtu_max)
+			dev_set_mtu(hsr_priv->dev, mtu_max);
+		break;
+	case NETDEV_UNREGISTER:
+		if (dev == hsr_priv->slave[0])
+			hsr_priv->slave[0] = NULL;
+		if (dev == hsr_priv->slave[1])
+			hsr_priv->slave[1] = NULL;
+
+		/* There should really be a way to set a new slave device... */
+
+		break;
+	case NETDEV_PRE_TYPE_CHANGE:
+		/* HSR works only on Ethernet devices. Refuse slave to change
+		 * its type.
+		 */
+		return NOTIFY_BAD;
+	}
+
+	return NOTIFY_DONE;
+}
+
+
+static struct timer_list prune_timer;
+
+static void prune_nodes_all(unsigned long data)
+{
+	struct hsr_priv *hsr_priv;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(hsr_priv, &hsr_list, hsr_list)
+		hsr_prune_nodes(hsr_priv);
+	rcu_read_unlock();
+
+	prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD);
+	add_timer(&prune_timer);
+}
+
+
+static struct sk_buff *hsr_pull_tag(struct sk_buff *skb)
+{
+	struct hsr_tag *hsr_tag;
+	struct sk_buff *skb2;
+
+	skb2 = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb2))
+		goto err_free;
+	skb = skb2;
+
+	if (unlikely(!pskb_may_pull(skb, HSR_TAGLEN)))
+		goto err_free;
+
+	hsr_tag = (struct hsr_tag *) skb->data;
+	skb->protocol = hsr_tag->encap_proto;
+	skb_pull(skb, HSR_TAGLEN);
+
+	return skb;
+
+err_free:
+	kfree_skb(skb);
+	return NULL;
+}
+
+
+/* The uses I can see for these HSR supervision frames are:
+ * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type =
+ *    22") to reset any sequence_nr counters belonging to that node. Useful if
+ *    the other node's counter has been reset for some reason.
+ *    --
+ *    Or not - resetting the counter and bridging the frame would create a
+ *    loop, unfortunately.
+ *
+ * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck
+ *    frame is received from a particular node, we know something is wrong.
+ *    We just register these (as with normal frames) and throw them away.
+ *
+ * 3) Allow different MAC addresses for the two slave interfaces, using the
+ *    MacAddressA field.
+ */
+static bool is_supervision_frame(struct hsr_priv *hsr_priv, struct sk_buff *skb)
+{
+	struct hsr_sup_tag *hsr_stag;
+
+	if (!ether_addr_equal(eth_hdr(skb)->h_dest,
+			      hsr_priv->sup_multicast_addr))
+		return false;
+
+	hsr_stag = (struct hsr_sup_tag *) skb->data;
+	if (get_hsr_stag_path(hsr_stag) != 0x0f)
+		return false;
+	if ((hsr_stag->HSR_TLV_Type != HSR_TLV_ANNOUNCE) &&
+	    (hsr_stag->HSR_TLV_Type != HSR_TLV_LIFE_CHECK))
+		return false;
+	if (hsr_stag->HSR_TLV_Length != 12)
+		return false;
+
+	return true;
+}
+
+
+/* Implementation somewhat according to IEC-62439-3, p. 43
+ */
+static int hsr_rcv(struct sk_buff *skb, struct net_device *dev,
+		   struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct hsr_priv *hsr_priv;
+	struct net_device *other_slave;
+	struct node_entry *node;
+	bool deliver_to_self;
+	struct sk_buff *skb_deliver;
+	enum hsr_dev_idx dev_in_idx, dev_other_idx;
+	bool dup_out;
+	int ret;
+
+	hsr_priv = get_hsr_master(dev);
+
+	if (!hsr_priv) {
+		/* Non-HSR-slave device 'dev' is connected to a HSR network */
+		kfree_skb(skb);
+		dev->stats.rx_errors++;
+		return NET_RX_SUCCESS;
+	}
+
+	if (dev == hsr_priv->slave[0]) {
+		dev_in_idx = HSR_DEV_SLAVE_A;
+		dev_other_idx = HSR_DEV_SLAVE_B;
+	} else {
+		dev_in_idx = HSR_DEV_SLAVE_B;
+		dev_other_idx = HSR_DEV_SLAVE_A;
+	}
+
+	node = hsr_find_node(&hsr_priv->self_node_db, skb);
+	if (node) {
+		/* Always kill frames sent by ourselves */
+		kfree_skb(skb);
+		return NET_RX_SUCCESS;
+	}
+
+	/* Is this frame a candidate for local reception? */
+	deliver_to_self = false;
+	if ((skb->pkt_type == PACKET_HOST) ||
+	    (skb->pkt_type == PACKET_MULTICAST) ||
+	    (skb->pkt_type == PACKET_BROADCAST))
+		deliver_to_self = true;
+	else if (ether_addr_equal(eth_hdr(skb)->h_dest,
+				     hsr_priv->dev->dev_addr)) {
+		skb->pkt_type = PACKET_HOST;
+		deliver_to_self = true;
+	}
+
+
+	rcu_read_lock(); /* node_db */
+	node = hsr_find_node(&hsr_priv->node_db, skb);
+
+	if (is_supervision_frame(hsr_priv, skb)) {
+		skb_pull(skb, sizeof(struct hsr_sup_tag));
+		node = hsr_merge_node(hsr_priv, node, skb, dev_in_idx);
+		if (!node) {
+			rcu_read_unlock(); /* node_db */
+			kfree_skb(skb);
+			hsr_priv->dev->stats.rx_dropped++;
+			return NET_RX_DROP;
+		}
+		skb_push(skb, sizeof(struct hsr_sup_tag));
+		deliver_to_self = false;
+	}
+
+	if (!node) {
+		/* Source node unknown; this might be a HSR frame from
+		 * another net (different multicast address). Ignore it.
+		 */
+		rcu_read_unlock(); /* node_db */
+		kfree_skb(skb);
+		return NET_RX_SUCCESS;
+	}
+
+	/* Register ALL incoming frames as outgoing through the other interface.
+	 * This allows us to register frames as incoming only if they are valid
+	 * for the receiving interface, without using a specific counter for
+	 * incoming frames.
+	 */
+	dup_out = hsr_register_frame_out(node, dev_other_idx, skb);
+	if (!dup_out)
+		hsr_register_frame_in(node, dev_in_idx);
+
+	/* Forward this frame? */
+	if (!dup_out && (skb->pkt_type != PACKET_HOST))
+		other_slave = get_other_slave(hsr_priv, dev);
+	else
+		other_slave = NULL;
+
+	if (hsr_register_frame_out(node, HSR_DEV_MASTER, skb))
+		deliver_to_self = false;
+
+	rcu_read_unlock(); /* node_db */
+
+	if (!deliver_to_self && !other_slave) {
+		kfree_skb(skb);
+		/* Circulated frame; silently remove it. */
+		return NET_RX_SUCCESS;
+	}
+
+	skb_deliver = skb;
+	if (deliver_to_self && other_slave) {
+		/* skb_clone() is not enough since we will strip the hsr tag
+		 * and do address substitution below
+		 */
+		skb_deliver = pskb_copy(skb, GFP_ATOMIC);
+		if (!skb_deliver) {
+			deliver_to_self = false;
+			hsr_priv->dev->stats.rx_dropped++;
+		}
+	}
+
+	if (deliver_to_self) {
+		bool multicast_frame;
+
+		skb_deliver = hsr_pull_tag(skb_deliver);
+		if (!skb_deliver) {
+			hsr_priv->dev->stats.rx_dropped++;
+			goto forward;
+		}
+#if !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+		/* Move everything in the header that is after the HSR tag,
+		 * to work around alignment problems caused by the 6-byte HSR
+		 * tag. In practice, this removes/overwrites the HSR tag in
+		 * the header and restores a "standard" packet.
+		 */
+		memmove(skb_deliver->data - HSR_TAGLEN, skb_deliver->data,
+			skb_headlen(skb_deliver));
+
+		/* Adjust skb members so they correspond with the move above.
+		 * This cannot possibly underflow skb->data since hsr_pull_tag()
+		 * above succeeded.
+		 * At this point in the protocol stack, the transport and
+		 * network headers have not been set yet, and we haven't touched
+		 * the mac header nor the head. So we only need to adjust data
+		 * and tail:
+		 */
+		skb_deliver->data -= HSR_TAGLEN;
+		skb_deliver->tail -= HSR_TAGLEN;
+#endif
+		skb_deliver->dev = hsr_priv->dev;
+		hsr_addr_subst_source(hsr_priv, skb_deliver);
+		multicast_frame = (skb_deliver->pkt_type == PACKET_MULTICAST);
+		ret = netif_rx(skb_deliver);
+		if (ret == NET_RX_DROP) {
+			hsr_priv->dev->stats.rx_dropped++;
+		} else {
+			hsr_priv->dev->stats.rx_packets++;
+			hsr_priv->dev->stats.rx_bytes += skb->len;
+			if (multicast_frame)
+				hsr_priv->dev->stats.multicast++;
+		}
+	}
+
+forward:
+	if (other_slave) {
+		skb_push(skb, ETH_HLEN);
+		skb->dev = other_slave;
+		dev_queue_xmit(skb);
+	}
+
+	return NET_RX_SUCCESS;
+}
+
+
+static struct packet_type hsr_pt __read_mostly = {
+	.type = htons(ETH_P_PRP),
+	.func = hsr_rcv,
+};
+
+static struct notifier_block hsr_nb = {
+	.notifier_call = hsr_netdev_notify,	/* Slave event notifications */
+};
+
+
+static int __init hsr_init(void)
+{
+	int res;
+
+	BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_TAGLEN);
+
+	dev_add_pack(&hsr_pt);
+
+	init_timer(&prune_timer);
+	prune_timer.function = prune_nodes_all;
+	prune_timer.data = 0;
+	prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD);
+	add_timer(&prune_timer);
+
+	register_netdevice_notifier(&hsr_nb);
+
+	res = hsr_netlink_init();
+
+	return res;
+}
+
+static void __exit hsr_exit(void)
+{
+	unregister_netdevice_notifier(&hsr_nb);
+	del_timer(&prune_timer);
+	hsr_netlink_exit();
+	dev_remove_pack(&hsr_pt);
+}
+
+module_init(hsr_init);
+module_exit(hsr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
new file mode 100644
index 000000000000..56fe060c0ab1
--- /dev/null
+++ b/net/hsr/hsr_main.h
@@ -0,0 +1,166 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ *	2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ */
+
+#ifndef _HSR_PRIVATE_H
+#define _HSR_PRIVATE_H
+
+#include <linux/netdevice.h>
+#include <linux/list.h>
+
+
+/* Time constants as specified in the HSR specification (IEC-62439-3 2010)
+ * Table 8.
+ * All values in milliseconds.
+ */
+#define HSR_LIFE_CHECK_INTERVAL		 2000 /* ms */
+#define HSR_NODE_FORGET_TIME		60000 /* ms */
+#define HSR_ANNOUNCE_INTERVAL		  100 /* ms */
+
+
+/* By how much may slave1 and slave2 timestamps of latest received frame from
+ * each node differ before we notify of communication problem?
+ */
+#define MAX_SLAVE_DIFF			 3000 /* ms */
+
+
+/* How often shall we check for broken ring and remove node entries older than
+ * HSR_NODE_FORGET_TIME?
+ */
+#define PRUNE_PERIOD			 3000 /* ms */
+
+
+#define HSR_TLV_ANNOUNCE		   22
+#define HSR_TLV_LIFE_CHECK		   23
+
+
+/* HSR Tag.
+ * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB,
+ * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest,
+ * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr,
+ * encapsulated protocol } instead.
+ */
+#define HSR_TAGLEN	6
+
+/* Field names below as defined in the IEC:2010 standard for HSR. */
+struct hsr_tag {
+	__be16		path_and_LSDU_size;
+	__be16		sequence_nr;
+	__be16		encap_proto;
+} __packed;
+
+
+/* The helper functions below assumes that 'path' occupies the 4 most
+ * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or
+ * equivalently, the 4 most significant bits of HSR tag byte 14).
+ *
+ * This is unclear in the IEC specification; its definition of MAC addresses
+ * indicates the spec is written with the least significant bit first (to the
+ * left). This, however, would mean that the LSDU field would be split in two
+ * with the path field in-between, which seems strange. I'm guessing the MAC
+ * address definition is in error.
+ */
+static inline u16 get_hsr_tag_path(struct hsr_tag *ht)
+{
+	return ntohs(ht->path_and_LSDU_size) >> 12;
+}
+
+static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht)
+{
+	return ntohs(ht->path_and_LSDU_size) & 0x0FFF;
+}
+
+static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path)
+{
+	ht->path_and_LSDU_size = htons(
+			(ntohs(ht->path_and_LSDU_size) & 0x0FFF) | (path << 12));
+}
+
+static inline void set_hsr_tag_LSDU_size(struct hsr_tag *ht, u16 LSDU_size)
+{
+	ht->path_and_LSDU_size = htons(
+			(ntohs(ht->path_and_LSDU_size) & 0xF000) |
+			(LSDU_size & 0x0FFF));
+}
+
+struct hsr_ethhdr {
+	struct ethhdr	ethhdr;
+	struct hsr_tag	hsr_tag;
+} __packed;
+
+
+/* HSR Supervision Frame data types.
+ * Field names as defined in the IEC:2010 standard for HSR.
+ */
+struct hsr_sup_tag {
+	__be16		path_and_HSR_Ver;
+	__be16		sequence_nr;
+	__u8		HSR_TLV_Type;
+	__u8		HSR_TLV_Length;
+} __packed;
+
+struct hsr_sup_payload {
+	unsigned char	MacAddressA[ETH_ALEN];
+} __packed;
+
+static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst)
+{
+	return get_hsr_tag_path((struct hsr_tag *) hst);
+}
+
+static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst)
+{
+	return get_hsr_tag_LSDU_size((struct hsr_tag *) hst);
+}
+
+static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path)
+{
+	set_hsr_tag_path((struct hsr_tag *) hst, path);
+}
+
+static inline void set_hsr_stag_HSR_Ver(struct hsr_sup_tag *hst, u16 HSR_Ver)
+{
+	set_hsr_tag_LSDU_size((struct hsr_tag *) hst, HSR_Ver);
+}
+
+struct hsr_ethhdr_sp {
+	struct ethhdr		ethhdr;
+	struct hsr_sup_tag	hsr_sup;
+} __packed;
+
+
+enum hsr_dev_idx {
+	HSR_DEV_NONE = -1,
+	HSR_DEV_SLAVE_A = 0,
+	HSR_DEV_SLAVE_B,
+	HSR_DEV_MASTER,
+};
+#define HSR_MAX_SLAVE	(HSR_DEV_SLAVE_B + 1)
+#define HSR_MAX_DEV	(HSR_DEV_MASTER + 1)
+
+struct hsr_priv {
+	struct list_head	hsr_list;	/* List of hsr devices */
+	struct rcu_head		rcu_head;
+	struct net_device	*dev;
+	struct net_device	*slave[HSR_MAX_SLAVE];
+	struct list_head	node_db;	/* Other HSR nodes */
+	struct list_head	self_node_db;	/* MACs of slaves */
+	struct timer_list	announce_timer;	/* Supervision frame dispatch */
+	int announce_count;
+	u16 sequence_nr;
+	spinlock_t seqnr_lock;			/* locking for sequence_nr */
+	unsigned char		sup_multicast_addr[ETH_ALEN];
+};
+
+void register_hsr_master(struct hsr_priv *hsr_priv);
+void unregister_hsr_master(struct hsr_priv *hsr_priv);
+bool is_hsr_slave(struct net_device *dev);
+
+#endif /*  _HSR_PRIVATE_H */
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
new file mode 100644
index 000000000000..4e66bf61f585
--- /dev/null
+++ b/net/hsr/hsr_netlink.c
@@ -0,0 +1,457 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ *	2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ *
+ * Routines for handling Netlink messages for HSR.
+ */
+
+#include "hsr_netlink.h"
+#include <linux/kernel.h>
+#include <net/rtnetlink.h>
+#include <net/genetlink.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_framereg.h"
+
+static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
+	[IFLA_HSR_SLAVE1]		= { .type = NLA_U32 },
+	[IFLA_HSR_SLAVE2]		= { .type = NLA_U32 },
+	[IFLA_HSR_MULTICAST_SPEC]	= { .type = NLA_U8 },
+};
+
+
+/* Here, it seems a netdevice has already been allocated for us, and the
+ * hsr_dev_setup routine has been executed. Nice!
+ */
+static int hsr_newlink(struct net *src_net, struct net_device *dev,
+		       struct nlattr *tb[], struct nlattr *data[])
+{
+	struct net_device *link[2];
+	unsigned char multicast_spec;
+
+	if (!data[IFLA_HSR_SLAVE1]) {
+		netdev_info(dev, "IFLA_HSR_SLAVE1 missing!\n");
+		return -EINVAL;
+	}
+	link[0] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE1]));
+	if (!data[IFLA_HSR_SLAVE2]) {
+		netdev_info(dev, "IFLA_HSR_SLAVE2 missing!\n");
+		return -EINVAL;
+	}
+	link[1] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE2]));
+
+	if (!link[0] || !link[1])
+		return -ENODEV;
+	if (link[0] == link[1])
+		return -EINVAL;
+
+	if (!data[IFLA_HSR_MULTICAST_SPEC])
+		multicast_spec = 0;
+	else
+		multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]);
+
+	return hsr_dev_finalize(dev, link, multicast_spec);
+}
+
+static struct rtnl_link_ops hsr_link_ops __read_mostly = {
+	.kind		= "hsr",
+	.maxtype	= IFLA_HSR_MAX,
+	.policy		= hsr_policy,
+	.priv_size	= sizeof(struct hsr_priv),
+	.setup		= hsr_dev_setup,
+	.newlink	= hsr_newlink,
+};
+
+
+
+/* attribute policy */
+/* NLA_BINARY missing in libnl; use NLA_UNSPEC in userspace instead. */
+static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
+	[HSR_A_NODE_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN },
+	[HSR_A_NODE_ADDR_B] = { .type = NLA_BINARY, .len = ETH_ALEN },
+	[HSR_A_IFINDEX] = { .type = NLA_U32 },
+	[HSR_A_IF1_AGE] = { .type = NLA_U32 },
+	[HSR_A_IF2_AGE] = { .type = NLA_U32 },
+	[HSR_A_IF1_SEQ] = { .type = NLA_U16 },
+	[HSR_A_IF2_SEQ] = { .type = NLA_U16 },
+};
+
+static struct genl_family hsr_genl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = "HSR",
+	.version = 1,
+	.maxattr = HSR_A_MAX,
+};
+
+static struct genl_multicast_group hsr_network_genl_mcgrp = {
+	.name = "hsr-network",
+};
+
+
+
+/* This is called if for some node with MAC address addr, we only get frames
+ * over one of the slave interfaces. This would indicate an open network ring
+ * (i.e. a link has failed somewhere).
+ */
+void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN],
+		      enum hsr_dev_idx dev_idx)
+{
+	struct sk_buff *skb;
+	void *msg_head;
+	int res;
+	int ifindex;
+
+	skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb)
+		goto fail;
+
+	msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, HSR_C_RING_ERROR);
+	if (!msg_head)
+		goto nla_put_failure;
+
+	res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+	if (res < 0)
+		goto nla_put_failure;
+
+	if (hsr_priv->slave[dev_idx])
+		ifindex = hsr_priv->slave[dev_idx]->ifindex;
+	else
+		ifindex = -1;
+	res = nla_put_u32(skb, HSR_A_IFINDEX, ifindex);
+	if (res < 0)
+		goto nla_put_failure;
+
+	genlmsg_end(skb, msg_head);
+	genlmsg_multicast(skb, 0, hsr_network_genl_mcgrp.id, GFP_ATOMIC);
+
+	return;
+
+nla_put_failure:
+	kfree_skb(skb);
+
+fail:
+	netdev_warn(hsr_priv->dev, "Could not send HSR ring error message\n");
+}
+
+/* This is called when we haven't heard from the node with MAC address addr for
+ * some time (just before the node is removed from the node table/list).
+ */
+void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN])
+{
+	struct sk_buff *skb;
+	void *msg_head;
+	int res;
+
+	skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb)
+		goto fail;
+
+	msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, HSR_C_NODE_DOWN);
+	if (!msg_head)
+		goto nla_put_failure;
+
+
+	res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+	if (res < 0)
+		goto nla_put_failure;
+
+	genlmsg_end(skb, msg_head);
+	genlmsg_multicast(skb, 0, hsr_network_genl_mcgrp.id, GFP_ATOMIC);
+
+	return;
+
+nla_put_failure:
+	kfree_skb(skb);
+
+fail:
+	netdev_warn(hsr_priv->dev, "Could not send HSR node down\n");
+}
+
+
+/* HSR_C_GET_NODE_STATUS lets userspace query the internal HSR node table
+ * about the status of a specific node in the network, defined by its MAC
+ * address.
+ *
+ * Input: hsr ifindex, node mac address
+ * Output: hsr ifindex, node mac address (copied from request),
+ *	   age of latest frame from node over slave 1, slave 2 [ms]
+ */
+static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
+{
+	/* For receiving */
+	struct nlattr *na;
+	struct net_device *hsr_dev;
+
+	/* For sending */
+	struct sk_buff *skb_out;
+	void *msg_head;
+	struct hsr_priv *hsr_priv;
+	unsigned char hsr_node_addr_b[ETH_ALEN];
+	int hsr_node_if1_age;
+	u16 hsr_node_if1_seq;
+	int hsr_node_if2_age;
+	u16 hsr_node_if2_seq;
+	int addr_b_ifindex;
+	int res;
+
+	if (!info)
+		goto invalid;
+
+	na = info->attrs[HSR_A_IFINDEX];
+	if (!na)
+		goto invalid;
+	na = info->attrs[HSR_A_NODE_ADDR];
+	if (!na)
+		goto invalid;
+
+	hsr_dev = __dev_get_by_index(genl_info_net(info),
+					nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+	if (!hsr_dev)
+		goto invalid;
+	if (!is_hsr_master(hsr_dev))
+		goto invalid;
+
+
+	/* Send reply */
+
+	skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb_out) {
+		res = -ENOMEM;
+		goto fail;
+	}
+
+	msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
+				info->snd_seq, &hsr_genl_family, 0,
+				HSR_C_SET_NODE_STATUS);
+	if (!msg_head) {
+		res = -ENOMEM;
+		goto nla_put_failure;
+	}
+
+	res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+	if (res < 0)
+		goto nla_put_failure;
+
+	hsr_priv = netdev_priv(hsr_dev);
+	res = hsr_get_node_data(hsr_priv,
+			(unsigned char *) nla_data(info->attrs[HSR_A_NODE_ADDR]),
+			hsr_node_addr_b,
+			&addr_b_ifindex,
+			&hsr_node_if1_age,
+			&hsr_node_if1_seq,
+			&hsr_node_if2_age,
+			&hsr_node_if2_seq);
+	if (res < 0)
+		goto fail;
+
+	res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN,
+					nla_data(info->attrs[HSR_A_NODE_ADDR]));
+	if (res < 0)
+		goto nla_put_failure;
+
+	if (addr_b_ifindex > -1) {
+		res = nla_put(skb_out, HSR_A_NODE_ADDR_B, ETH_ALEN,
+								hsr_node_addr_b);
+		if (res < 0)
+			goto nla_put_failure;
+
+		res = nla_put_u32(skb_out, HSR_A_ADDR_B_IFINDEX, addr_b_ifindex);
+		if (res < 0)
+			goto nla_put_failure;
+	}
+
+	res = nla_put_u32(skb_out, HSR_A_IF1_AGE, hsr_node_if1_age);
+	if (res < 0)
+		goto nla_put_failure;
+	res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq);
+	if (res < 0)
+		goto nla_put_failure;
+	if (hsr_priv->slave[0])
+		res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX,
+						hsr_priv->slave[0]->ifindex);
+	if (res < 0)
+		goto nla_put_failure;
+
+	res = nla_put_u32(skb_out, HSR_A_IF2_AGE, hsr_node_if2_age);
+	if (res < 0)
+		goto nla_put_failure;
+	res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq);
+	if (res < 0)
+		goto nla_put_failure;
+	if (hsr_priv->slave[1])
+		res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX,
+						hsr_priv->slave[1]->ifindex);
+
+	genlmsg_end(skb_out, msg_head);
+	genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
+
+	return 0;
+
+invalid:
+	netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL);
+	return 0;
+
+nla_put_failure:
+	kfree_skb(skb_out);
+	/* Fall through */
+
+fail:
+	return res;
+}
+
+static struct genl_ops hsr_ops_get_node_status = {
+	.cmd = HSR_C_GET_NODE_STATUS,
+	.flags = 0,
+	.policy = hsr_genl_policy,
+	.doit = hsr_get_node_status,
+	.dumpit = NULL,
+};
+
+
+/* Get a list of MacAddressA of all nodes known to this node (other than self).
+ */
+static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
+{
+	/* For receiving */
+	struct nlattr *na;
+	struct net_device *hsr_dev;
+
+	/* For sending */
+	struct sk_buff *skb_out;
+	void *msg_head;
+	struct hsr_priv *hsr_priv;
+	void *pos;
+	unsigned char addr[ETH_ALEN];
+	int res;
+
+	if (!info)
+		goto invalid;
+
+	na = info->attrs[HSR_A_IFINDEX];
+	if (!na)
+		goto invalid;
+
+	hsr_dev = __dev_get_by_index(genl_info_net(info),
+				     nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+	if (!hsr_dev)
+		goto invalid;
+	if (!is_hsr_master(hsr_dev))
+		goto invalid;
+
+
+	/* Send reply */
+
+	skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb_out) {
+		res = -ENOMEM;
+		goto fail;
+	}
+
+	msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
+				info->snd_seq, &hsr_genl_family, 0,
+				HSR_C_SET_NODE_LIST);
+	if (!msg_head) {
+		res = -ENOMEM;
+		goto nla_put_failure;
+	}
+
+	res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+	if (res < 0)
+		goto nla_put_failure;
+
+	hsr_priv = netdev_priv(hsr_dev);
+
+	rcu_read_lock();
+	pos = hsr_get_next_node(hsr_priv, NULL, addr);
+	while (pos) {
+		res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+		if (res < 0) {
+			rcu_read_unlock();
+			goto nla_put_failure;
+		}
+		pos = hsr_get_next_node(hsr_priv, pos, addr);
+	}
+	rcu_read_unlock();
+
+	genlmsg_end(skb_out, msg_head);
+	genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
+
+	return 0;
+
+invalid:
+	netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL);
+	return 0;
+
+nla_put_failure:
+	kfree_skb(skb_out);
+	/* Fall through */
+
+fail:
+	return res;
+}
+
+
+static struct genl_ops hsr_ops_get_node_list = {
+	.cmd = HSR_C_GET_NODE_LIST,
+	.flags = 0,
+	.policy = hsr_genl_policy,
+	.doit = hsr_get_node_list,
+	.dumpit = NULL,
+};
+
+int __init hsr_netlink_init(void)
+{
+	int rc;
+
+	rc = rtnl_link_register(&hsr_link_ops);
+	if (rc)
+		goto fail_rtnl_link_register;
+
+	rc = genl_register_family(&hsr_genl_family);
+	if (rc)
+		goto fail_genl_register_family;
+
+	rc = genl_register_ops(&hsr_genl_family, &hsr_ops_get_node_status);
+	if (rc)
+		goto fail_genl_register_ops;
+
+	rc = genl_register_ops(&hsr_genl_family, &hsr_ops_get_node_list);
+	if (rc)
+		goto fail_genl_register_ops_node_list;
+
+	rc = genl_register_mc_group(&hsr_genl_family, &hsr_network_genl_mcgrp);
+	if (rc)
+		goto fail_genl_register_mc_group;
+
+	return 0;
+
+fail_genl_register_mc_group:
+	genl_unregister_ops(&hsr_genl_family, &hsr_ops_get_node_list);
+fail_genl_register_ops_node_list:
+	genl_unregister_ops(&hsr_genl_family, &hsr_ops_get_node_status);
+fail_genl_register_ops:
+	genl_unregister_family(&hsr_genl_family);
+fail_genl_register_family:
+	rtnl_link_unregister(&hsr_link_ops);
+fail_rtnl_link_register:
+
+	return rc;
+}
+
+void __exit hsr_netlink_exit(void)
+{
+	genl_unregister_mc_group(&hsr_genl_family, &hsr_network_genl_mcgrp);
+	genl_unregister_ops(&hsr_genl_family, &hsr_ops_get_node_status);
+	genl_unregister_family(&hsr_genl_family);
+
+	rtnl_link_unregister(&hsr_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("hsr");
diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h
new file mode 100644
index 000000000000..d4579dcc3c7d
--- /dev/null
+++ b/net/hsr/hsr_netlink.h
@@ -0,0 +1,30 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ *	2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ */
+
+#ifndef __HSR_NETLINK_H
+#define __HSR_NETLINK_H
+
+#include <linux/if_ether.h>
+#include <linux/module.h>
+#include <uapi/linux/hsr_netlink.h>
+
+struct hsr_priv;
+
+int __init hsr_netlink_init(void);
+void __exit hsr_netlink_exit(void);
+
+void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN],
+		      int dev_idx);
+void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN]);
+void hsr_nl_framedrop(int dropcount, int dev_idx);
+void hsr_nl_linkdown(int dev_idx);
+
+#endif /* __HSR_NETLINK_H */
-- 
cgit v1.2.3


From 482fc6094afad572a4ea1fd722e7b11ca72022a0 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Tue, 5 Nov 2013 02:24:17 +0100
Subject: ipv4: introduce new IP_MTU_DISCOVER mode IP_PMTUDISC_INTERFACE

Sockets marked with IP_PMTUDISC_INTERFACE won't do path mtu discovery,
their sockets won't accept and install new path mtu information and they
will always use the interface mtu for outgoing packets. It is guaranteed
that the packet is not fragmented locally. But we won't set the DF-Flag
on the outgoing frames.

Florian Weimer had the idea to use this flag to ensure DNS servers are
never generating outgoing fragments. They may well be fragmented on the
path, but the server never stores or usees path mtu values, which could
well be forged in an attack.

(The root of the problem with path MTU discovery is that there is
no reliable way to authenticate ICMP Fragmentation Needed But DF Set
messages because they are sent from intermediate routers with their
source addresses, and the IMCP payload will not always contain sufficient
information to identify a flow.)

Recent research in the DNS community showed that it is possible to
implement an attack where DNS cache poisoning is feasible by spoofing
fragments. This work was done by Amir Herzberg and Haya Shulman:
<https://sites.google.com/site/hayashulman/files/fragmentation-poisoning.pdf>

This issue was previously discussed among the DNS community, e.g.
<http://www.ietf.org/mail-archive/web/dnsext/current/msg01204.html>,
without leading to fixes.

This patch depends on the patch "ipv4: fix DO and PROBE pmtu mode
regarding local fragmentation with UFO/CORK" for the enforcement of the
non-fragmentable checks. If other users than ip_append_page/data should
use this semantic too, we have to add a new flag to IPCB(skb)->flags to
suppress local fragmentation and check for this in ip_finish_output.

Many thanks to Florian Weimer for the idea and feedback while implementing
this patch.

Cc: David S. Miller <davem@davemloft.net>
Suggested-by: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     | 16 ++++++++++++----
 include/uapi/linux/in.h |  5 +++++
 net/dccp/ipv4.c         |  1 +
 net/ipv4/ip_output.c    |  8 ++++----
 net/ipv4/ip_sockglue.c  |  2 +-
 net/ipv4/route.c        |  4 ++++
 net/ipv4/tcp_ipv4.c     |  1 +
 7 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/route.h b/include/net/route.h
index dd4ae0029fd8..f68c167280a7 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -313,12 +313,20 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 	return hoplimit;
 }
 
-static inline int ip_skb_dst_mtu(struct sk_buff *skb)
+static inline bool ip_sk_accept_pmtu(const struct sock *sk)
 {
-	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
+	return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE;
+}
 
-	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
-	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
+static inline bool ip_sk_use_pmtu(const struct sock *sk)
+{
+	return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE;
+}
+
+static inline int ip_skb_dst_mtu(const struct sk_buff *skb)
+{
+	return (!skb->sk || ip_sk_use_pmtu(skb->sk)) ?
+	       dst_mtu(skb_dst(skb)) : skb_dst(skb)->dev->mtu;
 }
 
 #endif	/* _ROUTE_H */
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index f9e8e496ae5d..393c5de09d42 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -115,6 +115,11 @@ struct in_addr {
 #define IP_PMTUDISC_WANT		1	/* Use per route hints	*/
 #define IP_PMTUDISC_DO			2	/* Always DF		*/
 #define IP_PMTUDISC_PROBE		3       /* Ignore dst pmtu      */
+/* Always use interface mtu (ignores dst pmtu) but don't set DF flag.
+ * Also incoming ICMP frag_needed notifications will be ignored on
+ * this socket to prevent accepting spoofed ones.
+ */
+#define IP_PMTUDISC_INTERFACE		4
 
 #define IP_MULTICAST_IF			32
 #define IP_MULTICAST_TTL 		33
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 720c36225ed9..d9f65fc66db5 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -174,6 +174,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
 	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+	    ip_sk_accept_pmtu(sk) &&
 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 		dccp_sync_mss(sk, mtu);
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 51be64e18e32..912402752f2f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1037,7 +1037,6 @@ error:
 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 			 struct ipcm_cookie *ipc, struct rtable **rtp)
 {
-	struct inet_sock *inet = inet_sk(sk);
 	struct ip_options_rcu *opt;
 	struct rtable *rt;
 
@@ -1063,8 +1062,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	 * We steal reference to this route, caller should not release it
 	 */
 	*rtp = NULL;
-	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
-			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
+	cork->fragsize = ip_sk_use_pmtu(sk) ?
+			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->ttl = ipc->ttl;
@@ -1315,7 +1314,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
+	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
 	    (skb->len <= dst_mtu(&rt->dst) &&
 	     ip_dont_fragment(sk, &rt->dst)))
 		df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0626f2cb192e..3f858266fa7e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -627,7 +627,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->nodefrag = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
+		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d2d325382b13..f428935c50db 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1036,6 +1036,10 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 	bool new = false;
 
 	bh_lock_sock(sk);
+
+	if (!ip_sk_accept_pmtu(sk))
+		goto out;
+
 	rt = (struct rtable *) __sk_dst_get(sk);
 
 	if (sock_owned_by_user(sk) || !rt) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 300ab2c93f29..14bba8a1c5a7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -288,6 +288,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
 	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+	    ip_sk_accept_pmtu(sk) &&
 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 		tcp_sync_mss(sk, mtu);
 
-- 
cgit v1.2.3


From f83c3838b9146b891d0405d3a83660e8f6aed02f Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel.garcia@free-electrons.com>
Date: Sun, 13 Oct 2013 18:05:23 -0300
Subject: mtd: Move major number definitions to major.h

This patch moves the char and block major number definitions
to major.h to be with the rest of the major numbers.
While doing this, include major.h in the files that need it.

Signed-off-by: Ezequiel Garcia <ezequiel.garcia@free-electrons.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 drivers/mtd/devices/block2mtd.c | 1 +
 drivers/mtd/mtdblock.c          | 1 +
 drivers/mtd/mtdblock_ro.c       | 1 +
 drivers/mtd/mtdchar.c           | 1 +
 drivers/mtd/mtdcore.c           | 1 +
 drivers/mtd/mtdsuper.c          | 1 +
 drivers/mtd/ubi/build.c         | 1 +
 include/linux/mtd/mtd.h         | 3 ---
 include/uapi/linux/major.h      | 2 ++
 9 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index 5cb4c04726b2..d9fd87a4c8dc 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -20,6 +20,7 @@
 #include <linux/mutex.h>
 #include <linux/mount.h>
 #include <linux/slab.h>
+#include <linux/major.h>
 
 /* Info for the block device */
 struct block2mtd_dev {
diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c
index 53884cc25916..485ea751c7f9 100644
--- a/drivers/mtd/mtdblock.c
+++ b/drivers/mtd/mtdblock.c
@@ -32,6 +32,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/blktrans.h>
 #include <linux/mutex.h>
+#include <linux/major.h>
 
 
 struct mtdblk_dev {
diff --git a/drivers/mtd/mtdblock_ro.c b/drivers/mtd/mtdblock_ro.c
index 70d27b4d632b..fb5dc89369de 100644
--- a/drivers/mtd/mtdblock_ro.c
+++ b/drivers/mtd/mtdblock_ro.c
@@ -24,6 +24,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/blktrans.h>
 #include <linux/module.h>
+#include <linux/major.h>
 
 static int mtdblock_readsect(struct mtd_blktrans_dev *dev,
 			      unsigned long block, char *buf)
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 684bfa39e4ee..9aa0c5e49c1d 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -32,6 +32,7 @@
 #include <linux/mount.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
+#include <linux/major.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/map.h>
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 92311a56939f..7189089d87e3 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -37,6 +37,7 @@
 #include <linux/backing-dev.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
+#include <linux/major.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index 334da5f583c0..20c02a3b7417 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -17,6 +17,7 @@
 #include <linux/export.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
+#include <linux/major.h>
 
 /*
  * compare superblocks to see if they're equivalent
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
index 315dcc6ec1f5..e05dc6298c1d 100644
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/major.h>
 #include "ubi.h"
 
 /* Maximum length of the 'mtd=' parameter */
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 88409b813418..8cc0e2fb6894 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -29,9 +29,6 @@
 
 #include <asm/div64.h>
 
-#define MTD_CHAR_MAJOR 90
-#define MTD_BLOCK_MAJOR 31
-
 #define MTD_ERASE_PENDING	0x01
 #define MTD_ERASING		0x02
 #define MTD_ERASE_SUSPEND	0x04
diff --git a/include/uapi/linux/major.h b/include/uapi/linux/major.h
index 6a8ca98c9a96..620252e69b44 100644
--- a/include/uapi/linux/major.h
+++ b/include/uapi/linux/major.h
@@ -54,6 +54,7 @@
 #define ACSI_MAJOR		28
 #define AZTECH_CDROM_MAJOR	29
 #define FB_MAJOR		29   /* /dev/fb* framebuffers */
+#define MTD_BLOCK_MAJOR		31
 #define CM206_CDROM_MAJOR	32
 #define IDE2_MAJOR		33
 #define IDE3_MAJOR		34
@@ -105,6 +106,7 @@
 #define IDE6_MAJOR		88
 #define IDE7_MAJOR		89
 #define IDE8_MAJOR		90
+#define MTD_CHAR_MAJOR		90
 #define IDE9_MAJOR		91
 
 #define DASD_MAJOR		94
-- 
cgit v1.2.3


From a6cc0cfa72e0b6d9f2c8fd858aacc32313c4f272 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Wed, 6 Nov 2013 09:54:46 -0800
Subject: net: Add layer 2 hardware acceleration operations for macvlan devices

Add a operations structure that allows a network interface to export
the fact that it supports package forwarding in hardware between
physical interfaces and other mac layer devices assigned to it (such
as macvlans). This operaions structure can be used by virtual mac
devices to bypass software switching so that forwarding can be done
in hardware more efficiently.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c           | 36 +++++++++++++++++++++++++++++++++++-
 include/linux/if_macvlan.h      |  1 +
 include/linux/netdev_features.h |  2 ++
 include/linux/netdevice.h       | 36 +++++++++++++++++++++++++++++++++++-
 include/uapi/linux/if.h         |  1 +
 net/core/dev.c                  | 18 +++++++++++++-----
 net/core/ethtool.c              |  1 +
 net/sched/sch_generic.c         |  2 +-
 8 files changed, 89 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index cc9845ec91c1..af4aaa5893ff 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -297,7 +297,13 @@ netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 	int ret;
 	const struct macvlan_dev *vlan = netdev_priv(dev);
 
-	ret = macvlan_queue_xmit(skb, dev);
+	if (vlan->fwd_priv) {
+		skb->dev = vlan->lowerdev;
+		ret = dev_hard_start_xmit(skb, skb->dev, NULL, vlan->fwd_priv);
+	} else {
+		ret = macvlan_queue_xmit(skb, dev);
+	}
+
 	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
 		struct macvlan_pcpu_stats *pcpu_stats;
 
@@ -347,6 +353,21 @@ static int macvlan_open(struct net_device *dev)
 		goto hash_add;
 	}
 
+	if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) {
+		vlan->fwd_priv =
+		      lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);
+
+		/* If we get a NULL pointer back, or if we get an error
+		 * then we should just fall through to the non accelerated path
+		 */
+		if (IS_ERR_OR_NULL(vlan->fwd_priv)) {
+			vlan->fwd_priv = NULL;
+		} else {
+			dev->features &= ~NETIF_F_LLTX;
+			return 0;
+		}
+	}
+
 	err = -EBUSY;
 	if (macvlan_addr_busy(vlan->port, dev->dev_addr))
 		goto out;
@@ -367,6 +388,11 @@ hash_add:
 del_unicast:
 	dev_uc_del(lowerdev, dev->dev_addr);
 out:
+	if (vlan->fwd_priv) {
+		lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
+							   vlan->fwd_priv);
+		vlan->fwd_priv = NULL;
+	}
 	return err;
 }
 
@@ -375,6 +401,13 @@ static int macvlan_stop(struct net_device *dev)
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct net_device *lowerdev = vlan->lowerdev;
 
+	if (vlan->fwd_priv) {
+		lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
+							   vlan->fwd_priv);
+		vlan->fwd_priv = NULL;
+		return 0;
+	}
+
 	dev_uc_unsync(lowerdev, dev);
 	dev_mc_unsync(lowerdev, dev);
 
@@ -833,6 +866,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 	if (err < 0)
 		goto destroy_port;
 
+	dev->priv_flags |= IFF_MACVLAN;
 	err = netdev_upper_dev_link(lowerdev, dev);
 	if (err)
 		goto destroy_port;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index ddd33fd5904d..c2702856295e 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -61,6 +61,7 @@ struct macvlan_dev {
 	struct hlist_node	hlist;
 	struct macvlan_port	*port;
 	struct net_device	*lowerdev;
+	void			*fwd_priv;
 	struct macvlan_pcpu_stats __percpu *pcpu_stats;
 
 	DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index b05a4b501ab5..1005ebf17575 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -62,6 +62,7 @@ enum {
 	NETIF_F_HW_VLAN_STAG_TX_BIT,	/* Transmit VLAN STAG HW acceleration */
 	NETIF_F_HW_VLAN_STAG_RX_BIT,	/* Receive VLAN STAG HW acceleration */
 	NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
+	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
 
 	/*
 	 * Add your fresh new feature above and remember to update
@@ -116,6 +117,7 @@ enum {
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
+#define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
 
 /* Features valid for ethtool to change */
 /* = all defined minus driver/device-class-related */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b6f6efbcfc74..15fa01c9a3bf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -962,6 +962,25 @@ struct netdev_phys_port_id {
  *	Called by vxlan to notify the driver about a UDP port and socket
  *	address family that vxlan is not listening to anymore. The operation
  *	is protected by the vxlan_net->sock_lock.
+ *
+ * void* (*ndo_dfwd_add_station)(struct net_device *pdev,
+ *				 struct net_device *dev)
+ *	Called by upper layer devices to accelerate switching or other
+ *	station functionality into hardware. 'pdev is the lowerdev
+ *	to use for the offload and 'dev' is the net device that will
+ *	back the offload. Returns a pointer to the private structure
+ *	the upper layer will maintain.
+ * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
+ *	Called by upper layer device to delete the station created
+ *	by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
+ *	the station and priv is the structure returned by the add
+ *	operation.
+ * netdev_tx_t (*ndo_dfwd_start_xmit)(struct sk_buff *skb,
+ *				      struct net_device *dev,
+ *				      void *priv);
+ *	Callback to use for xmit over the accelerated station. This
+ *	is used in place of ndo_start_xmit on accelerated net
+ *	devices.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1098,6 +1117,15 @@ struct net_device_ops {
 	void			(*ndo_del_vxlan_port)(struct  net_device *dev,
 						      sa_family_t sa_family,
 						      __be16 port);
+
+	void*			(*ndo_dfwd_add_station)(struct net_device *pdev,
+							struct net_device *dev);
+	void			(*ndo_dfwd_del_station)(struct net_device *pdev,
+							void *priv);
+
+	netdev_tx_t		(*ndo_dfwd_start_xmit) (struct sk_buff *skb,
+							struct net_device *dev,
+							void *priv);
 };
 
 /*
@@ -1195,6 +1223,7 @@ struct net_device {
 	/* Management operations */
 	const struct net_device_ops *netdev_ops;
 	const struct ethtool_ops *ethtool_ops;
+	const struct forwarding_accel_ops *fwd_ops;
 
 	/* Hardware header description */
 	const struct header_ops *header_ops;
@@ -2388,7 +2417,7 @@ int dev_change_carrier(struct net_device *, bool new_carrier);
 int dev_get_phys_port_id(struct net_device *dev,
 			 struct netdev_phys_port_id *ppid);
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
-			struct netdev_queue *txq);
+			struct netdev_queue *txq, void *accel_priv);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 
 extern int		netdev_budget;
@@ -2967,6 +2996,11 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
+static inline bool netif_is_macvlan(struct net_device *dev)
+{
+	return dev->priv_flags & IFF_MACVLAN;
+}
+
 static inline bool netif_is_bond_master(struct net_device *dev)
 {
 	return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
index 1ec407b01e46..d758163b0e43 100644
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -83,6 +83,7 @@
 #define IFF_SUPP_NOFCS	0x80000		/* device supports sending custom FCS */
 #define IFF_LIVE_ADDR_CHANGE 0x100000	/* device supports hardware address
 					 * change when it's running */
+#define IFF_MACVLAN 0x200000		/* Macvlan device */
 
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
diff --git a/net/core/dev.c b/net/core/dev.c
index 0e6136546a8c..8ffc52e01ece 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2538,7 +2538,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb,
 }
 
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
-			struct netdev_queue *txq)
+			struct netdev_queue *txq, void *accel_priv)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int rc = NETDEV_TX_OK;
@@ -2604,9 +2604,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			dev_queue_xmit_nit(skb, dev);
 
 		skb_len = skb->len;
-		rc = ops->ndo_start_xmit(skb, dev);
+		if (accel_priv)
+			rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv);
+		else
+			rc = ops->ndo_start_xmit(skb, dev);
+
 		trace_net_dev_xmit(skb, rc, dev, skb_len);
-		if (rc == NETDEV_TX_OK)
+		if (rc == NETDEV_TX_OK && txq)
 			txq_trans_update(txq);
 		return rc;
 	}
@@ -2622,7 +2626,10 @@ gso:
 			dev_queue_xmit_nit(nskb, dev);
 
 		skb_len = nskb->len;
-		rc = ops->ndo_start_xmit(nskb, dev);
+		if (accel_priv)
+			rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv);
+		else
+			rc = ops->ndo_start_xmit(nskb, dev);
 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
@@ -2647,6 +2654,7 @@ out_kfree_skb:
 out:
 	return rc;
 }
+EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
 
 static void qdisc_pkt_len_init(struct sk_buff *skb)
 {
@@ -2854,7 +2862,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 			if (!netif_xmit_stopped(txq)) {
 				__this_cpu_inc(xmit_recursion);
-				rc = dev_hard_start_xmit(skb, dev, txq);
+				rc = dev_hard_start_xmit(skb, dev, txq, NULL);
 				__this_cpu_dec(xmit_recursion);
 				if (dev_xmit_complete(rc)) {
 					HARD_TX_UNLOCK(dev, txq);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 862989898f61..30071dec287a 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -96,6 +96,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_LOOPBACK_BIT] =         "loopback",
 	[NETIF_F_RXFCS_BIT] =            "rx-fcs",
 	[NETIF_F_RXALL_BIT] =            "rx-all",
+	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
 };
 
 static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7fc899a943a8..922a09406ba7 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -126,7 +126,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
 	if (!netif_xmit_frozen_or_stopped(txq))
-		ret = dev_hard_start_xmit(skb, dev, txq);
+		ret = dev_hard_start_xmit(skb, dev, txq, NULL);
 
 	HARD_TX_UNLOCK(dev, txq);
 
-- 
cgit v1.2.3


From a33c4a2663c19ac01e557d6b78806271eec2a150 Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Fri, 8 Nov 2013 10:23:34 +0800
Subject: net_sched: tbf: support of 64bit rates

With psched_ratecfg_precompute(), tbf can deal with 64bit rates.
Add two new attributes so that tc can use them to break the 32bit
limit.

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Suggested-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  2 ++
 net/sched/sch_tbf.c            | 22 ++++++++++++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index f2624b549e61..307f293477e8 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -171,6 +171,8 @@ enum {
 	TCA_TBF_PARMS,
 	TCA_TBF_RTAB,
 	TCA_TBF_PTAB,
+	TCA_TBF_RATE64,
+	TCA_TBF_PRATE64,
 	__TCA_TBF_MAX,
 };
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index b0571224f3c9..68f98595819c 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -266,20 +266,23 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
 	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
 	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
+	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
 };
 
 static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 {
 	int err;
 	struct tbf_sched_data *q = qdisc_priv(sch);
-	struct nlattr *tb[TCA_TBF_PTAB + 1];
+	struct nlattr *tb[TCA_TBF_MAX + 1];
 	struct tc_tbf_qopt *qopt;
 	struct qdisc_rate_table *rtab = NULL;
 	struct qdisc_rate_table *ptab = NULL;
 	struct Qdisc *child = NULL;
 	int max_size, n;
+	u64 rate64 = 0, prate64 = 0;
 
-	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
+	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
 	if (err < 0)
 		return err;
 
@@ -341,9 +344,13 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 	q->tokens = q->buffer;
 	q->ptokens = q->mtu;
 
-	psched_ratecfg_precompute(&q->rate, &rtab->rate, 0);
+	if (tb[TCA_TBF_RATE64])
+		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
+	psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64);
 	if (ptab) {
-		psched_ratecfg_precompute(&q->peak, &ptab->rate, 0);
+		if (tb[TCA_TBF_PRATE64])
+			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
+		psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64);
 		q->peak_present = true;
 	} else {
 		q->peak_present = false;
@@ -402,6 +409,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opt.buffer = PSCHED_NS2TICKS(q->buffer);
 	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
+	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
+	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
+		goto nla_put_failure;
+	if (q->peak_present &&
+	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
+	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
+		goto nla_put_failure;
 
 	nla_nest_end(skb, nest);
 	return skb->len;
-- 
cgit v1.2.3


From 2c140a246dc0bc085b98eddde978060fcec1080c Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 1 Nov 2013 18:27:41 -0400
Subject: dm: allow remove to be deferred

This patch allows the removal of an open device to be deferred until
it is closed.  (Previously such a removal attempt would fail.)

The deferred remove functionality is enabled by setting the flag
DM_DEFERRED_REMOVE in the ioctl structure on DM_DEV_REMOVE or
DM_REMOVE_ALL ioctl.

On return from DM_DEV_REMOVE, the flag DM_DEFERRED_REMOVE indicates if
the device was removed immediately or flagged to be removed on close -
if the flag is clear, the device was removed.

On return from DM_DEV_STATUS and other ioctls, the flag
DM_DEFERRED_REMOVE is set if the device is scheduled to be removed on
closure.

A device that is scheduled to be deleted can be revived using the
message "@cancel_deferred_remove". This message clears the
DMF_DEFERRED_REMOVE flag so that the device won't be deleted on close.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-ioctl.c         | 36 +++++++++++++++++++++++++++------
 drivers/md/dm.c               | 47 ++++++++++++++++++++++++++++++++++++++++---
 drivers/md/dm.h               | 13 +++++++++++-
 include/uapi/linux/dm-ioctl.h | 15 ++++++++++++--
 4 files changed, 99 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index afe08146f73e..51521429fb59 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -57,7 +57,7 @@ struct vers_iter {
 static struct list_head _name_buckets[NUM_BUCKETS];
 static struct list_head _uuid_buckets[NUM_BUCKETS];
 
-static void dm_hash_remove_all(int keep_open_devices);
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred);
 
 /*
  * Guards access to both hash tables.
@@ -86,7 +86,7 @@ static int dm_hash_init(void)
 
 static void dm_hash_exit(void)
 {
-	dm_hash_remove_all(0);
+	dm_hash_remove_all(false, false, false);
 }
 
 /*-----------------------------------------------------------------
@@ -276,7 +276,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc)
 	return table;
 }
 
-static void dm_hash_remove_all(int keep_open_devices)
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred)
 {
 	int i, dev_skipped;
 	struct hash_cell *hc;
@@ -293,7 +293,8 @@ retry:
 			md = hc->md;
 			dm_get(md);
 
-			if (keep_open_devices && dm_lock_for_deletion(md)) {
+			if (keep_open_devices &&
+			    dm_lock_for_deletion(md, mark_deferred, only_deferred)) {
 				dm_put(md);
 				dev_skipped++;
 				continue;
@@ -450,6 +451,11 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	return md;
 }
 
+void dm_deferred_remove(void)
+{
+	dm_hash_remove_all(true, false, true);
+}
+
 /*-----------------------------------------------------------------
  * Implementation of the ioctl commands
  *---------------------------------------------------------------*/
@@ -461,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
 
 static int remove_all(struct dm_ioctl *param, size_t param_size)
 {
-	dm_hash_remove_all(1);
+	dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
 	param->data_size = 0;
 	return 0;
 }
@@ -683,6 +689,9 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	if (dm_suspended_md(md))
 		param->flags |= DM_SUSPEND_FLAG;
 
+	if (dm_test_deferred_remove_flag(md))
+		param->flags |= DM_DEFERRED_REMOVE;
+
 	param->dev = huge_encode_dev(disk_devt(disk));
 
 	/*
@@ -832,8 +841,13 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 	/*
 	 * Ensure the device is not open and nothing further can open it.
 	 */
-	r = dm_lock_for_deletion(md);
+	r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false);
 	if (r) {
+		if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) {
+			up_write(&_hash_lock);
+			dm_put(md);
+			return 0;
+		}
 		DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
 		up_write(&_hash_lock);
 		dm_put(md);
@@ -848,6 +862,8 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 		dm_table_destroy(t);
 	}
 
+	param->flags &= ~DM_DEFERRED_REMOVE;
+
 	if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
 		param->flags |= DM_UEVENT_GENERATED_FLAG;
 
@@ -1469,6 +1485,14 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
 	if (**argv != '@')
 		return 2; /* no '@' prefix, deliver to target */
 
+	if (!strcasecmp(argv[0], "@cancel_deferred_remove")) {
+		if (argc != 1) {
+			DMERR("Invalid arguments for @cancel_deferred_remove");
+			return -EINVAL;
+		}
+		return dm_cancel_deferred_remove(md);
+	}
+
 	r = dm_stats_message(md, argc, argv, result, maxlen);
 	if (r < 2)
 		return r;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b3e26c7d1417..0704c523a76b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -49,6 +49,11 @@ static unsigned int _major = 0;
 static DEFINE_IDR(_minor_idr);
 
 static DEFINE_SPINLOCK(_minor_lock);
+
+static void do_deferred_remove(struct work_struct *w);
+
+static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
+
 /*
  * For bio-based dm.
  * One of these is allocated per bio.
@@ -116,6 +121,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
 #define DMF_MERGE_IS_OPTIONAL 6
+#define DMF_DEFERRED_REMOVE 7
 
 /*
  * A dummy definition to make RCU happy.
@@ -299,6 +305,8 @@ out_free_io_cache:
 
 static void local_exit(void)
 {
+	flush_scheduled_work();
+
 	kmem_cache_destroy(_rq_tio_cache);
 	kmem_cache_destroy(_io_cache);
 	unregister_blkdev(_major, _name);
@@ -404,7 +412,10 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 
 	spin_lock(&_minor_lock);
 
-	atomic_dec(&md->open_count);
+	if (atomic_dec_and_test(&md->open_count) &&
+	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
+		schedule_work(&deferred_remove_work);
+
 	dm_put(md);
 
 	spin_unlock(&_minor_lock);
@@ -418,14 +429,18 @@ int dm_open_count(struct mapped_device *md)
 /*
  * Guarantees nothing is using the device before it's deleted.
  */
-int dm_lock_for_deletion(struct mapped_device *md)
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 {
 	int r = 0;
 
 	spin_lock(&_minor_lock);
 
-	if (dm_open_count(md))
+	if (dm_open_count(md)) {
 		r = -EBUSY;
+		if (mark_deferred)
+			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
+	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
+		r = -EEXIST;
 	else
 		set_bit(DMF_DELETING, &md->flags);
 
@@ -434,6 +449,27 @@ int dm_lock_for_deletion(struct mapped_device *md)
 	return r;
 }
 
+int dm_cancel_deferred_remove(struct mapped_device *md)
+{
+	int r = 0;
+
+	spin_lock(&_minor_lock);
+
+	if (test_bit(DMF_DELETING, &md->flags))
+		r = -EBUSY;
+	else
+		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
+
+	spin_unlock(&_minor_lock);
+
+	return r;
+}
+
+static void do_deferred_remove(struct work_struct *w)
+{
+	dm_deferred_remove();
+}
+
 sector_t dm_get_size(struct mapped_device *md)
 {
 	return get_capacity(md->disk);
@@ -2894,6 +2930,11 @@ int dm_suspended_md(struct mapped_device *md)
 	return test_bit(DMF_SUSPENDED, &md->flags);
 }
 
+int dm_test_deferred_remove_flag(struct mapped_device *md)
+{
+	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
+}
+
 int dm_suspended(struct dm_target *ti)
 {
 	return dm_suspended_md(dm_table_get_md(ti->table));
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1d1ad7b7e527..c57ba550f69e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -128,6 +128,16 @@ int dm_deleting_md(struct mapped_device *md);
  */
 int dm_suspended_md(struct mapped_device *md);
 
+/*
+ * Test if the device is scheduled for deferred remove.
+ */
+int dm_test_deferred_remove_flag(struct mapped_device *md);
+
+/*
+ * Try to remove devices marked for deferred removal.
+ */
+void dm_deferred_remove(void);
+
 /*
  * The device-mapper can be driven through one of two interfaces;
  * ioctl or filesystem, depending which patch you have applied.
@@ -158,7 +168,8 @@ void dm_stripe_exit(void);
 void dm_destroy(struct mapped_device *md);
 void dm_destroy_immediate(struct mapped_device *md);
 int dm_open_count(struct mapped_device *md);
-int dm_lock_for_deletion(struct mapped_device *md);
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
+int dm_cancel_deferred_remove(struct mapped_device *md);
 int dm_request_based(struct mapped_device *md);
 sector_t dm_get_size(struct mapped_device *md);
 struct dm_stats *dm_get_stats(struct mapped_device *md);
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index f1e12bd40b3b..c8a4302093a3 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	26
+#define DM_VERSION_MINOR	27
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2013-08-15)"
+#define DM_VERSION_EXTRA	"-ioctl (2013-10-30)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
@@ -341,4 +341,15 @@ enum {
  */
 #define DM_DATA_OUT_FLAG		(1 << 16) /* Out */
 
+/*
+ * If set with DM_DEV_REMOVE or DM_REMOVE_ALL this indicates that if
+ * the device cannot be removed immediately because it is still in use
+ * it should instead be scheduled for removal when it gets closed.
+ *
+ * On return from DM_DEV_REMOVE, DM_DEV_STATUS or other ioctls, this
+ * flag indicates that the device is scheduled to be removed when it
+ * gets closed.
+ */
+#define DM_DEFERRED_REMOVE		(1 << 17) /* In/Out */
+
 #endif				/* _LINUX_DM_IOCTL_H */
-- 
cgit v1.2.3


From 81ab4190ac17df41686a37c97f701623276b652a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 31 Oct 2013 15:46:42 -0700
Subject: bcache: Pull on disk data structures out into a separate header

Now, the on disk data structures are in a header that can be exported to
userspace - and having them all centralized is nice too.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h  | 244 +----------------------------
 drivers/md/bcache/bset.c    |   4 +-
 drivers/md/bcache/bset.h    |  31 ----
 drivers/md/bcache/btree.c   |   2 +-
 drivers/md/bcache/journal.c |   4 +-
 drivers/md/bcache/journal.h |  37 -----
 drivers/md/bcache/request.c |   9 +-
 drivers/md/bcache/super.c   |  13 +-
 drivers/md/bcache/util.h    |  10 --
 include/uapi/linux/bcache.h | 373 ++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 387 insertions(+), 340 deletions(-)
 create mode 100644 include/uapi/linux/bcache.h

(limited to 'include/uapi/linux')

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index e32f6fd91755..045cb99f1ca6 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -177,6 +177,7 @@
 
 #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
 
+#include <linux/bcache.h>
 #include <linux/bio.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
@@ -210,168 +211,6 @@ BITMASK(GC_MARK,	 struct bucket, gc_mark, 0, 2);
 #define GC_MARK_METADATA	2
 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
 
-struct bkey {
-	uint64_t	high;
-	uint64_t	low;
-	uint64_t	ptr[];
-};
-
-/* Enough for a key with 6 pointers */
-#define BKEY_PAD		8
-
-#define BKEY_PADDED(key)					\
-	union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
-
-/* Version 0: Cache device
- * Version 1: Backing device
- * Version 2: Seed pointer into btree node checksum
- * Version 3: Cache device with new UUID format
- * Version 4: Backing device with data offset
- */
-#define BCACHE_SB_VERSION_CDEV			0
-#define BCACHE_SB_VERSION_BDEV			1
-#define BCACHE_SB_VERSION_CDEV_WITH_UUID	3
-#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET	4
-#define BCACHE_SB_MAX_VERSION			4
-
-#define SB_SECTOR		8
-#define SB_SIZE			4096
-#define SB_LABEL_SIZE		32
-#define SB_JOURNAL_BUCKETS	256U
-/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
-#define MAX_CACHES_PER_SET	8
-
-#define BDEV_DATA_START_DEFAULT	16	/* sectors */
-
-struct cache_sb {
-	uint64_t		csum;
-	uint64_t		offset;	/* sector where this sb was written */
-	uint64_t		version;
-
-	uint8_t			magic[16];
-
-	uint8_t			uuid[16];
-	union {
-		uint8_t		set_uuid[16];
-		uint64_t	set_magic;
-	};
-	uint8_t			label[SB_LABEL_SIZE];
-
-	uint64_t		flags;
-	uint64_t		seq;
-	uint64_t		pad[8];
-
-	union {
-	struct {
-		/* Cache devices */
-		uint64_t	nbuckets;	/* device size */
-
-		uint16_t	block_size;	/* sectors */
-		uint16_t	bucket_size;	/* sectors */
-
-		uint16_t	nr_in_set;
-		uint16_t	nr_this_dev;
-	};
-	struct {
-		/* Backing devices */
-		uint64_t	data_offset;
-
-		/*
-		 * block_size from the cache device section is still used by
-		 * backing devices, so don't add anything here until we fix
-		 * things to not need it for backing devices anymore
-		 */
-	};
-	};
-
-	uint32_t		last_mount;	/* time_t */
-
-	uint16_t		first_bucket;
-	union {
-		uint16_t	njournal_buckets;
-		uint16_t	keys;
-	};
-	uint64_t		d[SB_JOURNAL_BUCKETS];	/* journal buckets */
-};
-
-BITMASK(CACHE_SYNC,		struct cache_sb, flags, 0, 1);
-BITMASK(CACHE_DISCARD,		struct cache_sb, flags, 1, 1);
-BITMASK(CACHE_REPLACEMENT,	struct cache_sb, flags, 2, 3);
-#define CACHE_REPLACEMENT_LRU	0U
-#define CACHE_REPLACEMENT_FIFO	1U
-#define CACHE_REPLACEMENT_RANDOM 2U
-
-BITMASK(BDEV_CACHE_MODE,	struct cache_sb, flags, 0, 4);
-#define CACHE_MODE_WRITETHROUGH	0U
-#define CACHE_MODE_WRITEBACK	1U
-#define CACHE_MODE_WRITEAROUND	2U
-#define CACHE_MODE_NONE		3U
-BITMASK(BDEV_STATE,		struct cache_sb, flags, 61, 2);
-#define BDEV_STATE_NONE		0U
-#define BDEV_STATE_CLEAN	1U
-#define BDEV_STATE_DIRTY	2U
-#define BDEV_STATE_STALE	3U
-
-/* Version 1: Seed pointer into btree node checksum
- */
-#define BCACHE_BSET_VERSION	1
-
-/*
- * This is the on disk format for btree nodes - a btree node on disk is a list
- * of these; within each set the keys are sorted
- */
-struct bset {
-	uint64_t		csum;
-	uint64_t		magic;
-	uint64_t		seq;
-	uint32_t		version;
-	uint32_t		keys;
-
-	union {
-		struct bkey	start[0];
-		uint64_t	d[0];
-	};
-};
-
-/*
- * On disk format for priorities and gens - see super.c near prio_write() for
- * more.
- */
-struct prio_set {
-	uint64_t		csum;
-	uint64_t		magic;
-	uint64_t		seq;
-	uint32_t		version;
-	uint32_t		pad;
-
-	uint64_t		next_bucket;
-
-	struct bucket_disk {
-		uint16_t	prio;
-		uint8_t		gen;
-	} __attribute((packed)) data[];
-};
-
-struct uuid_entry {
-	union {
-		struct {
-			uint8_t		uuid[16];
-			uint8_t		label[32];
-			uint32_t	first_reg;
-			uint32_t	last_reg;
-			uint32_t	invalidated;
-
-			uint32_t	flags;
-			/* Size of flash only volumes */
-			uint64_t	sectors;
-		};
-
-		uint8_t	pad[128];
-	};
-};
-
-BITMASK(UUID_FLASH_ONLY,	struct uuid_entry, flags, 0, 1);
-
 #include "journal.h"
 #include "stats.h"
 struct search;
@@ -868,12 +707,6 @@ static inline bool key_merging_disabled(struct cache_set *c)
 #endif
 }
 
-static inline bool SB_IS_BDEV(const struct cache_sb *sb)
-{
-	return sb->version == BCACHE_SB_VERSION_BDEV
-		|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
-}
-
 struct bbio {
 	unsigned		submit_time_us;
 	union {
@@ -927,59 +760,6 @@ static inline unsigned local_clock_us(void)
 #define prio_buckets(c)					\
 	DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
 
-#define JSET_MAGIC		0x245235c1a3625032ULL
-#define PSET_MAGIC		0x6750e15f87337f91ULL
-#define BSET_MAGIC		0x90135c78b99e07f5ULL
-
-#define jset_magic(c)		((c)->sb.set_magic ^ JSET_MAGIC)
-#define pset_magic(c)		((c)->sb.set_magic ^ PSET_MAGIC)
-#define bset_magic(c)		((c)->sb.set_magic ^ BSET_MAGIC)
-
-/* Bkey fields: all units are in sectors */
-
-#define KEY_FIELD(name, field, offset, size)				\
-	BITMASK(name, struct bkey, field, offset, size)
-
-#define PTR_FIELD(name, offset, size)					\
-	static inline uint64_t name(const struct bkey *k, unsigned i)	\
-	{ return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); }	\
-									\
-	static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
-	{								\
-		k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset);	\
-		k->ptr[i] |= v << offset;				\
-	}
-
-KEY_FIELD(KEY_PTRS,	high, 60, 3)
-KEY_FIELD(HEADER_SIZE,	high, 58, 2)
-KEY_FIELD(KEY_CSUM,	high, 56, 2)
-KEY_FIELD(KEY_PINNED,	high, 55, 1)
-KEY_FIELD(KEY_DIRTY,	high, 36, 1)
-
-KEY_FIELD(KEY_SIZE,	high, 20, 16)
-KEY_FIELD(KEY_INODE,	high, 0,  20)
-
-/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
-
-static inline uint64_t KEY_OFFSET(const struct bkey *k)
-{
-	return k->low;
-}
-
-static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
-{
-	k->low = v;
-}
-
-PTR_FIELD(PTR_DEV,		51, 12)
-PTR_FIELD(PTR_OFFSET,		8,  43)
-PTR_FIELD(PTR_GEN,		0,  8)
-
-#define PTR_CHECK_DEV		((1 << 12) - 1)
-
-#define PTR(gen, offset, dev)						\
-	((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen)
-
 static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
 {
 	return s >> c->bucket_bits;
@@ -1018,31 +798,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
 
 /* Btree key macros */
 
-/*
- * The high bit being set is a relic from when we used it to do binary
- * searches - it told you where a key started. It's not used anymore,
- * and can probably be safely dropped.
- */
-#define KEY(dev, sector, len)						\
-((struct bkey) {							\
-	.high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev),	\
-	.low = (sector)							\
-})
-
 static inline void bkey_init(struct bkey *k)
 {
-	*k = KEY(0, 0, 0);
+	*k = ZERO_KEY;
 }
 
-#define KEY_START(k)		(KEY_OFFSET(k) - KEY_SIZE(k))
-#define START_KEY(k)		KEY(KEY_INODE(k), KEY_START(k), 0)
-
-#define MAX_KEY_INODE		(~(~0 << 20))
-#define MAX_KEY_OFFSET		(((uint64_t) ~0) >> 1)
-#define MAX_KEY			KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
-
-#define ZERO_KEY		KEY(0, 0, 0)
-
 /*
  * This is used for various on disk data structures - cache_sb, prio_set, bset,
  * jset: The checksum is _always_ the first 8 bytes of these structs
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f7b5525ddafa..7b8713c66050 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -684,7 +684,7 @@ void bch_bset_init_next(struct btree *b)
 	} else
 		get_random_bytes(&i->seq, sizeof(uint64_t));
 
-	i->magic	= bset_magic(b->c);
+	i->magic	= bset_magic(&b->c->sb);
 	i->version	= 0;
 	i->keys		= 0;
 
@@ -1034,7 +1034,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
 		 * memcpy()
 		 */
 
-		out->magic	= bset_magic(b->c);
+		out->magic	= bset_magic(&b->c->sb);
 		out->seq	= b->sets[0].data->seq;
 		out->version	= b->sets[0].data->version;
 		swap(out, b->sets[0].data);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 8a9305685b7e..5cd90565dfe2 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -193,37 +193,6 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
 		: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
 }
 
-static inline size_t bkey_u64s(const struct bkey *k)
-{
-	BUG_ON(KEY_CSUM(k) > 1);
-	return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0);
-}
-
-static inline size_t bkey_bytes(const struct bkey *k)
-{
-	return bkey_u64s(k) * sizeof(uint64_t);
-}
-
-static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
-{
-	memcpy(dest, src, bkey_bytes(src));
-}
-
-static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
-{
-	if (!src)
-		src = &KEY(0, 0, 0);
-
-	SET_KEY_INODE(dest, KEY_INODE(src));
-	SET_KEY_OFFSET(dest, KEY_OFFSET(src));
-}
-
-static inline struct bkey *bkey_next(const struct bkey *k)
-{
-	uint64_t *d = (void *) k;
-	return (struct bkey *) (d + bkey_u64s(k));
-}
-
 /* Keylists */
 
 struct keylist {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f5aa4adadf1d..aba787d954e5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -231,7 +231,7 @@ static void bch_btree_node_read_done(struct btree *b)
 			goto err;
 
 		err = "bad magic";
-		if (i->magic != bset_magic(b->c))
+		if (i->magic != bset_magic(&b->c->sb))
 			goto err;
 
 		err = "bad checksum";
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 86de64a6bf26..ecdaa671bd50 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -74,7 +74,7 @@ reread:		left = ca->sb.bucket_size - offset;
 			struct list_head *where;
 			size_t blocks, bytes = set_bytes(j);
 
-			if (j->magic != jset_magic(ca->set))
+			if (j->magic != jset_magic(&ca->sb))
 				return ret;
 
 			if (bytes > left << 9)
@@ -596,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
 	for_each_cache(ca, c, i)
 		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
 
-	w->data->magic		= jset_magic(c);
+	w->data->magic		= jset_magic(&c->sb);
 	w->data->version	= BCACHE_JSET_VERSION;
 	w->data->last_seq	= last_seq(&c->journal);
 	w->data->csum		= csum_set(w->data);
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 5e9edb9ef376..a6472fda94b2 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -75,43 +75,6 @@
  * nodes that are pinning the oldest journal entries first.
  */
 
-#define BCACHE_JSET_VERSION_UUIDv1	1
-/* Always latest UUID format */
-#define BCACHE_JSET_VERSION_UUID	1
-#define BCACHE_JSET_VERSION		1
-
-/*
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
- */
-struct jset {
-	uint64_t		csum;
-	uint64_t		magic;
-	uint64_t		seq;
-	uint32_t		version;
-	uint32_t		keys;
-
-	uint64_t		last_seq;
-
-	BKEY_PADDED(uuid_bucket);
-	BKEY_PADDED(btree_root);
-	uint16_t		btree_level;
-	uint16_t		pad[3];
-
-	uint64_t		prio_bucket[MAX_CACHES_PER_SET];
-
-	union {
-		struct bkey	start[0];
-		uint64_t	d[0];
-	};
-};
-
 /*
  * Only used for holding the journal entries we read in btree_journal_read()
  * during cache_registration
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index cf7850a7592c..932300f18973 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -264,16 +264,17 @@ static void bch_data_invalidate(struct closure *cl)
 		 bio_sectors(bio), (uint64_t) bio->bi_sector);
 
 	while (bio_sectors(bio)) {
-		unsigned len = min(bio_sectors(bio), 1U << 14);
+		unsigned sectors = min(bio_sectors(bio),
+				       1U << (KEY_SIZE_BITS - 1));
 
 		if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
 			goto out;
 
-		bio->bi_sector	+= len;
-		bio->bi_size	-= len << 9;
+		bio->bi_sector	+= sectors;
+		bio->bi_size	-= sectors << 9;
 
 		bch_keylist_add(&op->insert_keys,
-				&KEY(op->inode, bio->bi_sector, len));
+				&KEY(op->inode, bio->bi_sector, sectors));
 	}
 
 	op->insert_data_done = true;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index a314c771263f..c67d19a8913d 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -45,15 +45,6 @@ const char * const bch_cache_modes[] = {
 	NULL
 };
 
-struct uuid_entry_v0 {
-	uint8_t		uuid[16];
-	uint8_t		label[32];
-	uint32_t	first_reg;
-	uint32_t	last_reg;
-	uint32_t	invalidated;
-	uint32_t	pad;
-};
-
 static struct kobject *bcache_kobj;
 struct mutex bch_register_lock;
 LIST_HEAD(bch_cache_sets);
@@ -562,7 +553,7 @@ void bch_prio_write(struct cache *ca)
 		}
 
 		p->next_bucket	= ca->prio_buckets[i + 1];
-		p->magic	= pset_magic(ca);
+		p->magic	= pset_magic(&ca->sb);
 		p->csum		= bch_crc64(&p->magic, bucket_bytes(ca) - 8);
 
 		bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
@@ -613,7 +604,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
 				pr_warn("bad csum reading priorities");
 
-			if (p->magic != pset_magic(ca))
+			if (p->magic != pset_magic(&ca->sb))
 				pr_warn("bad magic reading priorities");
 
 			bucket = p->next_bucket;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ea345c6896f4..38ae7a4ce928 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -27,16 +27,6 @@ struct closure;
 
 #endif
 
-#define BITMASK(name, type, field, offset, size)		\
-static inline uint64_t name(const type *k)			\
-{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); }	\
-								\
-static inline void SET_##name(type *k, uint64_t v)		\
-{								\
-	k->field &= ~(~((uint64_t) ~0 << size) << offset);	\
-	k->field |= v << offset;				\
-}
-
 #define DECLARE_HEAP(type, name)					\
 	struct {							\
 		size_t size, used;					\
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
new file mode 100644
index 000000000000..164a7e263988
--- /dev/null
+++ b/include/uapi/linux/bcache.h
@@ -0,0 +1,373 @@
+#ifndef _LINUX_BCACHE_H
+#define _LINUX_BCACHE_H
+
+/*
+ * Bcache on disk data structures
+ */
+
+#include <asm/types.h>
+
+#define BITMASK(name, type, field, offset, size)		\
+static inline __u64 name(const type *k)				\
+{ return (k->field >> offset) & ~(~0ULL << size); }		\
+								\
+static inline void SET_##name(type *k, __u64 v)			\
+{								\
+	k->field &= ~(~(~0ULL << size) << offset);		\
+	k->field |= (v & ~(~0ULL << size)) << offset;		\
+}
+
+/* Btree keys - all units are in sectors */
+
+struct bkey {
+	__u64	high;
+	__u64	low;
+	__u64	ptr[];
+};
+
+#define KEY_FIELD(name, field, offset, size)				\
+	BITMASK(name, struct bkey, field, offset, size)
+
+#define PTR_FIELD(name, offset, size)					\
+static inline __u64 name(const struct bkey *k, unsigned i)		\
+{ return (k->ptr[i] >> offset) & ~(~0ULL << size); }			\
+									\
+static inline void SET_##name(struct bkey *k, unsigned i, __u64 v)	\
+{									\
+	k->ptr[i] &= ~(~(~0ULL << size) << offset);			\
+	k->ptr[i] |= (v & ~(~0ULL << size)) << offset;			\
+}
+
+#define KEY_SIZE_BITS		16
+
+KEY_FIELD(KEY_PTRS,	high, 60, 3)
+KEY_FIELD(HEADER_SIZE,	high, 58, 2)
+KEY_FIELD(KEY_CSUM,	high, 56, 2)
+KEY_FIELD(KEY_PINNED,	high, 55, 1)
+KEY_FIELD(KEY_DIRTY,	high, 36, 1)
+
+KEY_FIELD(KEY_SIZE,	high, 20, KEY_SIZE_BITS)
+KEY_FIELD(KEY_INODE,	high, 0,  20)
+
+/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
+
+static inline __u64 KEY_OFFSET(const struct bkey *k)
+{
+	return k->low;
+}
+
+static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v)
+{
+	k->low = v;
+}
+
+/*
+ * The high bit being set is a relic from when we used it to do binary
+ * searches - it told you where a key started. It's not used anymore,
+ * and can probably be safely dropped.
+ */
+#define KEY(inode, offset, size)					\
+((struct bkey) {							\
+	.high = (1ULL << 63) | ((__u64) (size) << 20) | (inode),	\
+	.low = (offset)							\
+})
+
+#define ZERO_KEY			KEY(0, 0, 0)
+
+#define MAX_KEY_INODE			(~(~0 << 20))
+#define MAX_KEY_OFFSET			(~0ULL >> 1)
+#define MAX_KEY				KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
+
+#define KEY_START(k)			(KEY_OFFSET(k) - KEY_SIZE(k))
+#define START_KEY(k)			KEY(KEY_INODE(k), KEY_START(k), 0)
+
+#define PTR_DEV_BITS			12
+
+PTR_FIELD(PTR_DEV,			51, PTR_DEV_BITS)
+PTR_FIELD(PTR_OFFSET,			8,  43)
+PTR_FIELD(PTR_GEN,			0,  8)
+
+#define PTR_CHECK_DEV			((1 << PTR_DEV_BITS) - 1)
+
+#define PTR(gen, offset, dev)						\
+	((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
+
+/* Bkey utility code */
+
+static inline unsigned long bkey_u64s(const struct bkey *k)
+{
+	return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k);
+}
+
+static inline unsigned long bkey_bytes(const struct bkey *k)
+{
+	return bkey_u64s(k) * sizeof(__u64);
+}
+
+#define bkey_copy(_dest, _src)	memcpy(_dest, _src, bkey_bytes(_src))
+
+static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
+{
+	SET_KEY_INODE(dest, KEY_INODE(src));
+	SET_KEY_OFFSET(dest, KEY_OFFSET(src));
+}
+
+static inline struct bkey *bkey_next(const struct bkey *k)
+{
+	__u64 *d = (void *) k;
+	return (struct bkey *) (d + bkey_u64s(k));
+}
+
+static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys)
+{
+	__u64 *d = (void *) k;
+	return (struct bkey *) (d + nr_keys);
+}
+/* Enough for a key with 6 pointers */
+#define BKEY_PAD		8
+
+#define BKEY_PADDED(key)					\
+	union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; }
+
+/* Superblock */
+
+/* Version 0: Cache device
+ * Version 1: Backing device
+ * Version 2: Seed pointer into btree node checksum
+ * Version 3: Cache device with new UUID format
+ * Version 4: Backing device with data offset
+ */
+#define BCACHE_SB_VERSION_CDEV		0
+#define BCACHE_SB_VERSION_BDEV		1
+#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
+#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
+#define BCACHE_SB_MAX_VERSION		4
+
+#define SB_SECTOR			8
+#define SB_SIZE				4096
+#define SB_LABEL_SIZE			32
+#define SB_JOURNAL_BUCKETS		256U
+/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
+#define MAX_CACHES_PER_SET		8
+
+#define BDEV_DATA_START_DEFAULT		16	/* sectors */
+
+struct cache_sb {
+	__u64			csum;
+	__u64			offset;	/* sector where this sb was written */
+	__u64			version;
+
+	__u8			magic[16];
+
+	__u8			uuid[16];
+	union {
+		__u8		set_uuid[16];
+		__u64		set_magic;
+	};
+	__u8			label[SB_LABEL_SIZE];
+
+	__u64			flags;
+	__u64			seq;
+	__u64			pad[8];
+
+	union {
+	struct {
+		/* Cache devices */
+		__u64		nbuckets;	/* device size */
+
+		__u16		block_size;	/* sectors */
+		__u16		bucket_size;	/* sectors */
+
+		__u16		nr_in_set;
+		__u16		nr_this_dev;
+	};
+	struct {
+		/* Backing devices */
+		__u64		data_offset;
+
+		/*
+		 * block_size from the cache device section is still used by
+		 * backing devices, so don't add anything here until we fix
+		 * things to not need it for backing devices anymore
+		 */
+	};
+	};
+
+	__u32			last_mount;	/* time_t */
+
+	__u16			first_bucket;
+	union {
+		__u16		njournal_buckets;
+		__u16		keys;
+	};
+	__u64			d[SB_JOURNAL_BUCKETS];	/* journal buckets */
+};
+
+static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
+{
+	return sb->version == BCACHE_SB_VERSION_BDEV
+		|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
+}
+
+BITMASK(CACHE_SYNC,			struct cache_sb, flags, 0, 1);
+BITMASK(CACHE_DISCARD,			struct cache_sb, flags, 1, 1);
+BITMASK(CACHE_REPLACEMENT,		struct cache_sb, flags, 2, 3);
+#define CACHE_REPLACEMENT_LRU		0U
+#define CACHE_REPLACEMENT_FIFO		1U
+#define CACHE_REPLACEMENT_RANDOM	2U
+
+BITMASK(BDEV_CACHE_MODE,		struct cache_sb, flags, 0, 4);
+#define CACHE_MODE_WRITETHROUGH		0U
+#define CACHE_MODE_WRITEBACK		1U
+#define CACHE_MODE_WRITEAROUND		2U
+#define CACHE_MODE_NONE			3U
+BITMASK(BDEV_STATE,			struct cache_sb, flags, 61, 2);
+#define BDEV_STATE_NONE			0U
+#define BDEV_STATE_CLEAN		1U
+#define BDEV_STATE_DIRTY		2U
+#define BDEV_STATE_STALE		3U
+
+/*
+ * Magic numbers
+ *
+ * The various other data structures have their own magic numbers, which are
+ * xored with the first part of the cache set's UUID
+ */
+
+#define JSET_MAGIC			0x245235c1a3625032ULL
+#define PSET_MAGIC			0x6750e15f87337f91ULL
+#define BSET_MAGIC			0x90135c78b99e07f5ULL
+
+static inline __u64 jset_magic(struct cache_sb *sb)
+{
+	return sb->set_magic ^ JSET_MAGIC;
+}
+
+static inline __u64 pset_magic(struct cache_sb *sb)
+{
+	return sb->set_magic ^ PSET_MAGIC;
+}
+
+static inline __u64 bset_magic(struct cache_sb *sb)
+{
+	return sb->set_magic ^ BSET_MAGIC;
+}
+
+/*
+ * Journal
+ *
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
+
+#define BCACHE_JSET_VERSION_UUIDv1	1
+#define BCACHE_JSET_VERSION_UUID	1	/* Always latest UUID format */
+#define BCACHE_JSET_VERSION		1
+
+struct jset {
+	__u64			csum;
+	__u64			magic;
+	__u64			seq;
+	__u32			version;
+	__u32			keys;
+
+	__u64			last_seq;
+
+	BKEY_PADDED(uuid_bucket);
+	BKEY_PADDED(btree_root);
+	__u16			btree_level;
+	__u16			pad[3];
+
+	__u64			prio_bucket[MAX_CACHES_PER_SET];
+
+	union {
+		struct bkey	start[0];
+		__u64		d[0];
+	};
+};
+
+/* Bucket prios/gens */
+
+struct prio_set {
+	__u64			csum;
+	__u64			magic;
+	__u64			seq;
+	__u32			version;
+	__u32			pad;
+
+	__u64			next_bucket;
+
+	struct bucket_disk {
+		__u16		prio;
+		__u8		gen;
+	} __attribute((packed)) data[];
+};
+
+/* UUIDS - per backing device/flash only volume metadata */
+
+struct uuid_entry {
+	union {
+		struct {
+			__u8	uuid[16];
+			__u8	label[32];
+			__u32	first_reg;
+			__u32	last_reg;
+			__u32	invalidated;
+
+			__u32	flags;
+			/* Size of flash only volumes */
+			__u64	sectors;
+		};
+
+		__u8		pad[128];
+	};
+};
+
+BITMASK(UUID_FLASH_ONLY,	struct uuid_entry, flags, 0, 1);
+
+/* Btree nodes */
+
+/* Version 1: Seed pointer into btree node checksum
+ */
+#define BCACHE_BSET_CSUM		1
+#define BCACHE_BSET_VERSION		1
+
+/*
+ * Btree nodes
+ *
+ * On disk a btree node is a list/log of these; within each set the keys are
+ * sorted
+ */
+struct bset {
+	__u64			csum;
+	__u64			magic;
+	__u64			seq;
+	__u32			version;
+	__u32			keys;
+
+	union {
+		struct bkey	start[0];
+		__u64		d[0];
+	};
+};
+
+/* OBSOLETE */
+
+/* UUIDS - per backing device/flash only volume metadata */
+
+struct uuid_entry_v0 {
+	__u8		uuid[16];
+	__u8		label[32];
+	__u32		first_reg;
+	__u32		last_reg;
+	__u32		invalidated;
+	__u32		pad;
+};
+
+#endif /* _LINUX_BCACHE_H */
-- 
cgit v1.2.3


From 38e9efcdb33270b4da72143d8e7ca4dcf7f0989b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 11 Nov 2013 12:20:35 +0100
Subject: random32: move rnd_state to linux/random.h

struct rnd_state got mistakenly pulled into uapi header. It is not
used anywhere and does also not belong there!

Commit 5960164fde ("lib/random32: export pseudo-random number
generator for modules"), the last commit on rnd_state before it
got moved to uapi, says:

  This patch moves the definition of struct rnd_state and the inline
  __seed() function to linux/random.h.  It renames the static __random32()
  function to prandom32() and exports it for use in modules.

Hence, the structure was moved from lib/random32.c to linux/random.h
so that it can be used within modules (FCoE-related code in this
case), but not from user space. However, it seems to have been
mistakenly moved to uapi header through the uapi script. Since no-one
should make use of it from the linux headers, move the structure back
to the kernel for internal use, so that it can be modified on demand.

Joint work with Hannes Frederic Sowa.

Cc: Joe Eykholt <jeykholt@cisco.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/random.h      | 4 ++++
 include/uapi/linux/random.h | 7 -------
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/random.h b/include/linux/random.h
index 5117ae348fe8..8ef0b70bd1f9 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -31,6 +31,10 @@ void prandom_bytes(void *buf, int nbytes);
 void prandom_seed(u32 seed);
 void prandom_reseed_late(void);
 
+struct rnd_state {
+	__u32 s1, s2, s3;
+};
+
 u32 prandom_u32_state(struct rnd_state *);
 void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes);
 
diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h
index 7471b5b3b8ba..fff3528a078f 100644
--- a/include/uapi/linux/random.h
+++ b/include/uapi/linux/random.h
@@ -40,11 +40,4 @@ struct rand_pool_info {
 	__u32	buf[0];
 };
 
-struct rnd_state {
-	__u32 s1, s2, s3;
-};
-
-/* Exported functions */
-
-
 #endif /* _UAPI_LINUX_RANDOM_H */
-- 
cgit v1.2.3


From 294e30fee35d3151d100cfe59e839c2dbc16a374 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 9 Oct 2013 12:00:56 -0400
Subject: Btrfs: add tests for find_lock_delalloc_range

So both Liu and I made huge messes of find_lock_delalloc_range trying to fix
stuff, me first by fixing extent size, then him by fixing something I broke and
then me again telling him to fix it a different way.  So this is obviously a
candidate for some testing.  This patch adds a pseudo fs so we can allocate fake
inodes for tests that need an inode or pages.  Then it addes a bunch of tests to
make sure find_lock_delalloc_range is acting the way it is supposed to.  With
this patch and all of our previous patches to find_lock_delalloc_range I am sure
it is working as expected now.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/Makefile                |   2 +-
 fs/btrfs/ctree.h                 |   6 +
 fs/btrfs/extent_io.c             |   9 +-
 fs/btrfs/extent_io.h             |   6 +
 fs/btrfs/super.c                 |  14 +-
 fs/btrfs/tests/btrfs-tests.c     |  68 ++++++++++
 fs/btrfs/tests/btrfs-tests.h     |  15 +++
 fs/btrfs/tests/extent-io-tests.c | 276 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/magic.h       |   2 +-
 9 files changed, 389 insertions(+), 9 deletions(-)
 create mode 100644 fs/btrfs/tests/btrfs-tests.c
 create mode 100644 fs/btrfs/tests/extent-io-tests.c

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 4c7dfbfaa3b3..cac4f2d001a3 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -15,4 +15,4 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
-	tests/extent-buffer-tests.o
+	tests/extent-buffer-tests.o tests/btrfs-tests.o tests/extent-io-tests.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ca15a84cc5a..2f398062b942 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -47,6 +47,12 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define STATIC noinline
+#else
+#define STATIC static noinline
+#endif
+
 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
 #define BTRFS_MAX_MIRRORS 3
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2bf6f46fae44..c10291cc4fd1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1598,11 +1598,10 @@ done:
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct inode *inode,
-					     struct extent_io_tree *tree,
-					     struct page *locked_page,
-					     u64 *start, u64 *end,
-					     u64 max_bytes)
+STATIC u64 find_lock_delalloc_range(struct inode *inode,
+				    struct extent_io_tree *tree,
+				    struct page *locked_page, u64 *start,
+				    u64 *end, u64 max_bytes)
 {
 	u64 delalloc_start;
 	u64 delalloc_end;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6dbc645f1f3d..f98602eac808 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -345,4 +345,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+noinline u64 find_lock_delalloc_range(struct inode *inode,
+				      struct extent_io_tree *tree,
+				      struct page *locked_page, u64 *start,
+				      u64 *end, u64 max_bytes);
+#endif
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0e398657d759..78041e34d15a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1791,10 +1791,20 @@ static int btrfs_run_sanity_tests(void)
 {
 	int ret;
 
-	ret = btrfs_test_free_space_cache();
+	ret = btrfs_init_test_fs();
 	if (ret)
 		return ret;
-	return btrfs_test_extent_buffer_operations();
+
+	ret = btrfs_test_free_space_cache();
+	if (ret)
+		goto out;
+	ret = btrfs_test_extent_buffer_operations();
+	if (ret)
+		goto out;
+	ret = btrfs_test_extent_io();
+out:
+	btrfs_destroy_test_fs();
+	return ret;
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
new file mode 100644
index 000000000000..697d527377c1
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+
+static struct vfsmount *test_mnt = NULL;
+
+static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
+				       int flags, const char *dev_name,
+				       void *data)
+{
+	return mount_pseudo(fs_type, "btrfs_test:", NULL, NULL, BTRFS_TEST_MAGIC);
+}
+
+static struct file_system_type test_type = {
+	.name		= "btrfs_test_fs",
+	.mount		= btrfs_test_mount,
+	.kill_sb	= kill_anon_super,
+};
+
+struct inode *btrfs_new_test_inode(void)
+{
+	return new_inode(test_mnt->mnt_sb);
+}
+
+int btrfs_init_test_fs(void)
+{
+	int ret;
+
+	ret = register_filesystem(&test_type);
+	if (ret) {
+		printk(KERN_ERR "btrfs: cannot register test file system\n");
+		return ret;
+	}
+
+	test_mnt = kern_mount(&test_type);
+	if (IS_ERR(test_mnt)) {
+		printk(KERN_ERR "btrfs: cannot mount test file system\n");
+		unregister_filesystem(&test_type);
+		return ret;
+	}
+	return 0;
+}
+
+void btrfs_destroy_test_fs(void)
+{
+	kern_unmount(test_mnt);
+	unregister_filesystem(&test_type);
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 04f2cd2ca568..e935fe5291e7 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -25,6 +25,10 @@
 
 int btrfs_test_free_space_cache(void);
 int btrfs_test_extent_buffer_operations(void);
+int btrfs_test_extent_io(void);
+int btrfs_init_test_fs(void);
+void btrfs_destroy_test_fs(void);
+struct inode *btrfs_new_test_inode(void);
 #else
 static inline int btrfs_test_free_space_cache(void)
 {
@@ -34,6 +38,17 @@ static inline int btrfs_test_extent_buffer_operations(void)
 {
 	return 0;
 }
+static inline int btrfs_init_test_fs(void)
+{
+	return 0;
+}
+static inline void btrfs_destroy_test_fs(void)
+{
+}
+static inline int btrfs_test_extent_io(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
new file mode 100644
index 000000000000..7e99c2f98dd0
--- /dev/null
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include "btrfs-tests.h"
+#include "../extent_io.h"
+
+#define PROCESS_UNLOCK		(1 << 0)
+#define PROCESS_RELEASE		(1 << 1)
+#define PROCESS_TEST_LOCKED	(1 << 2)
+
+static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
+				       unsigned long flags)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int count = 0;
+	int loops = 0;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long, nr_pages,
+				     ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (flags & PROCESS_TEST_LOCKED &&
+			    !PageLocked(pages[i]))
+				count++;
+			if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+			if (flags & PROCESS_RELEASE)
+				page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+		loops++;
+		if (loops > 100000) {
+			printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+			break;
+		}
+	}
+	return count;
+}
+
+static int test_find_delalloc(void)
+{
+	struct inode *inode;
+	struct extent_io_tree tmp;
+	struct page *page;
+	struct page *locked_page = NULL;
+	unsigned long index = 0;
+	u64 total_dirty = 256 * 1024 * 1024;
+	u64 max_bytes = 128 * 1024 * 1024;
+	u64 start, end, test_start;
+	u64 found;
+	int ret = -EINVAL;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Failed to allocate test inode\n");
+		return -ENOMEM;
+	}
+
+	extent_io_tree_init(&tmp, &inode->i_data);
+
+	/*
+	 * First go through and create and mark all of our pages dirty, we pin
+	 * everything to make sure our pages don't get evicted and screw up our
+	 * test.
+	 */
+	for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			test_msg("Failed to allocate test page\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+		SetPageDirty(page);
+		if (index) {
+			unlock_page(page);
+		} else {
+			page_cache_get(page);
+			locked_page = page;
+		}
+	}
+
+	/* Test this scenario
+	 * |--- delalloc ---|
+	 * |---  search  ---|
+	 */
+	set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+	start = 0;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Should have found at least one delalloc\n");
+		goto out_bits;
+	}
+	if (start != 0 || end != 4095) {
+		test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
+			 start, end);
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+	unlock_page(locked_page);
+	page_cache_release(locked_page);
+
+	/*
+	 * Test this scenario
+	 *
+	 * |--- delalloc ---|
+	 *           |--- search ---|
+	 */
+	test_start = 64 * 1024 * 1024;
+	locked_page = find_lock_page(inode->i_mapping,
+				     test_start >> PAGE_CACHE_SHIFT);
+	if (!locked_page) {
+		test_msg("Couldn't find the locked page\n");
+		goto out_bits;
+	}
+	set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Couldn't find delalloc in our range\n");
+		goto out_bits;
+	}
+	if (start != test_start || end != max_bytes - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu, end "
+			 "%Lu\n", test_start, max_bytes - 1, start, end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end,
+			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+		test_msg("There were unlocked pages in the range\n");
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+	/* locked_page was unlocked above */
+	page_cache_release(locked_page);
+
+	/*
+	 * Test this scenario
+	 * |--- delalloc ---|
+	 *                    |--- search ---|
+	 */
+	test_start = max_bytes + 4096;
+	locked_page = find_lock_page(inode->i_mapping, test_start >>
+				     PAGE_CACHE_SHIFT);
+	if (!locked_page) {
+		test_msg("Could'nt find the locked page\n");
+		goto out_bits;
+	}
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (found) {
+		test_msg("Found range when we shouldn't have\n");
+		goto out_bits;
+	}
+	if (end != (u64)-1) {
+		test_msg("Did not return the proper end offset\n");
+		goto out_bits;
+	}
+
+	/*
+	 * Test this scenario
+	 * [------- delalloc -------|
+	 * [max_bytes]|-- search--|
+	 *
+	 * We are re-using our test_start from above since it works out well.
+	 */
+	set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Didn't find our range\n");
+		goto out_bits;
+	}
+	if (start != test_start || end != total_dirty - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+			 test_start, total_dirty - 1, start, end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end,
+			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+		test_msg("Pages in range were not all locked\n");
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+
+	/*
+	 * Now to test where we run into a page that is no longer dirty in the
+	 * range we want to find.
+	 */
+	page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
+			     >> PAGE_CACHE_SHIFT);
+	if (!page) {
+		test_msg("Couldn't find our page\n");
+		goto out_bits;
+	}
+	ClearPageDirty(page);
+	page_cache_release(page);
+
+	/* We unlocked it in the previous test */
+	lock_page(locked_page);
+	start = test_start;
+	end = 0;
+	/*
+	 * Currently if we fail to find dirty pages in the delalloc range we
+	 * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search.  If
+	 * this changes at any point in the future we will need to fix this
+	 * tests expected behavior.
+	 */
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Didn't find our range\n");
+		goto out_bits;
+	}
+	if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+			 test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+			 end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
+			       PROCESS_UNLOCK)) {
+		test_msg("Pages in range were not all locked\n");
+		goto out_bits;
+	}
+	ret = 0;
+out_bits:
+	clear_extent_bits(&tmp, 0, total_dirty - 1,
+			  (unsigned long)-1, GFP_NOFS);
+out:
+	if (locked_page)
+		page_cache_release(locked_page);
+	process_page_range(inode, 0, total_dirty - 1,
+			   PROCESS_UNLOCK | PROCESS_RELEASE);
+	iput(inode);
+	return ret;
+}
+
+int btrfs_test_extent_io(void)
+{
+	test_msg("Running find delalloc tests\n");
+	return test_find_delalloc();
+}
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 2944278a8ba7..77c60311a6c6 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -71,6 +71,6 @@
 #define USBDEVICE_SUPER_MAGIC	0x9fa2
 #define MTD_INODE_FS_MAGIC      0x11307854
 #define ANON_INODE_FS_MAGIC	0x09041934
-
+#define BTRFS_TEST_MAGIC	0x73727279
 
 #endif /* __LINUX_MAGIC_H__ */
-- 
cgit v1.2.3


From f7625980f5820edd1a73536e1a03bcbc1f889fec Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 14 Nov 2013 11:28:18 -0700
Subject: PCI: Fix whitespace, capitalization, and spelling errors

Fix whitespace, capitalization, and spelling errors.  No functional change.
I know "busses" is not an error, but "buses" was more common, so I used it
consistently.

Signed-off-by: Marta Rybczynska <rybczynska@gmail.com> (pci_reset_bridge_secondary_bus())
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pci/ats.c                       |   2 +-
 drivers/pci/host/pci-tegra.c            |  12 ++--
 drivers/pci/host/pcie-designware.c      |   2 +-
 drivers/pci/hotplug/Kconfig             |   4 +-
 drivers/pci/hotplug/Makefile            |   2 +-
 drivers/pci/hotplug/acpiphp_core.c      |  12 ++--
 drivers/pci/hotplug/acpiphp_glue.c      |   2 +-
 drivers/pci/hotplug/acpiphp_ibm.c       |  14 ++--
 drivers/pci/hotplug/cpci_hotplug_core.c |   2 +-
 drivers/pci/hotplug/cpci_hotplug_pci.c  |   2 +-
 drivers/pci/hotplug/cpcihp_generic.c    |  20 +++---
 drivers/pci/hotplug/cpcihp_zt5550.c     |  22 +++----
 drivers/pci/hotplug/cpcihp_zt5550.h     |  18 +++---
 drivers/pci/hotplug/cpqphp_core.c       |   4 +-
 drivers/pci/hotplug/cpqphp_ctrl.c       |  10 +--
 drivers/pci/hotplug/cpqphp_pci.c        |   5 +-
 drivers/pci/hotplug/ibmphp.h            |  12 ++--
 drivers/pci/hotplug/ibmphp_core.c       | 109 ++++++++++++++++----------------
 drivers/pci/hotplug/ibmphp_ebda.c       | 103 +++++++++++++++---------------
 drivers/pci/hotplug/ibmphp_hpc.c        |  24 +++----
 drivers/pci/hotplug/ibmphp_pci.c        |  71 ++++++++++-----------
 drivers/pci/hotplug/ibmphp_res.c        |  76 +++++++++++-----------
 drivers/pci/hotplug/pci_hotplug_core.c  |   8 +--
 drivers/pci/hotplug/pciehp.h            |   2 +-
 drivers/pci/hotplug/pciehp_acpi.c       |   2 +-
 drivers/pci/hotplug/pciehp_core.c       |   4 +-
 drivers/pci/hotplug/pciehp_hpc.c        |   6 +-
 drivers/pci/hotplug/pcihp_skeleton.c    |   8 +--
 drivers/pci/hotplug/rpadlpar_core.c     |   2 +-
 drivers/pci/hotplug/rpaphp.h            |   6 +-
 drivers/pci/hotplug/rpaphp_core.c       |   8 +--
 drivers/pci/hotplug/rpaphp_pci.c        |   3 +-
 drivers/pci/hotplug/rpaphp_slot.c       |  19 +++---
 drivers/pci/hotplug/shpchp.h            |   8 +--
 drivers/pci/hotplug/shpchp_core.c       |  10 +--
 drivers/pci/hotplug/shpchp_hpc.c        |   2 +-
 drivers/pci/iov.c                       |   2 +-
 drivers/pci/irq.c                       |   2 +-
 drivers/pci/msi.c                       |   2 +-
 drivers/pci/pci-acpi.c                  |   2 +-
 drivers/pci/pci-driver.c                |  16 ++---
 drivers/pci/pci-stub.c                  |   4 +-
 drivers/pci/pci-sysfs.c                 |  28 ++++----
 drivers/pci/pci.c                       |  46 +++++++-------
 drivers/pci/pcie/aer/aerdrv_core.c      |   2 +-
 drivers/pci/pcie/aspm.c                 |   2 +-
 drivers/pci/pcie/pme.c                  |   4 +-
 drivers/pci/pcie/portdrv.h              |   2 +-
 drivers/pci/pcie/portdrv_bus.c          |   4 +-
 drivers/pci/pcie/portdrv_core.c         |   2 +-
 drivers/pci/pcie/portdrv_pci.c          |   6 +-
 drivers/pci/probe.c                     |  10 +--
 drivers/pci/proc.c                      |   2 +-
 drivers/pci/quirks.c                    | 104 +++++++++++++++---------------
 drivers/pci/remove.c                    |   2 +-
 drivers/pci/search.c                    |  12 ++--
 drivers/pci/setup-bus.c                 |  18 +++---
 drivers/pci/setup-res.c                 |   2 +-
 drivers/pci/slot.c                      |   2 +-
 drivers/pci/syscall.c                   |   2 +-
 include/linux/msi.h                     |  10 +--
 include/linux/pci.h                     |  55 ++++++++--------
 include/linux/pci_hotplug.h             |   5 +-
 include/linux/pcieport_if.h             |   2 +-
 include/uapi/linux/pci_regs.h           |  72 ++++++++++-----------
 65 files changed, 518 insertions(+), 520 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 95655d7c0d0b..e52d7ffa38b9 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -410,7 +410,7 @@ EXPORT_SYMBOL_GPL(pci_disable_pasid);
  * Otherwise is returns a bitmask with supported features. Current
  * features reported are:
  * PCI_PASID_CAP_EXEC - Execute permission supported
- * PCI_PASID_CAP_PRIV - Priviledged mode supported
+ * PCI_PASID_CAP_PRIV - Privileged mode supported
  */
 int pci_pasid_features(struct pci_dev *pdev)
 {
diff --git a/drivers/pci/host/pci-tegra.c b/drivers/pci/host/pci-tegra.c
index 7c4f38dd42ba..0afbbbc55c81 100644
--- a/drivers/pci/host/pci-tegra.c
+++ b/drivers/pci/host/pci-tegra.c
@@ -249,7 +249,7 @@ struct tegra_pcie {
 	void __iomem *afi;
 	int irq;
 
-	struct list_head busses;
+	struct list_head buses;
 	struct resource *cs;
 
 	struct resource io;
@@ -399,14 +399,14 @@ free:
 
 /*
  * Look up a virtual address mapping for the specified bus number. If no such
- * mapping existis, try to create one.
+ * mapping exists, try to create one.
  */
 static void __iomem *tegra_pcie_bus_map(struct tegra_pcie *pcie,
 					unsigned int busnr)
 {
 	struct tegra_pcie_bus *bus;
 
-	list_for_each_entry(bus, &pcie->busses, list)
+	list_for_each_entry(bus, &pcie->buses, list)
 		if (bus->nr == busnr)
 			return (void __iomem *)bus->area->addr;
 
@@ -414,7 +414,7 @@ static void __iomem *tegra_pcie_bus_map(struct tegra_pcie *pcie,
 	if (IS_ERR(bus))
 		return NULL;
 
-	list_add_tail(&bus->list, &pcie->busses);
+	list_add_tail(&bus->list, &pcie->buses);
 
 	return (void __iomem *)bus->area->addr;
 }
@@ -808,7 +808,7 @@ static int tegra_pcie_enable_controller(struct tegra_pcie *pcie)
 	value &= ~AFI_FUSE_PCIE_T0_GEN2_DIS;
 	afi_writel(pcie, value, AFI_FUSE);
 
-	/* initialze internal PHY, enable up to 16 PCIE lanes */
+	/* initialize internal PHY, enable up to 16 PCIE lanes */
 	pads_writel(pcie, 0x0, PADS_CTL_SEL);
 
 	/* override IDDQ to 1 on all 4 lanes */
@@ -1624,7 +1624,7 @@ static int tegra_pcie_probe(struct platform_device *pdev)
 	if (!pcie)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&pcie->busses);
+	INIT_LIST_HEAD(&pcie->buses);
 	INIT_LIST_HEAD(&pcie->ports);
 	pcie->soc_data = match->data;
 	pcie->dev = &pdev->dev;
diff --git a/drivers/pci/host/pcie-designware.c b/drivers/pci/host/pcie-designware.c
index 1e1fea4d959b..e33b68be0391 100644
--- a/drivers/pci/host/pcie-designware.c
+++ b/drivers/pci/host/pcie-designware.c
@@ -197,7 +197,7 @@ static int find_valid_pos0(struct pcie_port *pp, int msgvec, int pos, int *pos0)
 			return -ENOSPC;
 		/*
 		 * Check if this position is at correct offset.nvec is always a
-		 * power of two. pos0 must be nvec bit alligned.
+		 * power of two. pos0 must be nvec bit aligned.
 		 */
 		if (pos % msgvec)
 			pos += msgvec - (pos % msgvec);
diff --git a/drivers/pci/hotplug/Kconfig b/drivers/pci/hotplug/Kconfig
index 0a648af89531..df8caec59789 100644
--- a/drivers/pci/hotplug/Kconfig
+++ b/drivers/pci/hotplug/Kconfig
@@ -133,8 +133,8 @@ config HOTPLUG_PCI_RPA_DLPAR
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called rpadlpar_io.
- 
- 	  When in doubt, say N.
+
+	  When in doubt, say N.
 
 config HOTPLUG_PCI_SGI
 	tristate "SGI PCI Hotplug Support"
diff --git a/drivers/pci/hotplug/Makefile b/drivers/pci/hotplug/Makefile
index 47ec8c80e16d..3e6532b945c1 100644
--- a/drivers/pci/hotplug/Makefile
+++ b/drivers/pci/hotplug/Makefile
@@ -31,7 +31,7 @@ pci_hotplug-objs	+=	cpci_hotplug_core.o	\
 				cpci_hotplug_pci.o
 endif
 ifdef CONFIG_ACPI
-pci_hotplug-objs 	+= 	acpi_pcihp.o
+pci_hotplug-objs	+=	acpi_pcihp.o
 endif
 
 cpqphp-objs		:=	cpqphp_core.o	\
diff --git a/drivers/pci/hotplug/acpiphp_core.c b/drivers/pci/hotplug/acpiphp_core.c
index 8650d39db392..dca66bc44578 100644
--- a/drivers/pci/hotplug/acpiphp_core.c
+++ b/drivers/pci/hotplug/acpiphp_core.c
@@ -111,7 +111,7 @@ int acpiphp_register_attention(struct acpiphp_attention_info *info)
  * @info: must match the pointer used to register
  *
  * Description: This is used to un-register a hardware specific acpi
- * driver that manipulates the attention LED.  The pointer to the 
+ * driver that manipulates the attention LED.  The pointer to the
  * info struct must be the same as the one used to set it.
  */
 int acpiphp_unregister_attention(struct acpiphp_attention_info *info)
@@ -169,8 +169,8 @@ static int disable_slot(struct hotplug_slot *hotplug_slot)
  * was registered with us.  This allows hardware specific
  * ACPI implementations to blink the light for us.
  */
- static int set_attention_status(struct hotplug_slot *hotplug_slot, u8 status)
- {
+static int set_attention_status(struct hotplug_slot *hotplug_slot, u8 status)
+{
 	int retval = -ENODEV;
 
 	pr_debug("%s - physical_slot = %s\n", __func__,
@@ -182,8 +182,8 @@ static int disable_slot(struct hotplug_slot *hotplug_slot)
 	} else
 		attention_info = NULL;
 	return retval;
- }
- 
+}
+
 
 /**
  * get_power_status - get power status of a slot
@@ -323,7 +323,7 @@ int acpiphp_register_hotplug_slot(struct acpiphp_slot *acpiphp_slot,
 	if (retval) {
 		pr_err("pci_hp_register failed with error %d\n", retval);
 		goto error_hpslot;
- 	}
+	}
 
 	pr_info("Slot [%s] registered\n", slot_name(slot));
 
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 5b4e9eb0e8ff..1cf605f67673 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -325,7 +325,7 @@ static acpi_status register_slot(acpi_handle handle, u32 lvl, void *data,
 
 	list_add_tail(&slot->node, &bridge->slots);
 
-	/* Register slots for ejectable funtions only. */
+	/* Register slots for ejectable functions only. */
 	if (acpi_pci_check_ejectable(pbus, handle)  || is_dock_device(handle)) {
 		unsigned long long sun;
 		int retval;
diff --git a/drivers/pci/hotplug/acpiphp_ibm.c b/drivers/pci/hotplug/acpiphp_ibm.c
index 0d64c414bf78..ecfac7e72d91 100644
--- a/drivers/pci/hotplug/acpiphp_ibm.c
+++ b/drivers/pci/hotplug/acpiphp_ibm.c
@@ -116,7 +116,7 @@ static struct bin_attribute ibm_apci_table_attr = {
 	    .read = ibm_read_apci_table,
 	    .write = NULL,
 };
-static struct acpiphp_attention_info ibm_attention_info = 
+static struct acpiphp_attention_info ibm_attention_info =
 {
 	.set_attn = ibm_set_attention_status,
 	.get_attn = ibm_get_attention_status,
@@ -171,9 +171,9 @@ ibm_slot_done:
  */
 static int ibm_set_attention_status(struct hotplug_slot *slot, u8 status)
 {
-	union acpi_object args[2]; 
+	union acpi_object args[2];
 	struct acpi_object_list params = { .pointer = args, .count = 2 };
-	acpi_status stat; 
+	acpi_status stat;
 	unsigned long long rc;
 	union apci_descriptor *ibm_slot;
 
@@ -208,7 +208,7 @@ static int ibm_set_attention_status(struct hotplug_slot *slot, u8 status)
  *
  * Description: This method is registered with the acpiphp module as a
  * callback to do the device specific task of getting the LED status.
- * 
+ *
  * Because there is no direct method of getting the LED status directly
  * from an ACPI call, we read the aPCI table and parse out our
  * slot descriptor to read the status from that.
@@ -259,7 +259,7 @@ static void ibm_handle_events(acpi_handle handle, u32 event, void *context)
 	pr_debug("%s: Received notification %02x\n", __func__, event);
 
 	if (subevent == 0x80) {
-		pr_debug("%s: generationg bus event\n", __func__);
+		pr_debug("%s: generating bus event\n", __func__);
 		acpi_bus_generate_netlink_event(note->device->pnp.device_class,
 						  dev_name(&note->device->dev),
 						  note->event, detail);
@@ -387,7 +387,7 @@ static acpi_status __init ibm_find_acpi_device(acpi_handle handle,
 		u32 lvl, void *context, void **rv)
 {
 	acpi_handle *phandle = (acpi_handle *)context;
-	acpi_status status; 
+	acpi_status status;
 	struct acpi_device_info *info;
 	int retval = 0;
 
@@ -405,7 +405,7 @@ static acpi_status __init ibm_find_acpi_device(acpi_handle handle,
 			info->hardware_id.string, handle);
 		*phandle = handle;
 		/* returning non-zero causes the search to stop
-		 * and returns this value to the caller of 
+		 * and returns this value to the caller of
 		 * acpi_walk_namespace, but it also causes some warnings
 		 * in the acpi debug code to print...
 		 */
diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c
index 2b4c412f94c3..00c81a3cefc9 100644
--- a/drivers/pci/hotplug/cpci_hotplug_core.c
+++ b/drivers/pci/hotplug/cpci_hotplug_core.c
@@ -46,7 +46,7 @@
 	do {							\
 		if (cpci_debug)					\
 			printk (KERN_DEBUG "%s: " format "\n",	\
-				MY_NAME , ## arg); 		\
+				MY_NAME , ## arg);		\
 	} while (0)
 #define err(format, arg...) printk(KERN_ERR "%s: " format "\n", MY_NAME , ## arg)
 #define info(format, arg...) printk(KERN_INFO "%s: " format "\n", MY_NAME , ## arg)
diff --git a/drivers/pci/hotplug/cpci_hotplug_pci.c b/drivers/pci/hotplug/cpci_hotplug_pci.c
index d8add34177f2..d3add9819f63 100644
--- a/drivers/pci/hotplug/cpci_hotplug_pci.c
+++ b/drivers/pci/hotplug/cpci_hotplug_pci.c
@@ -39,7 +39,7 @@ extern int cpci_debug;
 	do {							\
 		if (cpci_debug)					\
 			printk (KERN_DEBUG "%s: " format "\n",	\
-				MY_NAME , ## arg); 		\
+				MY_NAME , ## arg);		\
 	} while (0)
 #define err(format, arg...) printk(KERN_ERR "%s: " format "\n", MY_NAME , ## arg)
 #define info(format, arg...) printk(KERN_INFO "%s: " format "\n", MY_NAME , ## arg)
diff --git a/drivers/pci/hotplug/cpcihp_generic.c b/drivers/pci/hotplug/cpcihp_generic.c
index a6a71c41cdf8..7536eef620b0 100644
--- a/drivers/pci/hotplug/cpcihp_generic.c
+++ b/drivers/pci/hotplug/cpcihp_generic.c
@@ -13,14 +13,14 @@
  * option) any later version.
  *
  * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
- * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 
- * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * You should have received a copy of the GNU General Public License along
@@ -53,9 +53,9 @@
 
 #define dbg(format, arg...)					\
 	do {							\
-		if(debug)					\
+		if (debug)					\
 			printk (KERN_DEBUG "%s: " format "\n",	\
-				MY_NAME , ## arg); 		\
+				MY_NAME , ## arg);		\
 	} while(0)
 #define err(format, arg...) printk(KERN_ERR "%s: " format "\n", MY_NAME , ## arg)
 #define info(format, arg...) printk(KERN_INFO "%s: " format "\n", MY_NAME , ## arg)
diff --git a/drivers/pci/hotplug/cpcihp_zt5550.c b/drivers/pci/hotplug/cpcihp_zt5550.c
index 449b4bbc8301..e8c4a7ccf578 100644
--- a/drivers/pci/hotplug/cpcihp_zt5550.c
+++ b/drivers/pci/hotplug/cpcihp_zt5550.c
@@ -13,14 +13,14 @@
  * option) any later version.
  *
  * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
- * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 
- * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * You should have received a copy of the GNU General Public License along
@@ -48,9 +48,9 @@
 
 #define dbg(format, arg...)					\
 	do {							\
-		if(debug)					\
+		if (debug)					\
 			printk (KERN_DEBUG "%s: " format "\n",	\
-				MY_NAME , ## arg); 		\
+				MY_NAME , ## arg);		\
 	} while(0)
 #define err(format, arg...) printk(KERN_ERR "%s: " format "\n", MY_NAME , ## arg)
 #define info(format, arg...) printk(KERN_INFO "%s: " format "\n", MY_NAME , ## arg)
@@ -285,7 +285,7 @@ static struct pci_device_id zt5550_hc_pci_tbl[] = {
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, zt5550_hc_pci_tbl);
-	
+
 static struct pci_driver zt5550_hc_driver = {
 	.name		= "zt5550_hc",
 	.id_table	= zt5550_hc_pci_tbl,
diff --git a/drivers/pci/hotplug/cpcihp_zt5550.h b/drivers/pci/hotplug/cpcihp_zt5550.h
index bebc6060a558..9a57fda5348c 100644
--- a/drivers/pci/hotplug/cpcihp_zt5550.h
+++ b/drivers/pci/hotplug/cpcihp_zt5550.h
@@ -13,14 +13,14 @@
  * option) any later version.
  *
  * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
- * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 
- * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * You should have received a copy of the GNU General Public License along
@@ -55,7 +55,7 @@
 #define HC_CMD_REG		0x0C
 #define ARB_CONFIG_GNT_REG	0x10
 #define ARB_CONFIG_CFG_REG	0x12
-#define ARB_CONFIG_REG	 	0x10
+#define ARB_CONFIG_REG		0x10
 #define ISOL_CONFIG_REG		0x18
 #define FAULT_STATUS_REG	0x20
 #define FAULT_CONFIG_REG	0x24
diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c
index c8eaeb43fa5d..31273e155e6c 100644
--- a/drivers/pci/hotplug/cpqphp_core.c
+++ b/drivers/pci/hotplug/cpqphp_core.c
@@ -862,10 +862,10 @@ static int cpqhpc_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_disable_device;
 	}
 
-	/* Check for the proper subsystem ID's
+	/* Check for the proper subsystem IDs
 	 * Intel uses a different SSID programming model than Compaq.
 	 * For Intel, each SSID bit identifies a PHP capability.
-	 * Also Intel HPC's may have RID=0.
+	 * Also Intel HPCs may have RID=0.
 	 */
 	if ((pdev->revision <= 2) && (vendor_id != PCI_VENDOR_ID_INTEL)) {
 		err(msg_HPC_not_supported);
diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c
index d282019cda5f..11845b796799 100644
--- a/drivers/pci/hotplug/cpqphp_ctrl.c
+++ b/drivers/pci/hotplug/cpqphp_ctrl.c
@@ -1231,7 +1231,7 @@ static u8 set_controller_speed(struct controller *ctrl, u8 adapter_speed, u8 hp_
 
 	/* Only if mode change...*/
 	if (((bus->cur_bus_speed == PCI_SPEED_66MHz) && (adapter_speed == PCI_SPEED_66MHz_PCIX)) ||
-		((bus->cur_bus_speed == PCI_SPEED_66MHz_PCIX) && (adapter_speed == PCI_SPEED_66MHz))) 
+		((bus->cur_bus_speed == PCI_SPEED_66MHz_PCIX) && (adapter_speed == PCI_SPEED_66MHz)))
 			set_SOGO(ctrl);
 
 	wait_for_ctrl_irq(ctrl);
@@ -1828,7 +1828,7 @@ static void interrupt_event_handler(struct controller *ctrl)
 
 				if (ctrl->event_queue[loop].event_type == INT_BUTTON_PRESS) {
 					dbg("button pressed\n");
-				} else if (ctrl->event_queue[loop].event_type == 
+				} else if (ctrl->event_queue[loop].event_type ==
 					   INT_BUTTON_CANCEL) {
 					dbg("button cancel\n");
 					del_timer(&p_slot->task_event);
@@ -2411,11 +2411,11 @@ static int configure_new_function(struct controller *ctrl, struct pci_func *func
 		if (rc)
 			return rc;
 
-		/* find range of busses to use */
+		/* find range of buses to use */
 		dbg("find ranges of buses to use\n");
 		bus_node = get_max_resource(&(resources->bus_head), 1);
 
-		/* If we don't have any busses to allocate, we can't continue */
+		/* If we don't have any buses to allocate, we can't continue */
 		if (!bus_node)
 			return -ENOMEM;
 
@@ -2900,7 +2900,7 @@ static int configure_new_function(struct controller *ctrl, struct pci_func *func
 
 			/* If this function needs an interrupt and we are behind
 			 * a bridge and the pin is tied to something that's
-			 * alread mapped, set this one the same */
+			 * already mapped, set this one the same */
 			if (temp_byte && resources->irqs &&
 			    (resources->irqs->valid_INT &
 			     (0x01 << ((temp_byte + resources->irqs->barber_pole - 1) & 0x03)))) {
diff --git a/drivers/pci/hotplug/cpqphp_pci.c b/drivers/pci/hotplug/cpqphp_pci.c
index 09801c6945ce..6e4a12c91adb 100644
--- a/drivers/pci/hotplug/cpqphp_pci.c
+++ b/drivers/pci/hotplug/cpqphp_pci.c
@@ -291,7 +291,7 @@ int cpqhp_get_bus_dev (struct controller *ctrl, u8 * bus_num, u8 * dev_num, u8 s
  *
  * Reads configuration for all slots in a PCI bus and saves info.
  *
- * Note:  For non-hot plug busses, the slot # saved is the device #
+ * Note:  For non-hot plug buses, the slot # saved is the device #
  *
  * returns 0 if success
  */
@@ -455,7 +455,7 @@ int cpqhp_save_config(struct controller *ctrl, int busnumber, int is_hot_plug)
  * cpqhp_save_slot_config
  *
  * Saves configuration info for all PCI devices in a given slot
- * including subordinate busses.
+ * including subordinate buses.
  *
  * returns 0 if success
  */
@@ -1556,4 +1556,3 @@ void cpqhp_destroy_board_resources (struct pci_func * func)
 		kfree(tres);
 	}
 }
-
diff --git a/drivers/pci/hotplug/ibmphp.h b/drivers/pci/hotplug/ibmphp.h
index 8c5b25871d02..e3e46a7b3ee7 100644
--- a/drivers/pci/hotplug/ibmphp.h
+++ b/drivers/pci/hotplug/ibmphp.h
@@ -59,7 +59,7 @@ extern int ibmphp_debug;
 
 
 /************************************************************
-*  RESOURE TYPE                                             *
+*  RESOURCE TYPE                                             *
 ************************************************************/
 
 #define EBDA_RSRC_TYPE_MASK		0x03
@@ -103,7 +103,7 @@ extern int ibmphp_debug;
 //--------------------------------------------------------------
 
 struct rio_table_hdr {
-	u8 ver_num; 
+	u8 ver_num;
 	u8 scal_count;
 	u8 riodev_count;
 	u16 offset;
@@ -127,7 +127,7 @@ struct scal_detail {
 };
 
 //--------------------------------------------------------------
-// RIO DETAIL 
+// RIO DETAIL
 //--------------------------------------------------------------
 
 struct rio_detail {
@@ -152,7 +152,7 @@ struct opt_rio {
 	u8 first_slot_num;
 	u8 middle_num;
 	struct list_head opt_rio_list;
-};	
+};
 
 struct opt_rio_lo {
 	u8 rio_type;
@@ -161,7 +161,7 @@ struct opt_rio_lo {
 	u8 middle_num;
 	u8 pack_count;
 	struct list_head opt_rio_lo_list;
-};	
+};
 
 /****************************************************************
 *  HPC DESCRIPTOR NODE                                          *
@@ -574,7 +574,7 @@ void ibmphp_hpc_stop_poll_thread(void);
 #define HPC_CTLR_IRQ_PENDG	0x80
 
 //----------------------------------------------------------------------------
-// HPC_CTLR_WROKING status return codes
+// HPC_CTLR_WORKING status return codes
 //----------------------------------------------------------------------------
 #define HPC_CTLR_WORKING_NO	0x00
 #define HPC_CTLR_WORKING_YES	0x01
diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c
index cbd72d81d253..efdc13adbe41 100644
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -58,7 +58,7 @@ MODULE_DESCRIPTION (DRIVER_DESC);
 struct pci_bus *ibmphp_pci_bus;
 static int max_slots;
 
-static int irqs[16];    /* PIC mode IRQ's we're using so far (in case MPS
+static int irqs[16];    /* PIC mode IRQs we're using so far (in case MPS
 			 * tables don't provide default info for empty slots */
 
 static int init_flag;
@@ -71,20 +71,20 @@ static inline int get_max_adapter_speed (struct hotplug_slot *hs, u8 *value)
 	return get_max_adapter_speed_1 (hs, value, 1);
 }
 */
-static inline int get_cur_bus_info(struct slot **sl) 
+static inline int get_cur_bus_info(struct slot **sl)
 {
 	int rc = 1;
 	struct slot * slot_cur = *sl;
 
 	debug("options = %x\n", slot_cur->ctrl->options);
-	debug("revision = %x\n", slot_cur->ctrl->revision);	
+	debug("revision = %x\n", slot_cur->ctrl->revision);
 
-	if (READ_BUS_STATUS(slot_cur->ctrl)) 
+	if (READ_BUS_STATUS(slot_cur->ctrl))
 		rc = ibmphp_hpc_readslot(slot_cur, READ_BUSSTATUS, NULL);
-	
-	if (rc) 
+
+	if (rc)
 		return rc;
-	  
+
 	slot_cur->bus_on->current_speed = CURRENT_BUS_SPEED(slot_cur->busstatus);
 	if (READ_BUS_MODE(slot_cur->ctrl))
 		slot_cur->bus_on->current_bus_mode =
@@ -96,7 +96,7 @@ static inline int get_cur_bus_info(struct slot **sl)
 			slot_cur->busstatus,
 			slot_cur->bus_on->current_speed,
 			slot_cur->bus_on->current_bus_mode);
-	
+
 	*sl = slot_cur;
 	return 0;
 }
@@ -104,8 +104,8 @@ static inline int get_cur_bus_info(struct slot **sl)
 static inline int slot_update(struct slot **sl)
 {
 	int rc;
- 	rc = ibmphp_hpc_readslot(*sl, READ_ALLSTAT, NULL);
-	if (rc) 
+	rc = ibmphp_hpc_readslot(*sl, READ_ALLSTAT, NULL);
+	if (rc)
 		return rc;
 	if (!init_flag)
 		rc = get_cur_bus_info(sl);
@@ -172,7 +172,7 @@ int ibmphp_init_devno(struct slot **cur_slot)
 			debug("(*cur_slot)->irq[3] = %x\n",
 					(*cur_slot)->irq[3]);
 
-			debug("rtable->exlusive_irqs = %x\n",
+			debug("rtable->exclusive_irqs = %x\n",
 					rtable->exclusive_irqs);
 			debug("rtable->slots[loop].irq[0].bitmap = %x\n",
 					rtable->slots[loop].irq[0].bitmap);
@@ -271,7 +271,7 @@ static int set_attention_status(struct hotplug_slot *hotplug_slot, u8 value)
 			else
 				rc = -ENODEV;
 		}
-	} else	
+	} else
 		rc = -ENODEV;
 
 	ibmphp_unlock_operations();
@@ -288,7 +288,7 @@ static int get_attention_status(struct hotplug_slot *hotplug_slot, u8 * value)
 
 	debug("get_attention_status - Entry hotplug_slot[%lx] pvalue[%lx]\n",
 					(ulong) hotplug_slot, (ulong) value);
-        
+
 	ibmphp_lock_operations();
 	if (hotplug_slot) {
 		pslot = hotplug_slot->private;
@@ -406,14 +406,14 @@ static int get_max_bus_speed(struct slot *slot)
 
 	ibmphp_lock_operations();
 	mode = slot->supported_bus_mode;
-	speed = slot->supported_speed; 
+	speed = slot->supported_speed;
 	ibmphp_unlock_operations();
 
 	switch (speed) {
 	case BUS_SPEED_33:
 		break;
 	case BUS_SPEED_66:
-		if (mode == BUS_MODE_PCIX) 
+		if (mode == BUS_MODE_PCIX)
 			speed += 0x01;
 		break;
 	case BUS_SPEED_100:
@@ -515,13 +515,13 @@ static int __init init_ops(void)
 
 		debug("BEFORE GETTING SLOT STATUS, slot # %x\n",
 							slot_cur->number);
-		if (slot_cur->ctrl->revision == 0xFF) 
+		if (slot_cur->ctrl->revision == 0xFF)
 			if (get_ctrl_revision(slot_cur,
 						&slot_cur->ctrl->revision))
 				return -1;
 
-		if (slot_cur->bus_on->current_speed == 0xFF) 
-			if (get_cur_bus_info(&slot_cur)) 
+		if (slot_cur->bus_on->current_speed == 0xFF)
+			if (get_cur_bus_info(&slot_cur))
 				return -1;
 		get_max_bus_speed(slot_cur);
 
@@ -539,8 +539,8 @@ static int __init init_ops(void)
 		debug("SLOT_PRESENT = %x\n", SLOT_PRESENT(slot_cur->status));
 		debug("SLOT_LATCH = %x\n", SLOT_LATCH(slot_cur->status));
 
-		if ((SLOT_PWRGD(slot_cur->status)) && 
-		    !(SLOT_PRESENT(slot_cur->status)) && 
+		if ((SLOT_PWRGD(slot_cur->status)) &&
+		    !(SLOT_PRESENT(slot_cur->status)) &&
 		    !(SLOT_LATCH(slot_cur->status))) {
 			debug("BEFORE POWER OFF COMMAND\n");
 				rc = power_off(slot_cur);
@@ -581,13 +581,13 @@ static int validate(struct slot *slot_cur, int opn)
 
 	switch (opn) {
 		case ENABLE:
-			if (!(SLOT_PWRGD(slot_cur->status)) && 
-			     (SLOT_PRESENT(slot_cur->status)) && 
+			if (!(SLOT_PWRGD(slot_cur->status)) &&
+			     (SLOT_PRESENT(slot_cur->status)) &&
 			     !(SLOT_LATCH(slot_cur->status)))
 				return 0;
 			break;
 		case DISABLE:
-			if ((SLOT_PWRGD(slot_cur->status)) && 
+			if ((SLOT_PWRGD(slot_cur->status)) &&
 			    (SLOT_PRESENT(slot_cur->status)) &&
 			    !(SLOT_LATCH(slot_cur->status)))
 				return 0;
@@ -617,7 +617,7 @@ int ibmphp_update_slot_info(struct slot *slot_cur)
 		err("out of system memory\n");
 		return -ENOMEM;
 	}
-        
+
 	info->power_status = SLOT_PWRGD(slot_cur->status);
 	info->attention_status = SLOT_ATTN(slot_cur->status,
 						slot_cur->ext_status);
@@ -638,7 +638,7 @@ int ibmphp_update_slot_info(struct slot *slot_cur)
 		case BUS_SPEED_33:
 			break;
 		case BUS_SPEED_66:
-			if (mode == BUS_MODE_PCIX) 
+			if (mode == BUS_MODE_PCIX)
 				bus_speed += 0x01;
 			else if (mode == BUS_MODE_PCI)
 				;
@@ -654,8 +654,8 @@ int ibmphp_update_slot_info(struct slot *slot_cur)
 	}
 
 	bus->cur_bus_speed = bus_speed;
-	// To do: bus_names 
-	
+	// To do: bus_names
+
 	rc = pci_hp_change_slot_info(slot_cur->hotplug_slot, info);
 	kfree(info);
 	return rc;
@@ -729,8 +729,8 @@ static void ibm_unconfigure_device(struct pci_func *func)
 }
 
 /*
- * The following function is to fix kernel bug regarding 
- * getting bus entries, here we manually add those primary 
+ * The following function is to fix kernel bug regarding
+ * getting bus entries, here we manually add those primary
  * bus entries to kernel bus structure whenever apply
  */
 static u8 bus_structure_fixup(u8 busno)
@@ -814,7 +814,7 @@ static int ibm_configure_device(struct pci_func *func)
 }
 
 /*******************************************************
- * Returns whether the bus is empty or not 
+ * Returns whether the bus is empty or not
  *******************************************************/
 static int is_bus_empty(struct slot * slot_cur)
 {
@@ -842,7 +842,7 @@ static int is_bus_empty(struct slot * slot_cur)
 }
 
 /***********************************************************
- * If the HPC permits and the bus currently empty, tries to set the 
+ * If the HPC permits and the bus currently empty, tries to set the
  * bus speed and mode at the maximum card and bus capability
  * Parameters: slot
  * Returns: bus is set (0) or error code
@@ -856,7 +856,7 @@ static int set_bus(struct slot * slot_cur)
 	static struct pci_device_id ciobx[] = {
 		{ PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, 0x0101) },
 	        { },
-	};	
+	};
 
 	debug("%s - entry slot # %d\n", __func__, slot_cur->number);
 	if (SET_BUS_STATUS(slot_cur->ctrl) && is_bus_empty(slot_cur)) {
@@ -877,7 +877,7 @@ static int set_bus(struct slot * slot_cur)
 				else if (!SLOT_BUS_MODE(slot_cur->ext_status))
 					/* if max slot/bus capability is 66 pci
 					and there's no bus mode mismatch, then
-					the adapter supports 66 pci */ 
+					the adapter supports 66 pci */
 					cmd = HPC_BUS_66CONVMODE;
 				else
 					cmd = HPC_BUS_33CONVMODE;
@@ -930,7 +930,7 @@ static int set_bus(struct slot * slot_cur)
 			return -EIO;
 		}
 	}
-	/* This is for x440, once Brandon fixes the firmware, 
+	/* This is for x440, once Brandon fixes the firmware,
 	will not need this delay */
 	msleep(1000);
 	debug("%s -Exit\n", __func__);
@@ -938,9 +938,9 @@ static int set_bus(struct slot * slot_cur)
 }
 
 /* This routine checks the bus limitations that the slot is on from the BIOS.
- * This is used in deciding whether or not to power up the slot.  
+ * This is used in deciding whether or not to power up the slot.
  * (electrical/spec limitations. For example, >1 133 MHz or >2 66 PCI cards on
- * same bus) 
+ * same bus)
  * Parameters: slot
  * Returns: 0 = no limitations, -EINVAL = exceeded limitations on the bus
  */
@@ -986,7 +986,7 @@ static int check_limitations(struct slot *slot_cur)
 static inline void print_card_capability(struct slot *slot_cur)
 {
 	info("capability of the card is ");
-	if ((slot_cur->ext_status & CARD_INFO) == PCIX133) 
+	if ((slot_cur->ext_status & CARD_INFO) == PCIX133)
 		info("   133 MHz PCI-X\n");
 	else if ((slot_cur->ext_status & CARD_INFO) == PCIX66)
 		info("    66 MHz PCI-X\n");
@@ -1020,7 +1020,7 @@ static int enable_slot(struct hotplug_slot *hs)
 	}
 
 	attn_LED_blink(slot_cur);
-	
+
 	rc = set_bus(slot_cur);
 	if (rc) {
 		err("was not able to set the bus\n");
@@ -1082,7 +1082,7 @@ static int enable_slot(struct hotplug_slot *hs)
 	rc = slot_update(&slot_cur);
 	if (rc)
 		goto error_power;
-	
+
 	rc = -EINVAL;
 	if (SLOT_POWER(slot_cur->status) && !(SLOT_PWRGD(slot_cur->status))) {
 		err("power fault occurred trying to power up...\n");
@@ -1093,7 +1093,7 @@ static int enable_slot(struct hotplug_slot *hs)
 					"speed and card capability\n");
 		print_card_capability(slot_cur);
 		goto error_power;
-	} 
+	}
 	/* Don't think this case will happen after above checks...
 	 * but just in case, for paranoia sake */
 	if (!(SLOT_POWER(slot_cur->status))) {
@@ -1144,7 +1144,7 @@ static int enable_slot(struct hotplug_slot *hs)
 	ibmphp_print_test();
 	rc = ibmphp_update_slot_info(slot_cur);
 exit:
-	ibmphp_unlock_operations(); 
+	ibmphp_unlock_operations();
 	return rc;
 
 error_nopower:
@@ -1180,7 +1180,7 @@ static int ibmphp_disable_slot(struct hotplug_slot *hotplug_slot)
 {
 	struct slot *slot = hotplug_slot->private;
 	int rc;
-	
+
 	ibmphp_lock_operations();
 	rc = ibmphp_do_disable_slot(slot);
 	ibmphp_unlock_operations();
@@ -1192,12 +1192,12 @@ int ibmphp_do_disable_slot(struct slot *slot_cur)
 	int rc;
 	u8 flag;
 
-	debug("DISABLING SLOT...\n"); 
-		
+	debug("DISABLING SLOT...\n");
+
 	if ((slot_cur == NULL) || (slot_cur->ctrl == NULL)) {
 		return -ENODEV;
 	}
-	
+
 	flag = slot_cur->flag;
 	slot_cur->flag = 1;
 
@@ -1210,7 +1210,7 @@ int ibmphp_do_disable_slot(struct slot *slot_cur)
 	attn_LED_blink(slot_cur);
 
 	if (slot_cur->func == NULL) {
-		/* We need this for fncs's that were there on bootup */
+		/* We need this for functions that were there on bootup */
 		slot_cur->func = kzalloc(sizeof(struct pci_func), GFP_KERNEL);
 		if (!slot_cur->func) {
 			err("out of system memory\n");
@@ -1222,12 +1222,13 @@ int ibmphp_do_disable_slot(struct slot *slot_cur)
 	}
 
 	ibm_unconfigure_device(slot_cur->func);
-        
-	/* If we got here from latch suddenly opening on operating card or 
-	a power fault, there's no power to the card, so cannot
-	read from it to determine what resources it occupied.  This operation
-	is forbidden anyhow.  The best we can do is remove it from kernel
-	lists at least */
+
+	/*
+	 * If we got here from latch suddenly opening on operating card or
+	 * a power fault, there's no power to the card, so cannot
+	 * read from it to determine what resources it occupied.  This operation
+	 * is forbidden anyhow.  The best we can do is remove it from kernel
+	 * lists at least */
 
 	if (!flag) {
 		attn_off(slot_cur);
@@ -1264,7 +1265,7 @@ error:
 		rc = -EFAULT;
 		goto exit;
 	}
-	if (flag)		
+	if (flag)
 		ibmphp_update_slot_info(slot_cur);
 	goto exit;
 }
@@ -1339,7 +1340,7 @@ static int __init ibmphp_init(void)
 	debug("AFTER Resource & EBDA INITIALIZATIONS\n");
 
 	max_slots = get_max_slots();
-	
+
 	if ((rc = ibmphp_register_pci()))
 		goto error;
 
diff --git a/drivers/pci/hotplug/ibmphp_ebda.c b/drivers/pci/hotplug/ibmphp_ebda.c
index 9df78bc14541..bd044158b36c 100644
--- a/drivers/pci/hotplug/ibmphp_ebda.c
+++ b/drivers/pci/hotplug/ibmphp_ebda.c
@@ -123,7 +123,7 @@ static struct ebda_pci_rsrc *alloc_ebda_pci_rsrc (void)
 static void __init print_bus_info (void)
 {
 	struct bus_info *ptr;
-	
+
 	list_for_each_entry(ptr, &bus_info_head, bus_info_list) {
 		debug ("%s - slot_min = %x\n", __func__, ptr->slot_min);
 		debug ("%s - slot_max = %x\n", __func__, ptr->slot_max);
@@ -131,7 +131,7 @@ static void __init print_bus_info (void)
 		debug ("%s - bus# = %x\n", __func__, ptr->busno);
 		debug ("%s - current_speed = %x\n", __func__, ptr->current_speed);
 		debug ("%s - controller_id = %x\n", __func__, ptr->controller_id);
-		
+
 		debug ("%s - slots_at_33_conv = %x\n", __func__, ptr->slots_at_33_conv);
 		debug ("%s - slots_at_66_conv = %x\n", __func__, ptr->slots_at_66_conv);
 		debug ("%s - slots_at_66_pcix = %x\n", __func__, ptr->slots_at_66_pcix);
@@ -144,7 +144,7 @@ static void __init print_bus_info (void)
 static void print_lo_info (void)
 {
 	struct rio_detail *ptr;
-	debug ("print_lo_info ----\n");	
+	debug ("print_lo_info ----\n");
 	list_for_each_entry(ptr, &rio_lo_head, rio_detail_list) {
 		debug ("%s - rio_node_id = %x\n", __func__, ptr->rio_node_id);
 		debug ("%s - rio_type = %x\n", __func__, ptr->rio_type);
@@ -176,7 +176,7 @@ static void __init print_ebda_pci_rsrc (void)
 	struct ebda_pci_rsrc *ptr;
 
 	list_for_each_entry(ptr, &ibmphp_ebda_pci_rsrc_head, ebda_pci_rsrc_list) {
-		debug ("%s - rsrc type: %x bus#: %x dev_func: %x start addr: %x end addr: %x\n", 
+		debug ("%s - rsrc type: %x bus#: %x dev_func: %x start addr: %x end addr: %x\n",
 			__func__, ptr->rsrc_type ,ptr->bus_num, ptr->dev_fun,ptr->start_addr, ptr->end_addr);
 	}
 }
@@ -259,7 +259,7 @@ int __init ibmphp_access_ebda (void)
 	ebda_seg = readw (io_mem);
 	iounmap (io_mem);
 	debug ("returned ebda segment: %x\n", ebda_seg);
-	
+
 	io_mem = ioremap(ebda_seg<<4, 1);
 	if (!io_mem)
 		return -ENOMEM;
@@ -310,7 +310,7 @@ int __init ibmphp_access_ebda (void)
 			re = readw (io_mem + sub_addr);	/* next sub blk */
 
 			sub_addr += 2;
-			rc_id = readw (io_mem + sub_addr); 	/* sub blk id */
+			rc_id = readw (io_mem + sub_addr);	/* sub blk id */
 
 			sub_addr += 2;
 			if (rc_id != 0x5243)
@@ -330,7 +330,7 @@ int __init ibmphp_access_ebda (void)
 			debug ("info about hpc descriptor---\n");
 			debug ("hot blk format: %x\n", format);
 			debug ("num of controller: %x\n", num_ctlrs);
-			debug ("offset of hpc data structure enteries: %x\n ", sub_addr);
+			debug ("offset of hpc data structure entries: %x\n ", sub_addr);
 
 			sub_addr = base + re;	/* re sub blk */
 			/* FIXME: rc is never used/checked */
@@ -359,7 +359,7 @@ int __init ibmphp_access_ebda (void)
 			debug ("info about rsrc descriptor---\n");
 			debug ("format: %x\n", format);
 			debug ("num of rsrc: %x\n", num_entries);
-			debug ("offset of rsrc data structure enteries: %x\n ", sub_addr);
+			debug ("offset of rsrc data structure entries: %x\n ", sub_addr);
 
 			hs_complete = 1;
 		} else {
@@ -376,7 +376,7 @@ int __init ibmphp_access_ebda (void)
 			rio_table_ptr->scal_count = readb (io_mem + offset + 1);
 			rio_table_ptr->riodev_count = readb (io_mem + offset + 2);
 			rio_table_ptr->offset = offset +3 ;
-			
+
 			debug("info about rio table hdr ---\n");
 			debug("ver_num: %x\nscal_count: %x\nriodev_count: %x\noffset of rio table: %x\n ",
 				rio_table_ptr->ver_num, rio_table_ptr->scal_count,
@@ -440,12 +440,12 @@ static int __init ebda_rio_table (void)
 		rio_detail_ptr->chassis_num = readb (io_mem + offset + 14);
 //		debug ("rio_node_id: %x\nbbar: %x\nrio_type: %x\nowner_id: %x\nport0_node: %x\nport0_port: %x\nport1_node: %x\nport1_port: %x\nfirst_slot_num: %x\nstatus: %x\n", rio_detail_ptr->rio_node_id, rio_detail_ptr->bbar, rio_detail_ptr->rio_type, rio_detail_ptr->owner_id, rio_detail_ptr->port0_node_connect, rio_detail_ptr->port0_port_connect, rio_detail_ptr->port1_node_connect, rio_detail_ptr->port1_port_connect, rio_detail_ptr->first_slot_num, rio_detail_ptr->status);
 		//create linked list of chassis
-		if (rio_detail_ptr->rio_type == 4 || rio_detail_ptr->rio_type == 5) 
+		if (rio_detail_ptr->rio_type == 4 || rio_detail_ptr->rio_type == 5)
 			list_add (&rio_detail_ptr->rio_detail_list, &rio_vg_head);
-		//create linked list of expansion box				
-		else if (rio_detail_ptr->rio_type == 6 || rio_detail_ptr->rio_type == 7) 
+		//create linked list of expansion box
+		else if (rio_detail_ptr->rio_type == 6 || rio_detail_ptr->rio_type == 7)
 			list_add (&rio_detail_ptr->rio_detail_list, &rio_lo_head);
-		else 
+		else
 			// not in my concern
 			kfree (rio_detail_ptr);
 		offset += 15;
@@ -456,7 +456,7 @@ static int __init ebda_rio_table (void)
 }
 
 /*
- * reorganizing linked list of chassis	 
+ * reorganizing linked list of chassis
  */
 static struct opt_rio *search_opt_vg (u8 chassis_num)
 {
@@ -464,7 +464,7 @@ static struct opt_rio *search_opt_vg (u8 chassis_num)
 	list_for_each_entry(ptr, &opt_vg_head, opt_rio_list) {
 		if (ptr->chassis_num == chassis_num)
 			return ptr;
-	}		
+	}
 	return NULL;
 }
 
@@ -472,7 +472,7 @@ static int __init combine_wpg_for_chassis (void)
 {
 	struct opt_rio *opt_rio_ptr = NULL;
 	struct rio_detail *rio_detail_ptr = NULL;
-	
+
 	list_for_each_entry(rio_detail_ptr, &rio_vg_head, rio_detail_list) {
 		opt_rio_ptr = search_opt_vg (rio_detail_ptr->chassis_num);
 		if (!opt_rio_ptr) {
@@ -484,14 +484,14 @@ static int __init combine_wpg_for_chassis (void)
 			opt_rio_ptr->first_slot_num = rio_detail_ptr->first_slot_num;
 			opt_rio_ptr->middle_num = rio_detail_ptr->first_slot_num;
 			list_add (&opt_rio_ptr->opt_rio_list, &opt_vg_head);
-		} else {	
+		} else {
 			opt_rio_ptr->first_slot_num = min (opt_rio_ptr->first_slot_num, rio_detail_ptr->first_slot_num);
 			opt_rio_ptr->middle_num = max (opt_rio_ptr->middle_num, rio_detail_ptr->first_slot_num);
-		}	
+		}
 	}
 	print_opt_vg ();
-	return 0;	
-}	
+	return 0;
+}
 
 /*
  * reorganizing linked list of expansion box
@@ -502,7 +502,7 @@ static struct opt_rio_lo *search_opt_lo (u8 chassis_num)
 	list_for_each_entry(ptr, &opt_lo_head, opt_rio_lo_list) {
 		if (ptr->chassis_num == chassis_num)
 			return ptr;
-	}		
+	}
 	return NULL;
 }
 
@@ -510,7 +510,7 @@ static int combine_wpg_for_expansion (void)
 {
 	struct opt_rio_lo *opt_rio_lo_ptr = NULL;
 	struct rio_detail *rio_detail_ptr = NULL;
-	
+
 	list_for_each_entry(rio_detail_ptr, &rio_lo_head, rio_detail_list) {
 		opt_rio_lo_ptr = search_opt_lo (rio_detail_ptr->chassis_num);
 		if (!opt_rio_lo_ptr) {
@@ -522,22 +522,22 @@ static int combine_wpg_for_expansion (void)
 			opt_rio_lo_ptr->first_slot_num = rio_detail_ptr->first_slot_num;
 			opt_rio_lo_ptr->middle_num = rio_detail_ptr->first_slot_num;
 			opt_rio_lo_ptr->pack_count = 1;
-			
+
 			list_add (&opt_rio_lo_ptr->opt_rio_lo_list, &opt_lo_head);
-		} else {	
+		} else {
 			opt_rio_lo_ptr->first_slot_num = min (opt_rio_lo_ptr->first_slot_num, rio_detail_ptr->first_slot_num);
 			opt_rio_lo_ptr->middle_num = max (opt_rio_lo_ptr->middle_num, rio_detail_ptr->first_slot_num);
 			opt_rio_lo_ptr->pack_count = 2;
-		}	
+		}
 	}
-	return 0;	
+	return 0;
 }
-	
+
 
 /* Since we don't know the max slot number per each chassis, hence go
  * through the list of all chassis to find out the range
- * Arguments: slot_num, 1st slot number of the chassis we think we are on, 
- * var (0 = chassis, 1 = expansion box) 
+ * Arguments: slot_num, 1st slot number of the chassis we think we are on,
+ * var (0 = chassis, 1 = expansion box)
  */
 static int first_slot_num (u8 slot_num, u8 first_slot, u8 var)
 {
@@ -547,7 +547,7 @@ static int first_slot_num (u8 slot_num, u8 first_slot, u8 var)
 
 	if (!var) {
 		list_for_each_entry(opt_vg_ptr, &opt_vg_head, opt_rio_list) {
-			if ((first_slot < opt_vg_ptr->first_slot_num) && (slot_num >= opt_vg_ptr->first_slot_num)) { 
+			if ((first_slot < opt_vg_ptr->first_slot_num) && (slot_num >= opt_vg_ptr->first_slot_num)) {
 				rc = -ENODEV;
 				break;
 			}
@@ -569,7 +569,7 @@ static struct opt_rio_lo * find_rxe_num (u8 slot_num)
 
 	list_for_each_entry(opt_lo_ptr, &opt_lo_head, opt_rio_lo_list) {
 		//check to see if this slot_num belongs to expansion box
-		if ((slot_num >= opt_lo_ptr->first_slot_num) && (!first_slot_num (slot_num, opt_lo_ptr->first_slot_num, 1))) 
+		if ((slot_num >= opt_lo_ptr->first_slot_num) && (!first_slot_num (slot_num, opt_lo_ptr->first_slot_num, 1)))
 			return opt_lo_ptr;
 	}
 	return NULL;
@@ -580,8 +580,8 @@ static struct opt_rio * find_chassis_num (u8 slot_num)
 	struct opt_rio *opt_vg_ptr;
 
 	list_for_each_entry(opt_vg_ptr, &opt_vg_head, opt_rio_list) {
-		//check to see if this slot_num belongs to chassis 
-		if ((slot_num >= opt_vg_ptr->first_slot_num) && (!first_slot_num (slot_num, opt_vg_ptr->first_slot_num, 0))) 
+		//check to see if this slot_num belongs to chassis
+		if ((slot_num >= opt_vg_ptr->first_slot_num) && (!first_slot_num (slot_num, opt_vg_ptr->first_slot_num, 0)))
 			return opt_vg_ptr;
 	}
 	return NULL;
@@ -594,13 +594,13 @@ static u8 calculate_first_slot (u8 slot_num)
 {
 	u8 first_slot = 1;
 	struct slot * slot_cur;
-	
+
 	list_for_each_entry(slot_cur, &ibmphp_slot_head, ibm_slot_list) {
 		if (slot_cur->ctrl) {
-			if ((slot_cur->ctrl->ctlr_type != 4) && (slot_cur->ctrl->ending_slot_num > first_slot) && (slot_num > slot_cur->ctrl->ending_slot_num)) 
+			if ((slot_cur->ctrl->ctlr_type != 4) && (slot_cur->ctrl->ending_slot_num > first_slot) && (slot_num > slot_cur->ctrl->ending_slot_num))
 				first_slot = slot_cur->ctrl->ending_slot_num;
 		}
-	}			
+	}
 	return first_slot + 1;
 
 }
@@ -622,11 +622,11 @@ static char *create_file_name (struct slot * slot_cur)
 		err ("Structure passed is empty\n");
 		return NULL;
 	}
-	
+
 	slot_num = slot_cur->number;
 
 	memset (str, 0, sizeof(str));
-	
+
 	if (rio_table_ptr) {
 		if (rio_table_ptr->ver_num == 3) {
 			opt_vg_ptr = find_chassis_num (slot_num);
@@ -660,7 +660,7 @@ static char *create_file_name (struct slot * slot_cur)
 			/* if both NULL and we DO have correct RIO table in BIOS */
 			return NULL;
 		}
-	} 
+	}
 	if (!flag) {
 		if (slot_cur->ctrl->ctlr_type == 4) {
 			first_slot = calculate_first_slot (slot_num);
@@ -798,7 +798,7 @@ static int __init ebda_rsrc_controller (void)
 			slot_ptr->ctl_index = readb (io_mem + addr_slot + 2*slot_num);
 			slot_ptr->slot_cap = readb (io_mem + addr_slot + 3*slot_num);
 
-			// create bus_info lined list --- if only one slot per bus: slot_min = slot_max 
+			// create bus_info lined list --- if only one slot per bus: slot_min = slot_max
 
 			bus_info_ptr2 = ibmphp_find_same_bus_num (slot_ptr->slot_bus_num);
 			if (!bus_info_ptr2) {
@@ -814,9 +814,9 @@ static int __init ebda_rsrc_controller (void)
 				bus_info_ptr1->index = bus_index++;
 				bus_info_ptr1->current_speed = 0xff;
 				bus_info_ptr1->current_bus_mode = 0xff;
-				
+
 				bus_info_ptr1->controller_id = hpc_ptr->ctlr_id;
-				
+
 				list_add_tail (&bus_info_ptr1->bus_info_list, &bus_info_head);
 
 			} else {
@@ -851,7 +851,7 @@ static int __init ebda_rsrc_controller (void)
 				bus_info_ptr2->slots_at_66_conv = bus_ptr->slots_at_66_conv;
 				bus_info_ptr2->slots_at_66_pcix = bus_ptr->slots_at_66_pcix;
 				bus_info_ptr2->slots_at_100_pcix = bus_ptr->slots_at_100_pcix;
-				bus_info_ptr2->slots_at_133_pcix = bus_ptr->slots_at_133_pcix; 
+				bus_info_ptr2->slots_at_133_pcix = bus_ptr->slots_at_133_pcix;
 			}
 			bus_ptr++;
 		}
@@ -864,7 +864,7 @@ static int __init ebda_rsrc_controller (void)
 				hpc_ptr->u.pci_ctlr.dev_fun = readb (io_mem + addr + 1);
 				hpc_ptr->irq = readb (io_mem + addr + 2);
 				addr += 3;
-				debug ("ctrl bus = %x, ctlr devfun = %x, irq = %x\n", 
+				debug ("ctrl bus = %x, ctlr devfun = %x, irq = %x\n",
 					hpc_ptr->u.pci_ctlr.bus,
 					hpc_ptr->u.pci_ctlr.dev_fun, hpc_ptr->irq);
 				break;
@@ -932,7 +932,7 @@ static int __init ebda_rsrc_controller (void)
 				tmp_slot->supported_speed =  2;
 			else if ((hpc_ptr->slots[index].slot_cap & EBDA_SLOT_66_MAX) == EBDA_SLOT_66_MAX)
 				tmp_slot->supported_speed =  1;
-				
+
 			if ((hpc_ptr->slots[index].slot_cap & EBDA_SLOT_PCIX_CAP) == EBDA_SLOT_PCIX_CAP)
 				tmp_slot->supported_bus_mode = 1;
 			else
@@ -1000,7 +1000,7 @@ error_no_hpc:
 	return rc;
 }
 
-/* 
+/*
  * map info (bus, devfun, start addr, end addr..) of i/o, memory,
  * pfm from the physical addr to a list of resource.
  */
@@ -1057,7 +1057,7 @@ static int __init ebda_rsrc_rsrc (void)
 			addr += 10;
 
 			debug ("rsrc from mem or pfm ---\n");
-			debug ("rsrc type: %x bus#: %x dev_func: %x start addr: %x end addr: %x\n", 
+			debug ("rsrc type: %x bus#: %x dev_func: %x start addr: %x end addr: %x\n",
 				rsrc_ptr->rsrc_type, rsrc_ptr->bus_num, rsrc_ptr->dev_fun, rsrc_ptr->start_addr, rsrc_ptr->end_addr);
 
 			list_add (&rsrc_ptr->ebda_pci_rsrc_list, &ibmphp_ebda_pci_rsrc_head);
@@ -1096,7 +1096,7 @@ struct bus_info *ibmphp_find_same_bus_num (u32 num)
 	struct bus_info *ptr;
 
 	list_for_each_entry(ptr, &bus_info_head, bus_info_list) {
-		if (ptr->busno == num) 
+		if (ptr->busno == num)
 			 return ptr;
 	}
 	return NULL;
@@ -1110,7 +1110,7 @@ int ibmphp_get_bus_index (u8 num)
 	struct bus_info *ptr;
 
 	list_for_each_entry(ptr, &bus_info_head, bus_info_list) {
-		if (ptr->busno == num)  
+		if (ptr->busno == num)
 			return ptr->index;
 	}
 	return -ENODEV;
@@ -1168,7 +1168,7 @@ static struct pci_device_id id_table[] = {
 		.subdevice	= HPC_SUBSYSTEM_ID,
 		.class		= ((PCI_CLASS_SYSTEM_PCI_HOTPLUG << 8) | 0x00),
 	}, {}
-};		
+};
 
 MODULE_DEVICE_TABLE(pci, id_table);
 
@@ -1197,7 +1197,7 @@ static int ibmphp_probe (struct pci_dev * dev, const struct pci_device_id *ids)
 	struct controller *ctrl;
 
 	debug ("inside ibmphp_probe\n");
-	
+
 	list_for_each_entry(ctrl, &ebda_hpc_head, ebda_hpc_list) {
 		if (ctrl->ctlr_type == 1) {
 			if ((dev->devfn == ctrl->u.pci_ctlr.dev_fun) && (dev->bus->number == ctrl->u.pci_ctlr.bus)) {
@@ -1210,4 +1210,3 @@ static int ibmphp_probe (struct pci_dev * dev, const struct pci_device_id *ids)
 	}
 	return -ENODEV;
 }
-
diff --git a/drivers/pci/hotplug/ibmphp_hpc.c b/drivers/pci/hotplug/ibmphp_hpc.c
index f59ed30512b5..5fc7a089f532 100644
--- a/drivers/pci/hotplug/ibmphp_hpc.c
+++ b/drivers/pci/hotplug/ibmphp_hpc.c
@@ -258,7 +258,7 @@ static u8 i2c_ctrl_write (struct controller *ctlr_ptr, void __iomem *WPGBbar, u8
 {
 	u8 rc;
 	void __iomem *wpg_addr;	// base addr + offset
-	unsigned long wpg_data;	// data to/from WPG LOHI format 
+	unsigned long wpg_data;	// data to/from WPG LOHI format
 	unsigned long ultemp;
 	unsigned long data;	// actual data HILO format
 	int i;
@@ -351,7 +351,7 @@ static u8 i2c_ctrl_write (struct controller *ctlr_ptr, void __iomem *WPGBbar, u8
 }
 
 //------------------------------------------------------------
-//  Read from ISA type HPC 
+//  Read from ISA type HPC
 //------------------------------------------------------------
 static u8 isa_ctrl_read (struct controller *ctlr_ptr, u8 offset)
 {
@@ -372,7 +372,7 @@ static void isa_ctrl_write (struct controller *ctlr_ptr, u8 offset, u8 data)
 {
 	u16 start_address;
 	u16 port_address;
-	
+
 	start_address = ctlr_ptr->u.isa_ctlr.io_start;
 	port_address = start_address + (u16) offset;
 	outb (data, port_address);
@@ -656,11 +656,11 @@ int ibmphp_hpc_readslot (struct slot * pslot, u8 cmd, u8 * pstatus)
 	//--------------------------------------------------------------------
 	// cleanup
 	//--------------------------------------------------------------------
-	
+
 	// remove physical to logical address mapping
 	if ((ctlr_ptr->ctlr_type == 2) || (ctlr_ptr->ctlr_type == 4))
 		iounmap (wpg_bbar);
-	
+
 	free_hpc_access ();
 
 	debug_polling ("%s - Exit rc[%d]\n", __func__, rc);
@@ -835,7 +835,7 @@ static int poll_hpc(void *data)
 		down (&semOperations);
 
 		switch (poll_state) {
-		case POLL_LATCH_REGISTER: 
+		case POLL_LATCH_REGISTER:
 			oldlatchlow = curlatchlow;
 			ctrl_count = 0x00;
 			list_for_each (pslotlist, &ibmphp_slot_head) {
@@ -892,16 +892,16 @@ static int poll_hpc(void *data)
 
 			if (kthread_should_stop())
 				goto out_sleep;
-			
+
 			down (&semOperations);
-			
+
 			if (poll_count >= POLL_LATCH_CNT) {
 				poll_count = 0;
 				poll_state = POLL_SLOTS;
 			} else
 				poll_state = POLL_LATCH_REGISTER;
 			break;
-		}	
+		}
 		/* give up the hardware semaphore */
 		up (&semOperations);
 		/* sleep for a short time just for good measure */
@@ -958,7 +958,7 @@ static int process_changeinstatus (struct slot *pslot, struct slot *poldslot)
 	// bit 5 - HPC_SLOT_PWRGD
 	if ((pslot->status & 0x20) != (poldslot->status & 0x20))
 		// OFF -> ON: ignore, ON -> OFF: disable slot
-		if ((poldslot->status & 0x20) && (SLOT_CONNECT (poldslot->status) == HPC_SLOT_CONNECTED) && (SLOT_PRESENT (poldslot->status))) 
+		if ((poldslot->status & 0x20) && (SLOT_CONNECT (poldslot->status) == HPC_SLOT_CONNECTED) && (SLOT_PRESENT (poldslot->status)))
 			disable = 1;
 
 	// bit 6 - HPC_SLOT_BUS_SPEED
@@ -980,7 +980,7 @@ static int process_changeinstatus (struct slot *pslot, struct slot *poldslot)
 					pslot->status &= ~HPC_SLOT_POWER;
 			}
 		}
-		// CLOSE -> OPEN 
+		// CLOSE -> OPEN
 		else if ((SLOT_PWRGD (poldslot->status) == HPC_SLOT_PWRGD_GOOD)
 			&& (SLOT_CONNECT (poldslot->status) == HPC_SLOT_CONNECTED) && (SLOT_PRESENT (poldslot->status))) {
 			disable = 1;
@@ -1075,7 +1075,7 @@ void __exit ibmphp_hpc_stop_poll_thread (void)
 	debug ("before locking operations \n");
 	ibmphp_lock_operations ();
 	debug ("after locking operations \n");
-	
+
 	// wait for poll thread to exit
 	debug ("before sem_exit down \n");
 	down (&sem_exit);
diff --git a/drivers/pci/hotplug/ibmphp_pci.c b/drivers/pci/hotplug/ibmphp_pci.c
index c60f5f3e838d..639ea3a75e14 100644
--- a/drivers/pci/hotplug/ibmphp_pci.c
+++ b/drivers/pci/hotplug/ibmphp_pci.c
@@ -1,8 +1,8 @@
 /*
  * IBM Hot Plug Controller Driver
- * 
+ *
  * Written By: Irene Zubarev, IBM Corporation
- * 
+ *
  * Copyright (C) 2001 Greg Kroah-Hartman (greg@kroah.com)
  * Copyright (C) 2001,2002 IBM Corp.
  *
@@ -42,7 +42,7 @@ static u8 find_sec_number (u8 primary_busno, u8 slotno);
 
 /*
  * NOTE..... If BIOS doesn't provide default routing, we assign:
- * 9 for SCSI, 10 for LAN adapters, and 11 for everything else. 
+ * 9 for SCSI, 10 for LAN adapters, and 11 for everything else.
  * If adapter is bridged, then we assign 11 to it and devices behind it.
  * We also assign the same irq numbers for multi function devices.
  * These are PIC mode, so shouldn't matter n.e.ways (hopefully)
@@ -71,11 +71,11 @@ static void assign_alt_irq (struct pci_func * cur_func, u8 class_code)
  * Configures the device to be added (will allocate needed resources if it
  * can), the device can be a bridge or a regular pci device, can also be
  * multi-functional
- * 
+ *
  * Input: function to be added
- * 
+ *
  * TO DO:  The error case with Multifunction device or multi function bridge,
- * if there is an error, will need to go through all previous functions and 
+ * if there is an error, will need to go through all previous functions and
  * unconfigure....or can add some code into unconfigure_card....
  */
 int ibmphp_configure_card (struct pci_func *func, u8 slotno)
@@ -98,7 +98,7 @@ int ibmphp_configure_card (struct pci_func *func, u8 slotno)
 	cur_func = func;
 
 	/* We only get bus and device from IRQ routing table.  So at this point,
-	 * func->busno is correct, and func->device contains only device (at the 5 
+	 * func->busno is correct, and func->device contains only device (at the 5
 	 * highest bits)
 	 */
 
@@ -151,7 +151,7 @@ int ibmphp_configure_card (struct pci_func *func, u8 slotno)
 						     cur_func->device, cur_func->busno);
 						cleanup_count = 6;
 						goto error;
-					}	
+					}
 					cur_func->next = NULL;
 					function = 0x8;
 					break;
@@ -339,7 +339,7 @@ error:
 }
 
 /*
- * This function configures the pci BARs of a single device.  
+ * This function configures the pci BARs of a single device.
  * Input: pointer to the pci_func
  * Output: configured PCI, 0, or error
  */
@@ -371,17 +371,17 @@ static int configure_device (struct pci_func *func)
 
 	for (count = 0; address[count]; count++) {	/* for 6 BARs */
 
-		/* not sure if i need this.  per scott, said maybe need smth like this
+		/* not sure if i need this.  per scott, said maybe need * something like this
 		   if devices don't adhere 100% to the spec, so don't want to write
 		   to the reserved bits
 
-		pcibios_read_config_byte(cur_func->busno, cur_func->device, 
+		pcibios_read_config_byte(cur_func->busno, cur_func->device,
 		PCI_BASE_ADDRESS_0 + 4 * count, &tmp);
 		if (tmp & 0x01) // IO
-			pcibios_write_config_dword(cur_func->busno, cur_func->device, 
+			pcibios_write_config_dword(cur_func->busno, cur_func->device,
 			PCI_BASE_ADDRESS_0 + 4 * count, 0xFFFFFFFD);
 		else  // Memory
-			pcibios_write_config_dword(cur_func->busno, cur_func->device, 
+			pcibios_write_config_dword(cur_func->busno, cur_func->device,
 			PCI_BASE_ADDRESS_0 + 4 * count, 0xFFFFFFFF);
 		 */
 		pci_bus_write_config_dword (ibmphp_pci_bus, devfn, address[count], 0xFFFFFFFF);
@@ -421,8 +421,8 @@ static int configure_device (struct pci_func *func)
 				return -EIO;
 			}
 			pci_bus_write_config_dword (ibmphp_pci_bus, devfn, address[count], func->io[count]->start);
-	
-			/* _______________This is for debugging purposes only_____________________ */ 
+
+			/* _______________This is for debugging purposes only_____________________ */
 			debug ("b4 writing, the IO address is %x\n", func->io[count]->start);
 			pci_bus_read_config_dword (ibmphp_pci_bus, devfn, address[count], &bar[count]);
 			debug ("after writing.... the start address is %x\n", bar[count]);
@@ -484,7 +484,7 @@ static int configure_device (struct pci_func *func)
 
 				pci_bus_write_config_dword (ibmphp_pci_bus, devfn, address[count], func->pfmem[count]->start);
 
-				/*_______________This is for debugging purposes only______________________________*/				
+				/*_______________This is for debugging purposes only______________________________*/
 				debug ("b4 writing, start address is %x\n", func->pfmem[count]->start);
 				pci_bus_read_config_dword (ibmphp_pci_bus, devfn, address[count], &bar[count]);
 				debug ("after writing, start address is %x\n", bar[count]);
@@ -559,7 +559,7 @@ static int configure_device (struct pci_func *func)
 /******************************************************************************
  * This routine configures a PCI-2-PCI bridge and the functions behind it
  * Parameters: pci_func
- * Returns: 
+ * Returns:
  ******************************************************************************/
 static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 {
@@ -622,7 +622,7 @@ static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 	debug ("AFTER FIND_SEC_NUMBER, func->busno IS %x\n", func->busno);
 
 	pci_bus_write_config_byte (ibmphp_pci_bus, devfn, PCI_SECONDARY_BUS, sec_number);
-	
+
 	/* __________________For debugging purposes only __________________________________
 	pci_bus_read_config_byte (ibmphp_pci_bus, devfn, PCI_SECONDARY_BUS, &sec_number);
 	debug ("sec_number after write/read is %x\n", sec_number);
@@ -644,7 +644,7 @@ static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 
 
 	/* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-	   !!!!!!!!!!!!!!!NEED TO ADD!!!  FAST BACK-TO-BACK ENABLE!!!!!!!!!!!!!!!!!!!! 
+	   !!!!!!!!!!!!!!!NEED TO ADD!!!  FAST BACK-TO-BACK ENABLE!!!!!!!!!!!!!!!!!!!!
 	   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
 
 
@@ -670,7 +670,7 @@ static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 			debug ("len[count] in IO = %x\n", len[count]);
 
 			bus_io[count] = kzalloc(sizeof(struct resource_node), GFP_KERNEL);
-		
+
 			if (!bus_io[count]) {
 				err ("out of system memory\n");
 				retval = -ENOMEM;
@@ -735,7 +735,7 @@ static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 						ibmphp_add_pfmem_from_mem (bus_pfmem[count]);
 						func->pfmem[count] = bus_pfmem[count];
 					} else {
-						err ("cannot allocate requested pfmem for bus %x, device %x, len %x\n", 
+						err ("cannot allocate requested pfmem for bus %x, device %x, len %x\n",
 						     func->busno, func->device, len[count]);
 						kfree (mem_tmp);
 						kfree (bus_pfmem[count]);
@@ -805,7 +805,7 @@ static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 	debug ("amount_needed->mem = %x\n", amount_needed->mem);
 	debug ("amount_needed->pfmem =  %x\n", amount_needed->pfmem);
 
-	if (amount_needed->not_correct) {		
+	if (amount_needed->not_correct) {
 		debug ("amount_needed is not correct\n");
 		for (count = 0; address[count]; count++) {
 			/* for 2 BARs */
@@ -830,7 +830,7 @@ static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 	} else {
 		debug ("it wants %x IO behind the bridge\n", amount_needed->io);
 		io = kzalloc(sizeof(*io), GFP_KERNEL);
-		
+
 		if (!io) {
 			err ("out of system memory\n");
 			retval = -ENOMEM;
@@ -959,7 +959,7 @@ static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 
 		if (bus->noIORanges) {
 			pci_bus_write_config_byte (ibmphp_pci_bus, devfn, PCI_IO_BASE, 0x00 | bus->rangeIO->start >> 8);
-			pci_bus_write_config_byte (ibmphp_pci_bus, devfn, PCI_IO_LIMIT, 0x00 | bus->rangeIO->end >> 8);	
+			pci_bus_write_config_byte (ibmphp_pci_bus, devfn, PCI_IO_LIMIT, 0x00 | bus->rangeIO->end >> 8);
 
 			/* _______________This is for debugging purposes only ____________________
 			pci_bus_read_config_byte (ibmphp_pci_bus, devfn, PCI_IO_BASE, &temp);
@@ -980,7 +980,7 @@ static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 		if (bus->noMemRanges) {
 			pci_bus_write_config_word (ibmphp_pci_bus, devfn, PCI_MEMORY_BASE, 0x0000 | bus->rangeMem->start >> 16);
 			pci_bus_write_config_word (ibmphp_pci_bus, devfn, PCI_MEMORY_LIMIT, 0x0000 | bus->rangeMem->end >> 16);
-			
+
 			/* ____________________This is for debugging purposes only ________________________
 			pci_bus_read_config_word (ibmphp_pci_bus, devfn, PCI_MEMORY_BASE, &temp);
 			debug ("mem_base = %x\n", (temp & PCI_MEMORY_RANGE_TYPE_MASK) << 16);
@@ -1017,7 +1017,7 @@ static int configure_bridge (struct pci_func **func_passed, u8 slotno)
 		pci_bus_read_config_byte (ibmphp_pci_bus, devfn, PCI_INTERRUPT_PIN, &irq);
 		if ((irq > 0x00) && (irq < 0x05))
 			pci_bus_write_config_byte (ibmphp_pci_bus, devfn, PCI_INTERRUPT_LINE, func->irq[irq - 1]);
-		/*    
+		/*
 		pci_bus_write_config_byte (ibmphp_pci_bus, devfn, PCI_BRIDGE_CONTROL, ctrl);
 		pci_bus_write_config_byte (ibmphp_pci_bus, devfn, PCI_BRIDGE_CONTROL, PCI_BRIDGE_CTL_PARITY);
 		pci_bus_write_config_byte (ibmphp_pci_bus, devfn, PCI_BRIDGE_CONTROL, PCI_BRIDGE_CTL_SERR);
@@ -1071,7 +1071,7 @@ error:
  * This function adds up the amount of resources needed behind the PPB bridge
  * and passes it to the configure_bridge function
  * Input: bridge function
- * Ouput: amount of resources needed
+ * Output: amount of resources needed
  *****************************************************************************/
 static struct res_needed *scan_behind_bridge (struct pci_func * func, u8 busno)
 {
@@ -1204,9 +1204,9 @@ static struct res_needed *scan_behind_bridge (struct pci_func * func, u8 busno)
 	return amount;
 }
 
-/* The following 3 unconfigure_boot_ routines deal with the case when we had the card 
- * upon bootup in the system, since we don't allocate func to such case, we need to read 
- * the start addresses from pci config space and then find the corresponding entries in 
+/* The following 3 unconfigure_boot_ routines deal with the case when we had the card
+ * upon bootup in the system, since we don't allocate func to such case, we need to read
+ * the start addresses from pci config space and then find the corresponding entries in
  * our resource lists.  The functions return either 0, -ENODEV, or -1 (general failure)
  * Change: we also call these functions even if we configured the card ourselves (i.e., not
  * the bootup case), since it should work same way
@@ -1561,8 +1561,8 @@ static int unconfigure_boot_card (struct slot *slot_cur)
  * unconfiguring the device
  * TO DO:  will probably need to add some code in case there was some resource,
  * to remove it... this is from when we have errors in the configure_card...
- * 			!!!!!!!!!!!!!!!!!!!!!!!!!FOR BUSES!!!!!!!!!!!!
- * Returns: 0, -1, -ENODEV 
+ *			!!!!!!!!!!!!!!!!!!!!!!!!!FOR BUSES!!!!!!!!!!!!
+ * Returns: 0, -1, -ENODEV
  */
 int ibmphp_unconfigure_card (struct slot **slot_cur, int the_end)
 {
@@ -1634,7 +1634,7 @@ int ibmphp_unconfigure_card (struct slot **slot_cur, int the_end)
  * Input: bus and the amount of resources needed (we know we can assign those,
  *        since they've been checked already
  * Output: bus added to the correct spot
- *         0, -1, error 
+ *         0, -1, error
  */
 static int add_new_bus (struct bus_node *bus, struct resource_node *io, struct resource_node *mem, struct resource_node *pfmem, u8 parent_busno)
 {
@@ -1650,7 +1650,7 @@ static int add_new_bus (struct bus_node *bus, struct resource_node *io, struct r
 			err ("strange, cannot find bus which is supposed to be at the system... something is terribly wrong...\n");
 			return -ENODEV;
 		}
-	
+
 		list_add (&bus->bus_list, &cur_bus->bus_list);
 	}
 	if (io) {
@@ -1679,7 +1679,7 @@ static int add_new_bus (struct bus_node *bus, struct resource_node *io, struct r
 	}
 	if (pfmem) {
 		pfmem_range = kzalloc(sizeof(*pfmem_range), GFP_KERNEL);
-		if (!pfmem_range) {	
+		if (!pfmem_range) {
 			err ("out of system memory\n");
 			return -ENOMEM;
 		}
@@ -1726,4 +1726,3 @@ static u8 find_sec_number (u8 primary_busno, u8 slotno)
 		return busno;
 	return 0xff;
 }
-
diff --git a/drivers/pci/hotplug/ibmphp_res.c b/drivers/pci/hotplug/ibmphp_res.c
index e2dc289f767c..a265acb2d518 100644
--- a/drivers/pci/hotplug/ibmphp_res.c
+++ b/drivers/pci/hotplug/ibmphp_res.c
@@ -72,7 +72,7 @@ static struct bus_node * __init alloc_error_bus (struct ebda_pci_rsrc * curr, u8
 static struct resource_node * __init alloc_resources (struct ebda_pci_rsrc * curr)
 {
 	struct resource_node *rs;
-	
+
 	if (!curr) {
 		err ("NULL passed to allocate\n");
 		return NULL;
@@ -128,7 +128,7 @@ static int __init alloc_bus_range (struct bus_node **new_bus, struct range_node
 	}
 	newrange->start = curr->start_addr;
 	newrange->end = curr->end_addr;
-		
+
 	if (first_bus || (!num_ranges))
 		newrange->rangeno = 1;
 	else {
@@ -162,7 +162,7 @@ static int __init alloc_bus_range (struct bus_node **new_bus, struct range_node
 			newbus->rangePFMem = newrange;
 			if (first_bus)
 				newbus->noPFMemRanges = 1;
-			else {	
+			else {
 				debug ("1st PFMemory Primary on Bus %x [%x - %x]\n", newbus->busno, newrange->start, newrange->end);
 				++newbus->noPFMemRanges;
 				fix_resources (newbus);
@@ -190,7 +190,7 @@ static int __init alloc_bus_range (struct bus_node **new_bus, struct range_node
  * This is the Resource Management initialization function.  It will go through
  * the Resource list taken from EBDA and fill in this module's data structures
  *
- * THIS IS NOT TAKING INTO CONSIDERATION IO RESTRICTIONS OF PRIMARY BUSES, 
+ * THIS IS NOT TAKING INTO CONSIDERATION IO RESTRICTIONS OF PRIMARY BUSES,
  * SINCE WE'RE GOING TO ASSUME FOR NOW WE DON'T HAVE THOSE ON OUR BUSES FOR NOW
  *
  * Input: ptr to the head of the resource list from EBDA
@@ -382,7 +382,7 @@ int __init ibmphp_rsrc_init (void)
  * pci devices' resources for the appropriate resource
  *
  * Input: type of the resource, range to add, current bus
- * Output: 0 or -1, bus and range ptrs 
+ * Output: 0 or -1, bus and range ptrs
  ********************************************************************************/
 static int add_bus_range (int type, struct range_node *range, struct bus_node *bus_cur)
 {
@@ -466,7 +466,7 @@ static void update_resources (struct bus_node *bus_cur, int type, int rangeno)
 
 	switch (type) {
 		case MEM:
-			if (bus_cur->firstMem) 
+			if (bus_cur->firstMem)
 				res = bus_cur->firstMem;
 			break;
 		case PFMEM:
@@ -583,7 +583,7 @@ static void fix_resources (struct bus_node *bus_cur)
 }
 
 /*******************************************************************************
- * This routine adds a resource to the list of resources to the appropriate bus 
+ * This routine adds a resource to the list of resources to the appropriate bus
  * based on their resource type and sorted by their starting addresses.  It assigns
  * the ptrs to next and nextRange if needed.
  *
@@ -605,11 +605,11 @@ int ibmphp_add_resource (struct resource_node *res)
 		err ("NULL passed to add\n");
 		return -ENODEV;
 	}
-	
+
 	bus_cur = find_bus_wprev (res->busno, NULL, 0);
-	
+
 	if (!bus_cur) {
-		/* didn't find a bus, smth's wrong!!! */
+		/* didn't find a bus, something's wrong!!! */
 		debug ("no bus in the system, either pci_dev's wrong or allocation failed\n");
 		return -ENODEV;
 	}
@@ -648,7 +648,7 @@ int ibmphp_add_resource (struct resource_node *res)
 	if (!range_cur) {
 		switch (res->type) {
 			case IO:
-				++bus_cur->needIOUpdate;					
+				++bus_cur->needIOUpdate;
 				break;
 			case MEM:
 				++bus_cur->needMemUpdate;
@@ -659,13 +659,13 @@ int ibmphp_add_resource (struct resource_node *res)
 		}
 		res->rangeno = -1;
 	}
-	
+
 	debug ("The range is %d\n", res->rangeno);
 	if (!res_start) {
 		/* no first{IO,Mem,Pfmem} on the bus, 1st IO/Mem/Pfmem resource ever */
 		switch (res->type) {
 			case IO:
-				bus_cur->firstIO = res;					
+				bus_cur->firstIO = res;
 				break;
 			case MEM:
 				bus_cur->firstMem = res;
@@ -673,7 +673,7 @@ int ibmphp_add_resource (struct resource_node *res)
 			case PFMEM:
 				bus_cur->firstPFMem = res;
 				break;
-		}	
+		}
 		res->next = NULL;
 		res->nextRange = NULL;
 	} else {
@@ -770,7 +770,7 @@ int ibmphp_add_resource (struct resource_node *res)
  * This routine will remove the resource from the list of resources
  *
  * Input: io, mem, and/or pfmem resource to be deleted
- * Ouput: modified resource list
+ * Output: modified resource list
  *        0 or error code
  ****************************************************************************/
 int ibmphp_remove_resource (struct resource_node *res)
@@ -825,7 +825,7 @@ int ibmphp_remove_resource (struct resource_node *res)
 
 	if (!res_cur) {
 		if (res->type == PFMEM) {
-			/* 
+			/*
 			 * case where pfmem might be in the PFMemFromMem list
 			 * so will also need to remove the corresponding mem
 			 * entry
@@ -961,12 +961,12 @@ static struct range_node * find_range (struct bus_node *bus_cur, struct resource
 }
 
 /*****************************************************************************
- * This routine will check to make sure the io/mem/pfmem->len that the device asked for 
+ * This routine will check to make sure the io/mem/pfmem->len that the device asked for
  * can fit w/i our list of available IO/MEM/PFMEM resources.  If cannot, returns -EINVAL,
  * otherwise, returns 0
  *
  * Input: resource
- * Ouput: the correct start and end address are inputted into the resource node,
+ * Output: the correct start and end address are inputted into the resource node,
  *        0 or -EINVAL
  *****************************************************************************/
 int ibmphp_check_resource (struct resource_node *res, u8 bridge)
@@ -996,7 +996,7 @@ int ibmphp_check_resource (struct resource_node *res, u8 bridge)
 	bus_cur = find_bus_wprev (res->busno, NULL, 0);
 
 	if (!bus_cur) {
-		/* didn't find a bus, smth's wrong!!! */
+		/* didn't find a bus, something's wrong!!! */
 		debug ("no bus in the system, either pci_dev's wrong or allocation failed\n");
 		return -EINVAL;
 	}
@@ -1066,7 +1066,7 @@ int ibmphp_check_resource (struct resource_node *res, u8 bridge)
 								break;
 						}
 					}
-			
+
 					if (flag && len_cur == res->len) {
 						debug ("but we are not here, right?\n");
 						res->start = start_cur;
@@ -1118,10 +1118,10 @@ int ibmphp_check_resource (struct resource_node *res, u8 bridge)
 		if (res_prev) {
 			if (res_prev->rangeno != res_cur->rangeno) {
 				/* 1st device on this range */
-				if ((res_cur->start != range->start) && 
+				if ((res_cur->start != range->start) &&
 					((len_tmp = res_cur->start - 1 - range->start) >= res->len)) {
 					if ((len_tmp < len_cur) || (len_cur == 0)) {
-						if ((range->start % tmp_divide) == 0) {	
+						if ((range->start % tmp_divide) == 0) {
 							/* just perfect, starting address is divisible by length */
 							flag = 1;
 							len_cur = len_tmp;
@@ -1344,7 +1344,7 @@ int ibmphp_check_resource (struct resource_node *res, u8 bridge)
  * This routine is called from remove_card if the card contained PPB.
  * It will remove all the resources on the bus as well as the bus itself
  * Input: Bus
- * Ouput: 0, -ENODEV
+ * Output: 0, -ENODEV
  ********************************************************************************/
 int ibmphp_remove_bus (struct bus_node *bus, u8 parent_busno)
 {
@@ -1353,7 +1353,7 @@ int ibmphp_remove_bus (struct bus_node *bus, u8 parent_busno)
 	struct bus_node *prev_bus;
 	int rc;
 
-	prev_bus = find_bus_wprev (parent_busno, NULL, 0);	
+	prev_bus = find_bus_wprev (parent_busno, NULL, 0);
 
 	if (!prev_bus) {
 		debug ("something terribly wrong. Cannot find parent bus to the one to remove\n");
@@ -1424,7 +1424,7 @@ int ibmphp_remove_bus (struct bus_node *bus, u8 parent_busno)
 }
 
 /******************************************************************************
- * This routine deletes the ranges from a given bus, and the entries from the 
+ * This routine deletes the ranges from a given bus, and the entries from the
  * parent's bus in the resources
  * Input: current bus, previous bus
  * Output: 0, -EINVAL
@@ -1453,7 +1453,7 @@ static int remove_ranges (struct bus_node *bus_cur, struct bus_node *bus_prev)
 	if (bus_cur->noMemRanges) {
 		range_cur = bus_cur->rangeMem;
 		for (i = 0; i < bus_cur->noMemRanges; i++) {
-			if (ibmphp_find_resource (bus_prev, range_cur->start, &res, MEM) < 0) 
+			if (ibmphp_find_resource (bus_prev, range_cur->start, &res, MEM) < 0)
 				return -EINVAL;
 
 			ibmphp_remove_resource (res);
@@ -1467,7 +1467,7 @@ static int remove_ranges (struct bus_node *bus_cur, struct bus_node *bus_prev)
 	if (bus_cur->noPFMemRanges) {
 		range_cur = bus_cur->rangePFMem;
 		for (i = 0; i < bus_cur->noPFMemRanges; i++) {
-			if (ibmphp_find_resource (bus_prev, range_cur->start, &res, PFMEM) < 0) 
+			if (ibmphp_find_resource (bus_prev, range_cur->start, &res, PFMEM) < 0)
 				return -EINVAL;
 
 			ibmphp_remove_resource (res);
@@ -1482,7 +1482,7 @@ static int remove_ranges (struct bus_node *bus_cur, struct bus_node *bus_prev)
 }
 
 /*
- * find the resource node in the bus 
+ * find the resource node in the bus
  * Input: Resource needed, start address of the resource, type of resource
  */
 int ibmphp_find_resource (struct bus_node *bus, u32 start_address, struct resource_node **res, int flag)
@@ -1512,7 +1512,7 @@ int ibmphp_find_resource (struct bus_node *bus, u32 start_address, struct resour
 			err ("wrong type of flag\n");
 			return -EINVAL;
 	}
-	
+
 	while (res_cur) {
 		if (res_cur->start == start_address) {
 			*res = res_cur;
@@ -1718,7 +1718,7 @@ static int __init once_over (void)
 			}	/* end for pfmem */
 		}	/* end if */
 	}	/* end list_for_each bus */
-	return 0; 
+	return 0;
 }
 
 int ibmphp_add_pfmem_from_mem (struct resource_node *pfmem)
@@ -1760,9 +1760,9 @@ static struct bus_node *find_bus_wprev (u8 bus_number, struct bus_node **prev, u
 	list_for_each (tmp, &gbuses) {
 		tmp_prev = tmp->prev;
 		bus_cur = list_entry (tmp, struct bus_node, bus_list);
-		if (flag) 
+		if (flag)
 			*prev = list_entry (tmp_prev, struct bus_node, bus_list);
-		if (bus_cur->busno == bus_number) 
+		if (bus_cur->busno == bus_number)
 			return bus_cur;
 	}
 
@@ -1776,7 +1776,7 @@ void ibmphp_print_test (void)
 	struct range_node *range;
 	struct resource_node *res;
 	struct list_head *tmp;
-	
+
 	debug_pci ("*****************START**********************\n");
 
 	if ((!list_empty(&gbuses)) && flags) {
@@ -1906,7 +1906,7 @@ static int range_exists_already (struct range_node * range, struct bus_node * bu
 			return 1;
 		range_cur = range_cur->next;
 	}
-	
+
 	return 0;
 }
 
@@ -1920,7 +1920,7 @@ static int range_exists_already (struct range_node * range, struct bus_node * bu
  * Returns: none
  * Note: this function doesn't take into account IO restrictions etc,
  *	 so will only work for bridges with no video/ISA devices behind them It
- *	 also will not work for onboard PPB's that can have more than 1 *bus
+ *	 also will not work for onboard PPBs that can have more than 1 *bus
  *	 behind them All these are TO DO.
  *	 Also need to add more error checkings... (from fnc returns etc)
  */
@@ -1963,7 +1963,7 @@ static int __init update_bridge_ranges (struct bus_node **bus)
 					case PCI_HEADER_TYPE_BRIDGE:
 						function = 0x8;
 					case PCI_HEADER_TYPE_MULTIBRIDGE:
-						/* We assume here that only 1 bus behind the bridge 
+						/* We assume here that only 1 bus behind the bridge
 						   TO DO: add functionality for several:
 						   temp = secondary;
 						   while (temp < subordinate) {
@@ -1972,7 +1972,7 @@ static int __init update_bridge_ranges (struct bus_node **bus)
 						   }
 						 */
 						pci_bus_read_config_byte (ibmphp_pci_bus, devfn, PCI_SECONDARY_BUS, &sec_busno);
-						bus_sec = find_bus_wprev (sec_busno, NULL, 0); 
+						bus_sec = find_bus_wprev (sec_busno, NULL, 0);
 						/* this bus structure doesn't exist yet, PPB was configured during previous loading of ibmphp */
 						if (!bus_sec) {
 							bus_sec = alloc_error_bus (NULL, sec_busno, 1);
@@ -2028,7 +2028,7 @@ static int __init update_bridge_ranges (struct bus_node **bus)
 								io->len = io->end - io->start + 1;
 								ibmphp_add_resource (io);
 							}
-						}	
+						}
 
 						pci_bus_read_config_word (ibmphp_pci_bus, devfn, PCI_MEMORY_BASE, &start_mem_address);
 						pci_bus_read_config_word (ibmphp_pci_bus, devfn, PCI_MEMORY_LIMIT, &end_mem_address);
diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c
index ec20f74c8981..cfa92a984e62 100644
--- a/drivers/pci/hotplug/pci_hotplug_core.c
+++ b/drivers/pci/hotplug/pci_hotplug_core.c
@@ -131,7 +131,7 @@ static ssize_t power_write_file(struct pci_slot *pci_slot, const char *buf,
 	}
 	module_put(slot->ops->owner);
 
-exit:	
+exit:
 	if (retval)
 		return retval;
 	return count;
@@ -177,7 +177,7 @@ static ssize_t attention_write_file(struct pci_slot *slot, const char *buf,
 		retval = ops->set_attention_status(slot->hotplug, attention);
 	module_put(ops->owner);
 
-exit:	
+exit:
 	if (retval)
 		return retval;
 	return count;
@@ -247,7 +247,7 @@ static ssize_t test_write_file(struct pci_slot *pci_slot, const char *buf,
 		retval = slot->ops->hardware_test(slot, test);
 	module_put(slot->ops->owner);
 
-exit:	
+exit:
 	if (retval)
 		return retval;
 	return count;
@@ -512,7 +512,7 @@ int pci_hp_deregister(struct hotplug_slot *hotplug)
  * @hotplug: pointer to the slot whose info has changed
  * @info: pointer to the info copy into the slot's info structure
  *
- * @slot must have been registered with the pci 
+ * @slot must have been registered with the pci
  * hotplug subsystem previously with a call to pci_hp_register().
  *
  * Returns 0 if successful, anything else for an error.
diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 541bbe6d5343..21e865ded1dc 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -180,5 +180,5 @@ static inline int pciehp_acpi_slot_detection_check(struct pci_dev *dev)
 {
 	return 0;
 }
-#endif 				/* CONFIG_ACPI */
+#endif				/* CONFIG_ACPI */
 #endif				/* _PCIEHP_H */
diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c
index ead7c534095e..55ea2a07f266 100644
--- a/drivers/pci/hotplug/pciehp_acpi.c
+++ b/drivers/pci/hotplug/pciehp_acpi.c
@@ -78,7 +78,7 @@ static int __initdata dup_slot_id;
 static int __initdata acpi_slot_detected;
 static struct list_head __initdata dummy_slots = LIST_HEAD_INIT(dummy_slots);
 
-/* Dummy driver for dumplicate name detection */
+/* Dummy driver for duplicate name detection */
 static int __init dummy_probe(struct pcie_device *dev)
 {
 	u32 slot_cap;
diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index f4a18f51a29c..bbd48bbe4e9b 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -351,8 +351,8 @@ static int __init pcied_init(void)
 
 	pciehp_firmware_init();
 	retval = pcie_port_service_register(&hpdriver_portdrv);
- 	dbg("pcie_port_service_register = %d\n", retval);
-  	info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
+	dbg("pcie_port_service_register = %d\n", retval);
+	info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 	if (retval)
 		dbg("Failure to register service\n");
 
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 51f56ef4ab6f..3eea3fdd4b0b 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -92,7 +92,7 @@ static void start_int_poll_timer(struct controller *ctrl, int sec)
 {
 	/* Clamp to sane value */
 	if ((sec <= 0) || (sec > 60))
-        	sec = 2;
+		sec = 2;
 
 	ctrl->poll_timer.function = &int_poll_timeout;
 	ctrl->poll_timer.data = (unsigned long)ctrl;
@@ -194,7 +194,7 @@ static int pcie_write_cmd(struct controller *ctrl, u16 cmd, u16 mask)
 			ctrl_dbg(ctrl, "CMD_COMPLETED not clear after 1 sec\n");
 		} else if (!NO_CMD_CMPL(ctrl)) {
 			/*
-			 * This controller semms to notify of command completed
+			 * This controller seems to notify of command completed
 			 * event even though it supports none of power
 			 * controller, attention led, power led and EMI.
 			 */
@@ -926,7 +926,7 @@ struct controller *pcie_init(struct pcie_device *dev)
 	if (pciehp_writew(ctrl, PCI_EXP_SLTSTA, 0x1f))
 		goto abort_ctrl;
 
-	/* Disable sotfware notification */
+	/* Disable software notification */
 	pcie_disable_notification(ctrl);
 
 	ctrl_info(ctrl, "HPC vendor_id %x device_id %x ss_vid %x ss_did %x\n",
diff --git a/drivers/pci/hotplug/pcihp_skeleton.c b/drivers/pci/hotplug/pcihp_skeleton.c
index 1f00b937f721..ac69094e4b20 100644
--- a/drivers/pci/hotplug/pcihp_skeleton.c
+++ b/drivers/pci/hotplug/pcihp_skeleton.c
@@ -52,7 +52,7 @@ static LIST_HEAD(slot_list);
 	do {							\
 		if (debug)					\
 			printk (KERN_DEBUG "%s: " format "\n",	\
-				MY_NAME , ## arg); 		\
+				MY_NAME , ## arg);		\
 	} while (0)
 #define err(format, arg...) printk(KERN_ERR "%s: " format "\n", MY_NAME , ## arg)
 #define info(format, arg...) printk(KERN_INFO "%s: " format "\n", MY_NAME , ## arg)
@@ -287,7 +287,7 @@ static int __init init_slots(void)
 		hotplug_slot->release = &release_slot;
 		make_slot_name(slot);
 		hotplug_slot->ops = &skel_hotplug_slot_ops;
-		
+
 		/*
 		 * Initialize the slot info structure with some known
 		 * good values.
@@ -296,7 +296,7 @@ static int __init init_slots(void)
 		get_attention_status(hotplug_slot, &info->attention_status);
 		get_latch_status(hotplug_slot, &info->latch_status);
 		get_adapter_status(hotplug_slot, &info->adapter_status);
-		
+
 		dbg("registering slot %d\n", i);
 		retval = pci_hp_register(slot->hotplug_slot);
 		if (retval) {
@@ -336,7 +336,7 @@ static void __exit cleanup_slots(void)
 		pci_hp_deregister(slot->hotplug_slot);
 	}
 }
-		
+
 static int __init pcihp_skel_init(void)
 {
 	int retval;
diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c
index bb7af78e4eed..e9c044d15add 100644
--- a/drivers/pci/hotplug/rpadlpar_core.c
+++ b/drivers/pci/hotplug/rpadlpar_core.c
@@ -217,7 +217,7 @@ static int dlpar_remove_phb(char *drc_name, struct device_node *dn)
 	if (!pcibios_find_pci_bus(dn))
 		return -EINVAL;
 
-	/* If pci slot is hotplugable, use hotplug to remove it */
+	/* If pci slot is hotpluggable, use hotplug to remove it */
 	slot = find_php_slot(dn);
 	if (slot && rpaphp_deregister_slot(slot)) {
 		printk(KERN_ERR "%s: unable to remove hotplug slot %s\n",
diff --git a/drivers/pci/hotplug/rpaphp.h b/drivers/pci/hotplug/rpaphp.h
index 3135856e5e1c..b2593e876a09 100644
--- a/drivers/pci/hotplug/rpaphp.h
+++ b/drivers/pci/hotplug/rpaphp.h
@@ -49,9 +49,9 @@
 extern bool rpaphp_debug;
 #define dbg(format, arg...)					\
 	do {							\
-		if (rpaphp_debug)					\
+		if (rpaphp_debug)				\
 			printk(KERN_DEBUG "%s: " format,	\
-				MY_NAME , ## arg); 		\
+				MY_NAME , ## arg);		\
 	} while (0)
 #define err(format, arg...) printk(KERN_ERR "%s: " format, MY_NAME , ## arg)
 #define info(format, arg...) printk(KERN_INFO "%s: " format, MY_NAME , ## arg)
@@ -99,5 +99,5 @@ void dealloc_slot_struct(struct slot *slot);
 struct slot *alloc_slot_struct(struct device_node *dn, int drc_index, char *drc_name, int power_domain);
 int rpaphp_register_slot(struct slot *slot);
 int rpaphp_deregister_slot(struct slot *slot);
-	
+
 #endif				/* _PPC64PHP_H */
diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c
index 127d6e600185..b7fc5c9255a5 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -226,7 +226,7 @@ int rpaphp_get_drc_props(struct device_node *dn, int *drc_index,
 	for (i = 0; i < indexes[0]; i++) {
 		if ((unsigned int) indexes[i + 1] == *my_index) {
 			if (drc_name)
-                		*drc_name = name_tmp;
+				*drc_name = name_tmp;
 			if (drc_type)
 				*drc_type = type_tmp;
 			if (drc_index)
@@ -289,7 +289,7 @@ static int is_php_dn(struct device_node *dn, const int **indexes,
  * rpaphp_add_slot -- declare a hotplug slot to the hotplug subsystem.
  * @dn: device node of slot
  *
- * This subroutine will register a hotplugable slot with the
+ * This subroutine will register a hotpluggable slot with the
  * PCI hotplug infrastructure. This routine is typically called
  * during boot time, if the hotplug slots are present at boot time,
  * or is called later, by the dlpar add code, if the slot is
@@ -328,7 +328,7 @@ int rpaphp_add_slot(struct device_node *dn)
 			return -ENOMEM;
 
 		slot->type = simple_strtoul(type, NULL, 10);
-				
+
 		dbg("Found drc-index:0x%x drc-name:%s drc-type:%s\n",
 				indexes[i + 1], name, type);
 
@@ -356,7 +356,7 @@ static void __exit cleanup_slots(void)
 	/*
 	 * Unregister all of our slots with the pci_hotplug subsystem,
 	 * and free up all memory that we had allocated.
-	 * memory will be freed in release_slot callback. 
+	 * memory will be freed in release_slot callback.
 	 */
 
 	list_for_each_safe(tmp, n, &rpaphp_slot_head) {
diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c
index 513e1e282391..9243f3e7a1c9 100644
--- a/drivers/pci/hotplug/rpaphp_pci.c
+++ b/drivers/pci/hotplug/rpaphp_pci.c
@@ -44,7 +44,7 @@ int rpaphp_get_sensor_state(struct slot *slot, int *state)
 			dbg("%s: slot must be power up to get sensor-state\n",
 			    __func__);
 
-			/* some slots have to be powered up 
+			/* some slots have to be powered up
 			 * before get-sensor will succeed.
 			 */
 			rc = rtas_set_power_level(slot->power_domain, POWER_ON,
@@ -133,4 +133,3 @@ int rpaphp_enable_slot(struct slot *slot)
 
 	return 0;
 }
-
diff --git a/drivers/pci/hotplug/rpaphp_slot.c b/drivers/pci/hotplug/rpaphp_slot.c
index b283bbea6d24..a6082cc263f7 100644
--- a/drivers/pci/hotplug/rpaphp_slot.c
+++ b/drivers/pci/hotplug/rpaphp_slot.c
@@ -1,5 +1,5 @@
 /*
- * RPA Virtual I/O device functions 
+ * RPA Virtual I/O device functions
  * Copyright (C) 2004 Linda Xie <lxie@us.ibm.com>
  *
  * All rights reserved.
@@ -51,27 +51,27 @@ struct slot *alloc_slot_struct(struct device_node *dn,
                        int drc_index, char *drc_name, int power_domain)
 {
 	struct slot *slot;
-	
+
 	slot = kzalloc(sizeof(struct slot), GFP_KERNEL);
 	if (!slot)
 		goto error_nomem;
 	slot->hotplug_slot = kzalloc(sizeof(struct hotplug_slot), GFP_KERNEL);
 	if (!slot->hotplug_slot)
-		goto error_slot;	
+		goto error_slot;
 	slot->hotplug_slot->info = kzalloc(sizeof(struct hotplug_slot_info),
 					   GFP_KERNEL);
 	if (!slot->hotplug_slot->info)
 		goto error_hpslot;
 	slot->name = kstrdup(drc_name, GFP_KERNEL);
 	if (!slot->name)
-		goto error_info;	
+		goto error_info;
 	slot->dn = dn;
 	slot->index = drc_index;
 	slot->power_domain = power_domain;
 	slot->hotplug_slot->private = slot;
 	slot->hotplug_slot->ops = &rpaphp_hotplug_slot_ops;
 	slot->hotplug_slot->release = &rpaphp_release_slot;
-	
+
 	return (slot);
 
 error_info:
@@ -91,7 +91,7 @@ static int is_registered(struct slot *slot)
 	list_for_each_entry(tmp_slot, &rpaphp_slot_head, rpaphp_slot_list) {
 		if (!strcmp(tmp_slot->name, slot->name))
 			return 1;
-	}	
+	}
 	return 0;
 }
 
@@ -104,7 +104,7 @@ int rpaphp_deregister_slot(struct slot *slot)
 		__func__, slot->name);
 
 	list_del(&slot->rpaphp_slot_list);
-	
+
 	retval = pci_hp_deregister(php_slot);
 	if (retval)
 		err("Problem unregistering a slot %s\n", slot->name);
@@ -120,7 +120,7 @@ int rpaphp_register_slot(struct slot *slot)
 	int retval;
 	int slotno;
 
-	dbg("%s registering slot:path[%s] index[%x], name[%s] pdomain[%x] type[%d]\n", 
+	dbg("%s registering slot:path[%s] index[%x], name[%s] pdomain[%x] type[%d]\n",
 		__func__, slot->dn->full_name, slot->index, slot->name,
 		slot->power_domain, slot->type);
 
@@ -128,7 +128,7 @@ int rpaphp_register_slot(struct slot *slot)
 	if (is_registered(slot)) {
 		err("rpaphp_register_slot: slot[%s] is already registered\n", slot->name);
 		return -EAGAIN;
-	}	
+	}
 
 	if (slot->dn->child)
 		slotno = PCI_SLOT(PCI_DN(slot->dn->child)->devfn);
@@ -145,4 +145,3 @@ int rpaphp_register_slot(struct slot *slot)
 	info("Slot [%s] registered\n", slot->name);
 	return 0;
 }
-
diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h
index d876e4b3c6a9..61529097464d 100644
--- a/drivers/pci/hotplug/shpchp.h
+++ b/drivers/pci/hotplug/shpchp.h
@@ -216,13 +216,13 @@ struct ctrl_reg {
 
 /* offsets to the controller registers based on the above structure layout */
 enum ctrl_offsets {
-	BASE_OFFSET 	 = offsetof(struct ctrl_reg, base_offset),
-	SLOT_AVAIL1 	 = offsetof(struct ctrl_reg, slot_avail1),
+	BASE_OFFSET	 = offsetof(struct ctrl_reg, base_offset),
+	SLOT_AVAIL1	 = offsetof(struct ctrl_reg, slot_avail1),
 	SLOT_AVAIL2	 = offsetof(struct ctrl_reg, slot_avail2),
-	SLOT_CONFIG 	 = offsetof(struct ctrl_reg, slot_config),
+	SLOT_CONFIG	 = offsetof(struct ctrl_reg, slot_config),
 	SEC_BUS_CONFIG	 = offsetof(struct ctrl_reg, sec_bus_config),
 	MSI_CTRL	 = offsetof(struct ctrl_reg, msi_ctrl),
-	PROG_INTERFACE 	 = offsetof(struct ctrl_reg, prog_interface),
+	PROG_INTERFACE	 = offsetof(struct ctrl_reg, prog_interface),
 	CMD		 = offsetof(struct ctrl_reg, cmd),
 	CMD_STATUS	 = offsetof(struct ctrl_reg, cmd_status),
 	INTR_LOC	 = offsetof(struct ctrl_reg, intr_loc),
diff --git a/drivers/pci/hotplug/shpchp_core.c b/drivers/pci/hotplug/shpchp_core.c
index d3f757df691c..faf13abd5b99 100644
--- a/drivers/pci/hotplug/shpchp_core.c
+++ b/drivers/pci/hotplug/shpchp_core.c
@@ -143,11 +143,11 @@ static int init_slots(struct controller *ctrl)
 		snprintf(name, SLOT_NAME_SIZE, "%d", slot->number);
 		hotplug_slot->ops = &shpchp_hotplug_slot_ops;
 
- 		ctrl_dbg(ctrl, "Registering domain:bus:dev=%04x:%02x:%02x "
- 			 "hp_slot=%x sun=%x slot_device_offset=%x\n",
- 			 pci_domain_nr(ctrl->pci_dev->subordinate),
- 			 slot->bus, slot->device, slot->hp_slot, slot->number,
- 			 ctrl->slot_device_offset);
+		ctrl_dbg(ctrl, "Registering domain:bus:dev=%04x:%02x:%02x "
+			 "hp_slot=%x sun=%x slot_device_offset=%x\n",
+			 pci_domain_nr(ctrl->pci_dev->subordinate),
+			 slot->bus, slot->device, slot->hp_slot, slot->number,
+			 ctrl->slot_device_offset);
 		retval = pci_hp_register(slot->hotplug_slot,
 				ctrl->pci_dev->subordinate, slot->device, name);
 		if (retval) {
diff --git a/drivers/pci/hotplug/shpchp_hpc.c b/drivers/pci/hotplug/shpchp_hpc.c
index 75ba2311b54f..2d7f474ca0ec 100644
--- a/drivers/pci/hotplug/shpchp_hpc.c
+++ b/drivers/pci/hotplug/shpchp_hpc.c
@@ -116,7 +116,7 @@
 #define SLOT_REG_RSVDZ_MASK	((1 << 15) | (7 << 21))
 
 /*
- * SHPC Command Code definitnions
+ * SHPC Command Code definitions
  *
  *     Slot Operation				00h - 3Fh
  *     Set Bus Segment Speed/Mode A		40h - 47h
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 21a7182dccd4..1fe2d6fb19d5 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -610,7 +610,7 @@ resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno)
 	struct resource tmp;
 	enum pci_bar_type type;
 	int reg = pci_iov_resource_bar(dev, resno, &type);
-	
+
 	if (!reg)
 		return 0;
 
diff --git a/drivers/pci/irq.c b/drivers/pci/irq.c
index b008cf86b9c3..6684f153ab57 100644
--- a/drivers/pci/irq.c
+++ b/drivers/pci/irq.c
@@ -25,7 +25,7 @@ static void pci_note_irq_problem(struct pci_dev *pdev, const char *reason)
 /**
  * pci_lost_interrupt - reports a lost PCI interrupt
  * @pdev:	device whose interrupt is lost
- * 
+ *
  * The primary function of this routine is to report a lost interrupt
  * in a standard way which users can recognise (instead of blaming the
  * driver).
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 5e63645a7abe..3fcd67a16677 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -784,7 +784,7 @@ error:
  * @nvec: how many MSIs have been requested ?
  * @type: are we checking for MSI or MSI-X ?
  *
- * Look at global flags, the device itself, and its parent busses
+ * Look at global flags, the device itself, and its parent buses
  * to determine if MSI/-X are supported for the device. If MSI/-X is
  * supported return 0, else return an error code.
  **/
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index dfd1f59de729..92a49c2b1fda 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -141,7 +141,7 @@ phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle)
  * if (_PRW at S-state x)
  *	choose from highest power _SxD to lowest power _SxW
  * else // no _PRW at S-state x
- * 	choose highest power _SxD or any lower power
+ *	choose highest power _SxD or any lower power
  */
 
 static pci_power_t acpi_pci_choose_state(struct pci_dev *pdev)
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 454853507b7e..9042fdbd7244 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -312,7 +312,7 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
  * __pci_device_probe - check if a driver wants to claim a specific PCI device
  * @drv: driver to call to check if it wants the PCI device
  * @pci_dev: PCI device being probed
- * 
+ *
  * returns 0 on success, else error.
  * side-effect: pci_dev->driver is set to drv when drv claims pci_dev.
  */
@@ -378,7 +378,7 @@ static int pci_device_remove(struct device * dev)
 	 * We would love to complain here if pci_dev->is_enabled is set, that
 	 * the driver should have called pci_disable_device(), but the
 	 * unfortunate fact is there are too many odd BIOS and bridge setups
-	 * that don't like drivers doing that all of the time.  
+	 * that don't like drivers doing that all of the time.
 	 * Oh well, we can dream of sane hardware when we sleep, no matter how
 	 * horrible the crap we have to deal with is when we are awake...
 	 */
@@ -1156,10 +1156,10 @@ static const struct dev_pm_ops pci_dev_pm_ops = {
  * @drv: the driver structure to register
  * @owner: owner module of drv
  * @mod_name: module name string
- * 
+ *
  * Adds the driver structure to the list of registered drivers.
- * Returns a negative value on error, otherwise 0. 
- * If no error occurred, the driver remains registered even if 
+ * Returns a negative value on error, otherwise 0.
+ * If no error occurred, the driver remains registered even if
  * no device was claimed during registration.
  */
 int __pci_register_driver(struct pci_driver *drv, struct module *owner,
@@ -1181,7 +1181,7 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner,
 /**
  * pci_unregister_driver - unregister a pci driver
  * @drv: the driver structure to unregister
- * 
+ *
  * Deletes the driver structure from the list of registered PCI drivers,
  * gives it a chance to clean up by calling its remove() function for
  * each device it was responsible for, and marks those devices as
@@ -1203,7 +1203,7 @@ static struct pci_driver pci_compat_driver = {
  * pci_dev_driver - get the pci_driver of a device
  * @dev: the device to query
  *
- * Returns the appropriate pci_driver structure or %NULL if there is no 
+ * Returns the appropriate pci_driver structure or %NULL if there is no
  * registered driver for the device.
  */
 struct pci_driver *
@@ -1224,7 +1224,7 @@ pci_dev_driver(const struct pci_dev *dev)
  * pci_bus_match - Tell if a PCI device structure has a matching PCI device id structure
  * @dev: the PCI device structure to match against
  * @drv: the device driver to search for matching PCI device id structures
- * 
+ *
  * Used by a driver to check whether a PCI device present in the
  * system is in its list of supported devices. Returns the matching
  * pci_device_id structure or %NULL if there is no match.
diff --git a/drivers/pci/pci-stub.c b/drivers/pci/pci-stub.c
index 6e47c519c510..2ff77509d8e5 100644
--- a/drivers/pci/pci-stub.c
+++ b/drivers/pci/pci-stub.c
@@ -2,13 +2,13 @@
  *
  * Copyright (C) 2008 Red Hat, Inc.
  * Author:
- * 	Chris Wright
+ *	Chris Wright
  *
  * This work is licensed under the terms of the GNU GPL, version 2.
  *
  * Usage is simple, allocate a new id to the stub driver and bind the
  * device to it.  For example:
- * 
+ *
  * # echo "8086 10f5" > /sys/bus/pci/drivers/pci-stub/new_id
  * # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/e1000e/unbind
  * # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/pci-stub/bind
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 2aaa83c85a4e..c91e6c18debc 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -10,7 +10,7 @@
  *
  * File attributes for PCI devices
  *
- * Modeled after usb's driverfs.c 
+ * Modeled after usb's driverfs.c
  *
  */
 
@@ -270,13 +270,17 @@ msi_bus_store(struct device *dev, struct device_attribute *attr,
 	if (kstrtoul(buf, 0, &val) < 0)
 		return -EINVAL;
 
-	/* bad things may happen if the no_msi flag is changed
-	 * while some drivers are loaded */
+	/*
+	 * Bad things may happen if the no_msi flag is changed
+	 * while drivers are loaded.
+	 */
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	/* Maybe pci devices without subordinate busses shouldn't even have this
-	 * attribute in the first place?  */
+	/*
+	 * Maybe devices without subordinate buses shouldn't have this
+	 * attribute in the first place?
+	 */
 	if (!pdev->subordinate)
 		return count;
 
@@ -670,7 +674,7 @@ pci_write_config(struct file* filp, struct kobject *kobj,
 		size = dev->cfg_size - off;
 		count = size;
 	}
-	
+
 	pci_config_pm_runtime_get(dev);
 
 	if ((off & 1) && size) {
@@ -678,7 +682,7 @@ pci_write_config(struct file* filp, struct kobject *kobj,
 		off++;
 		size--;
 	}
-	
+
 	if ((off & 3) && size > 2) {
 		u16 val = data[off - init_off];
 		val |= (u16) data[off - init_off + 1] << 8;
@@ -696,7 +700,7 @@ pci_write_config(struct file* filp, struct kobject *kobj,
 		off += 4;
 		size -= 4;
 	}
-	
+
 	if (size >= 2) {
 		u16 val = data[off - init_off];
 		val |= (u16) data[off - init_off + 1] << 8;
@@ -1229,21 +1233,21 @@ pci_read_rom(struct file *filp, struct kobject *kobj,
 
 	if (!pdev->rom_attr_enabled)
 		return -EINVAL;
-	
+
 	rom = pci_map_rom(pdev, &size);	/* size starts out as PCI window size */
 	if (!rom || !size)
 		return -EIO;
-		
+
 	if (off >= size)
 		count = 0;
 	else {
 		if (off + count > size)
 			count = size - off;
-		
+
 		memcpy_fromio(buf, rom + off, count);
 	}
 	pci_unmap_rom(pdev, rom);
-		
+
 	return count;
 }
 
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b127fbda6fc8..33120d156668 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -198,7 +198,7 @@ static int __pci_bus_find_cap_start(struct pci_bus *bus,
 }
 
 /**
- * pci_find_capability - query for devices' capabilities 
+ * pci_find_capability - query for devices' capabilities
  * @dev: PCI device to query
  * @cap: capability code
  *
@@ -207,12 +207,12 @@ static int __pci_bus_find_cap_start(struct pci_bus *bus,
  * device's PCI configuration space or 0 in case the device does not
  * support it.  Possible values for @cap:
  *
- *  %PCI_CAP_ID_PM           Power Management 
- *  %PCI_CAP_ID_AGP          Accelerated Graphics Port 
- *  %PCI_CAP_ID_VPD          Vital Product Data 
- *  %PCI_CAP_ID_SLOTID       Slot Identification 
+ *  %PCI_CAP_ID_PM           Power Management
+ *  %PCI_CAP_ID_AGP          Accelerated Graphics Port
+ *  %PCI_CAP_ID_VPD          Vital Product Data
+ *  %PCI_CAP_ID_SLOTID       Slot Identification
  *  %PCI_CAP_ID_MSI          Message Signalled Interrupts
- *  %PCI_CAP_ID_CHSWP        CompactPCI HotSwap 
+ *  %PCI_CAP_ID_CHSWP        CompactPCI HotSwap
  *  %PCI_CAP_ID_PCIX         PCI-X
  *  %PCI_CAP_ID_EXP          PCI Express
  */
@@ -228,13 +228,13 @@ int pci_find_capability(struct pci_dev *dev, int cap)
 }
 
 /**
- * pci_bus_find_capability - query for devices' capabilities 
+ * pci_bus_find_capability - query for devices' capabilities
  * @bus:   the PCI bus to query
  * @devfn: PCI device to query
  * @cap:   capability code
  *
  * Like pci_find_capability() but works for pci devices that do not have a
- * pci_dev structure set up yet. 
+ * pci_dev structure set up yet.
  *
  * Returns the address of the requested capability structure within the
  * device's PCI configuration space or 0 in case the device does not
@@ -515,7 +515,7 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
 		return -EINVAL;
 
 	/* Validate current state:
-	 * Can enter D0 from any state, but if we can only go deeper 
+	 * Can enter D0 from any state, but if we can only go deeper
 	 * to sleep if we're already in a low power state
 	 */
 	if (state != PCI_D0 && dev->current_state <= PCI_D3cold
@@ -998,7 +998,7 @@ static void pci_restore_config_space(struct pci_dev *pdev)
 	}
 }
 
-/** 
+/**
  * pci_restore_state - Restore the saved state of a PCI device
  * @dev: - PCI device that we're dealing with
  */
@@ -1030,7 +1030,7 @@ struct pci_saved_state {
  *			   the device saved state.
  * @dev: PCI device that we're dealing with
  *
- * Rerturn NULL if no state or error.
+ * Return NULL if no state or error.
  */
 struct pci_saved_state *pci_store_saved_state(struct pci_dev *dev)
 {
@@ -1880,7 +1880,7 @@ int pci_finish_runtime_suspend(struct pci_dev *dev)
  * pci_dev_run_wake - Check if device can generate run-time wake-up events.
  * @dev: Device to check.
  *
- * Return true if the device itself is cabable of generating wake-up events
+ * Return true if the device itself is capable of generating wake-up events
  * (through the platform or using the native PCIe PME) or if the device supports
  * PME and one of its upstream bridges can generate wake-up events.
  */
@@ -2447,7 +2447,7 @@ bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags)
 	switch (pci_pcie_type(pdev)) {
 	/*
 	 * PCI/X-to-PCIe bridges are not specifically mentioned by the spec,
-	 * but since their primary inteface is PCI/X, we conservatively
+	 * but since their primary interface is PCI/X, we conservatively
 	 * handle them as we would a non-PCIe device.
 	 */
 	case PCI_EXP_TYPE_PCIE_BRIDGE:
@@ -2471,7 +2471,7 @@ bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags)
 	/*
 	 * PCIe 3.0, 6.12.1.2 specifies ACS capabilities that should be
 	 * implemented by the remaining PCIe types to indicate peer-to-peer
-	 * capabilities, but only when they are part of a multifunciton
+	 * capabilities, but only when they are part of a multifunction
 	 * device.  The footnote for section 6.12 indicates the specific
 	 * PCIe types included here.
 	 */
@@ -2486,7 +2486,7 @@ bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags)
 	}
 
 	/*
-	 * PCIe 3.0, 6.12.1.3 specifies no ACS capabilties are applicable
+	 * PCIe 3.0, 6.12.1.3 specifies no ACS capabilities are applicable
 	 * to single function devices with the exception of downstream ports.
 	 */
 	return true;
@@ -2622,7 +2622,7 @@ void pci_release_region(struct pci_dev *pdev, int bar)
  *
  *	If @exclusive is set, then the region is marked so that userspace
  *	is explicitly not allowed to map the resource via /dev/mem or
- * 	sysfs MMIO access.
+ *	sysfs MMIO access.
  *
  *	Returns 0 on success, or %EBUSY on error.  A warning
  *	message is also printed on failure.
@@ -2634,7 +2634,7 @@ static int __pci_request_region(struct pci_dev *pdev, int bar, const char *res_n
 
 	if (pci_resource_len(pdev, bar) == 0)
 		return 0;
-		
+
 	if (pci_resource_flags(pdev, bar) & IORESOURCE_IO) {
 		if (!request_region(pci_resource_start(pdev, bar),
 			    pci_resource_len(pdev, bar), res_name))
@@ -2694,7 +2694,7 @@ int pci_request_region(struct pci_dev *pdev, int bar, const char *res_name)
  *
  *	The key difference that _exclusive makes it that userspace is
  *	explicitly not allowed to map the resource via /dev/mem or
- * 	sysfs.
+ *	sysfs.
  */
 int pci_request_region_exclusive(struct pci_dev *pdev, int bar, const char *res_name)
 {
@@ -2799,7 +2799,7 @@ int pci_request_regions(struct pci_dev *pdev, const char *res_name)
  *	successfully.
  *
  *	pci_request_regions_exclusive() will mark the region so that
- * 	/dev/mem and the sysfs MMIO access will not be allowed.
+ *	/dev/mem and the sysfs MMIO access will not be allowed.
  *
  *	Returns 0 on success, or %EBUSY on error.  A warning
  *	message is also printed on failure.
@@ -2967,7 +2967,7 @@ pci_set_mwi(struct pci_dev *dev)
 		cmd |= PCI_COMMAND_INVALIDATE;
 		pci_write_config_word(dev, PCI_COMMAND, cmd);
 	}
-	
+
 	return 0;
 }
 
@@ -3292,7 +3292,7 @@ clear:
  *
  * NOTE: This causes the caller to sleep for twice the device power transition
  * cooldown period, which for the D0->D3hot and D3hot->D0 transitions is 10 ms
- * by devault (i.e. unless the @dev's d3_delay field has a different value).
+ * by default (i.e. unless the @dev's d3_delay field has a different value).
  * Moreover, only devices in D0 can be reset by this function.
  */
 static int pci_pm_reset(struct pci_dev *dev, int probe)
@@ -3341,7 +3341,7 @@ void pci_reset_bridge_secondary_bus(struct pci_dev *dev)
 	pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
 	/*
 	 * PCI spec v3.0 7.6.4.2 requires minimum Trst of 1ms.  Double
-	 * this to 2ms to ensure that we meet the minium requirement.
+	 * this to 2ms to ensure that we meet the minimum requirement.
 	 */
 	msleep(2);
 
@@ -3998,7 +3998,7 @@ int pcie_set_mps(struct pci_dev *dev, int mps)
 		return -EINVAL;
 
 	v = ffs(mps) - 8;
-	if (v > dev->pcie_mpss) 
+	if (v > dev->pcie_mpss)
 		return -EINVAL;
 	v <<= 5;
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 85ca36f2136d..688e48e51073 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -525,7 +525,7 @@ static void handle_error_source(struct pcie_device *aerdev,
 
 	if (info->severity == AER_CORRECTABLE) {
 		/*
-		 * Correctable error does not need software intevention.
+		 * Correctable error does not need software intervention.
 		 * No need to go through error recovery process.
 		 */
 		pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 403a44374ed5..f1272dc54de1 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -548,7 +548,7 @@ static struct pcie_link_state *alloc_pcie_link_state(struct pci_dev *pdev)
 
 /*
  * pcie_aspm_init_link_state: Initiate PCI express link state.
- * It is called after the pcie and its children devices are scaned.
+ * It is called after the pcie and its children devices are scanned.
  * @pdev: the root port or switch downstream port
  */
 void pcie_aspm_init_link_state(struct pci_dev *pdev)
diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c
index e56e594ce112..bbc3bdd2b189 100644
--- a/drivers/pci/pcie/pme.c
+++ b/drivers/pci/pcie/pme.c
@@ -419,8 +419,8 @@ static void pcie_pme_remove(struct pcie_device *srv)
 
 static struct pcie_port_service_driver pcie_pme_driver = {
 	.name		= "pcie_pme",
-	.port_type 	= PCI_EXP_TYPE_ROOT_PORT,
-	.service 	= PCIE_PORT_SERVICE_PME,
+	.port_type	= PCI_EXP_TYPE_ROOT_PORT,
+	.service	= PCIE_PORT_SERVICE_PME,
 
 	.probe		= pcie_pme_probe,
 	.suspend	= pcie_pme_suspend,
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index d2eb80aab569..d525548404d6 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -14,7 +14,7 @@
 #define PCIE_PORT_DEVICE_MAXSERVICES   4
 /*
  * According to the PCI Express Base Specification 2.0, the indices of
- * the MSI-X table entires used by port services must not exceed 31
+ * the MSI-X table entries used by port services must not exceed 31
  */
 #define PCIE_PORT_MAX_MSIX_ENTRIES	32
 
diff --git a/drivers/pci/pcie/portdrv_bus.c b/drivers/pci/pcie/portdrv_bus.c
index 67be55a7f260..87e79a6ffb5a 100644
--- a/drivers/pci/pcie/portdrv_bus.c
+++ b/drivers/pci/pcie/portdrv_bus.c
@@ -18,8 +18,8 @@
 static int pcie_port_bus_match(struct device *dev, struct device_driver *drv);
 
 struct bus_type pcie_port_bus_type = {
-	.name 		= "pci_express",
-	.match 		= pcie_port_bus_match,
+	.name		= "pci_express",
+	.match		= pcie_port_bus_match,
 };
 EXPORT_SYMBOL_GPL(pcie_port_bus_type);
 
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 08d131f7815b..0b6e76604068 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -46,7 +46,7 @@ static void release_pcie_device(struct device *dev)
  * pcie_port_msix_add_entry - add entry to given array of MSI-X entries
  * @entries: Array of MSI-X entries
  * @new_entry: Index of the entry to add to the array
- * @nr_entries: Number of entries aleady in the array
+ * @nr_entries: Number of entries already in the array
  *
  * Return value: Position of the added entry in the array
  */
diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 696caed5fdf5..cd1e57e51aa7 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -390,9 +390,9 @@ static struct pci_driver pcie_portdriver = {
 	.probe		= pcie_portdrv_probe,
 	.remove		= pcie_portdrv_remove,
 
-	.err_handler 	= &pcie_portdrv_err_handler,
+	.err_handler	= &pcie_portdrv_err_handler,
 
-	.driver.pm 	= PCIE_PORTDRV_PM_OPS,
+	.driver.pm	= PCIE_PORTDRV_PM_OPS,
 };
 
 static int __init dmi_pcie_pme_disable_msi(const struct dmi_system_id *d)
@@ -412,7 +412,7 @@ static struct dmi_system_id __initdata pcie_portdrv_dmi_table[] = {
 	 .ident = "MSI Wind U-100",
 	 .matches = {
 		     DMI_MATCH(DMI_SYS_VENDOR,
-		     		"MICRO-STAR INTERNATIONAL CO., LTD"),
+				"MICRO-STAR INTERNATIONAL CO., LTD"),
 		     DMI_MATCH(DMI_PRODUCT_NAME, "U-100"),
 		     },
 	 },
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 5e14f5a51357..38e403dddf6e 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -582,7 +582,7 @@ static enum pci_bus_speed agp_speed(int agp3, int agpstat)
 		index = 1;
 	else
 		goto out;
-	
+
 	if (agp3) {
 		index += 2;
 		if (index == 5)
@@ -789,7 +789,7 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass)
 	}
 
 	/* Disable MasterAbortMode during probing to avoid reporting
-	   of bus errors (in some architectures) */ 
+	   of bus errors (in some architectures) */
 	pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl);
 	pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
 			      bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
@@ -1005,7 +1005,7 @@ void set_pcie_hotplug_bridge(struct pci_dev *pdev)
  * pci_setup_device - fill in class and map information of a device
  * @dev: the device structure to fill
  *
- * Initialize the device structure with information about the device's 
+ * Initialize the device structure with information about the device's
  * vendor,class,memory and IO-space addresses,IRQ lines etc.
  * Called at initialisation of the PCI subsystem and by CardBus services.
  * Returns 0 on success and negative if unknown type of device (not normal,
@@ -1111,7 +1111,7 @@ int pci_setup_device(struct pci_dev *dev)
 			goto bad;
 		/* The PCI-to-PCI bridge spec requires that subtractive
 		   decoding (i.e. transparent) bridge must have programming
-		   interface code of 0x01. */ 
+		   interface code of 0x01. */
 		pci_read_irq(dev);
 		dev->transparent = ((dev->class & 0xff) == 1);
 		pci_read_bases(dev, 2, PCI_ROM_ADDRESS1);
@@ -1570,7 +1570,7 @@ static void pcie_write_mrrs(struct pci_dev *dev)
 	 * subsequent read will verify if the value is acceptable or not.
 	 * If the MRRS value provided is not acceptable (e.g., too large),
 	 * shrink the value until it is acceptable to the HW.
- 	 */
+	 */
 	while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
 		rc = pcie_set_readrq(dev, mrrs);
 		if (!rc)
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index cdc7836d7e3d..46d1378f2e9e 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -222,7 +222,7 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd,
 	default:
 		ret = -EINVAL;
 		break;
-	};
+	}
 
 	return ret;
 }
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 91490453c229..b3b1b9aa8863 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -55,7 +55,7 @@ static void quirk_mellanox_tavor(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR,quirk_mellanox_tavor);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE,quirk_mellanox_tavor);
 
-/* Deal with broken BIOS'es that neglect to enable passive release,
+/* Deal with broken BIOSes that neglect to enable passive release,
    which can cause problems in combination with the 82441FX/PPro MTRRs */
 static void quirk_passive_release(struct pci_dev *dev)
 {
@@ -78,11 +78,11 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82441,	quirk_p
 
 /*  The VIA VP2/VP3/MVP3 seem to have some 'features'. There may be a workaround
     but VIA don't answer queries. If you happen to have good contacts at VIA
-    ask them for me please -- Alan 
-    
-    This appears to be BIOS not version dependent. So presumably there is a 
+    ask them for me please -- Alan
+
+    This appears to be BIOS not version dependent. So presumably there is a
     chipset level fix */
-    
+
 static void quirk_isa_dma_hangs(struct pci_dev *dev)
 {
 	if (!isa_dma_bridge_buggy) {
@@ -97,7 +97,7 @@ static void quirk_isa_dma_hangs(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA,	PCI_DEVICE_ID_VIA_82C586_0,	quirk_isa_dma_hangs);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA,	PCI_DEVICE_ID_VIA_82C596,	quirk_isa_dma_hangs);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,    PCI_DEVICE_ID_INTEL_82371SB_0,  quirk_isa_dma_hangs);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL,	PCI_DEVICE_ID_AL_M1533, 	quirk_isa_dma_hangs);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL,	PCI_DEVICE_ID_AL_M1533,		quirk_isa_dma_hangs);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC,	PCI_DEVICE_ID_NEC_CBUS_1,	quirk_isa_dma_hangs);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC,	PCI_DEVICE_ID_NEC_CBUS_2,	quirk_isa_dma_hangs);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC,	PCI_DEVICE_ID_NEC_CBUS_3,	quirk_isa_dma_hangs);
@@ -157,10 +157,10 @@ static void quirk_triton(struct pci_dev *dev)
 		pci_pci_problems |= PCIPCI_TRITON;
 	}
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82437, 	quirk_triton);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82437VX, 	quirk_triton);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82439, 	quirk_triton);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82439TX, 	quirk_triton);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82437,	quirk_triton);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82437VX,	quirk_triton);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82439,	quirk_triton);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82439TX,	quirk_triton);
 
 /*
  *	VIA Apollo KT133 needs PCI latency patch
@@ -171,7 +171,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82439TX, 	quir
  *      the info on which Mr Breese based his work.
  *
  *	Updated based on further information from the site and also on
- *	information provided by VIA 
+ *	information provided by VIA
  */
 static void quirk_vialatency(struct pci_dev *dev)
 {
@@ -179,7 +179,7 @@ static void quirk_vialatency(struct pci_dev *dev)
 	u8 busarb;
 	/* Ok we have a potential problem chipset here. Now see if we have
 	   a buggy southbridge */
-	   
+
 	p = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C686, NULL);
 	if (p!=NULL) {
 		/* 0x40 - 0x4f == 686B, 0x10 - 0x2f == 686A; thanks Dan Hollis */
@@ -194,9 +194,9 @@ static void quirk_vialatency(struct pci_dev *dev)
 		if (p->revision < 0x10 || p->revision > 0x12)
 			goto exit;
 	}
-	
+
 	/*
-	 *	Ok we have the problem. Now set the PCI master grant to 
+	 *	Ok we have the problem. Now set the PCI master grant to
 	 *	occur every master grant. The apparent bug is that under high
 	 *	PCI load (quite common in Linux of course) you can get data
 	 *	loss when the CPU is held off the bus for 3 bus master requests
@@ -209,7 +209,7 @@ static void quirk_vialatency(struct pci_dev *dev)
 	 */
 
 	pci_read_config_byte(dev, 0x76, &busarb);
-	/* Set bit 4 and bi 5 of byte 76 to 0x01 
+	/* Set bit 4 and bi 5 of byte 76 to 0x01
 	   "Master priority rotation on every PCI master grant */
 	busarb &= ~(1<<5);
 	busarb |= (1<<4);
@@ -252,7 +252,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA,	PCI_DEVICE_ID_VIA_82C576,	quirk_vsfx)
  *	that DMA to AGP space. Latency must be set to 0xA and triton
  *	workaround applied too
  *	[Info kindly provided by ALi]
- */	
+ */
 static void quirk_alimagik(struct pci_dev *dev)
 {
 	if ((pci_pci_problems&PCIPCI_ALIMAGIK)==0) {
@@ -260,8 +260,8 @@ static void quirk_alimagik(struct pci_dev *dev)
 		pci_pci_problems |= PCIPCI_ALIMAGIK|PCIPCI_TRITON;
 	}
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL, 	PCI_DEVICE_ID_AL_M1647, 	quirk_alimagik);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL, 	PCI_DEVICE_ID_AL_M1651, 	quirk_alimagik);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL,	PCI_DEVICE_ID_AL_M1647,		quirk_alimagik);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL,	PCI_DEVICE_ID_AL_M1651,		quirk_alimagik);
 
 /*
  *	Natoma has some interesting boundary conditions with Zoran stuff
@@ -274,12 +274,12 @@ static void quirk_natoma(struct pci_dev *dev)
 		pci_pci_problems |= PCIPCI_NATOMA;
 	}
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82441, 	quirk_natoma);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82443LX_0, 	quirk_natoma);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82443LX_1, 	quirk_natoma);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82443BX_0, 	quirk_natoma);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82443BX_1, 	quirk_natoma);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82443BX_2, 	quirk_natoma);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82441,	quirk_natoma);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82443LX_0,	quirk_natoma);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82443LX_1,	quirk_natoma);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82443BX_0,	quirk_natoma);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82443BX_1,	quirk_natoma);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82443BX_2,	quirk_natoma);
 
 /*
  *  This chip can cause PCI parity errors if config register 0xA0 is read
@@ -400,7 +400,7 @@ static void piix4_io_quirk(struct pci_dev *dev, const char *name, unsigned int p
 	/*
 	 * For now we only print it out. Eventually we'll want to
 	 * reserve it (at least if it's in the 0x1000+ range), but
-	 * let's get enough confirmation reports first. 
+	 * let's get enough confirmation reports first.
 	 */
 	base &= -size;
 	dev_info(&dev->dev, "%s PIO at %04x-%04x\n", name, base, base + size - 1);
@@ -425,7 +425,7 @@ static void piix4_mem_quirk(struct pci_dev *dev, const char *name, unsigned int
 	}
 	/*
 	 * For now we only print it out. Eventually we'll want to
-	 * reserve it, but let's get enough confirmation reports first. 
+	 * reserve it, but let's get enough confirmation reports first.
 	 */
 	base &= -size;
 	dev_info(&dev->dev, "%s MMIO at %04x-%04x\n", name, base, base + size - 1);
@@ -682,7 +682,7 @@ static void quirk_xio2000a(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_XIO2000A,
 			quirk_xio2000a);
 
-#ifdef CONFIG_X86_IO_APIC 
+#ifdef CONFIG_X86_IO_APIC
 
 #include <asm/io_apic.h>
 
@@ -696,12 +696,12 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_XIO2000A,
 static void quirk_via_ioapic(struct pci_dev *dev)
 {
 	u8 tmp;
-	
+
 	if (nr_ioapics < 1)
 		tmp = 0;    /* nothing routed to external APIC */
 	else
 		tmp = 0x1f; /* all known bits (4-0) routed to external APIC */
-		
+
 	dev_info(&dev->dev, "%sbling VIA external APIC routing\n",
 	       tmp == 0 ? "Disa" : "Ena");
 
@@ -712,7 +712,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA,	PCI_DEVICE_ID_VIA_82C686,	quirk_via_i
 DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_VIA,	PCI_DEVICE_ID_VIA_82C686,	quirk_via_ioapic);
 
 /*
- * VIA 8237: Some BIOSs don't set the 'Bypass APIC De-Assert Message' Bit.
+ * VIA 8237: Some BIOSes don't set the 'Bypass APIC De-Assert Message' Bit.
  * This leads to doubled level interrupt rates.
  * Set this bit to get rid of cycle wastage.
  * Otherwise uncritical.
@@ -986,7 +986,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_CYRIX,	PCI_DEVICE_ID_CYRIX_PCI_MASTER, qu
 static void quirk_disable_pxb(struct pci_dev *pdev)
 {
 	u16 config;
-	
+
 	if (pdev->revision != 0x04)		/* Only C0 requires this */
 		return;
 	pci_read_config_word(pdev, 0x40, &config);
@@ -1094,11 +1094,11 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82375,	quirk_e
  * On ASUS P4B boards, the SMBus PCI Device within the ICH2/4 southbridge
  * is not activated. The myth is that Asus said that they do not want the
  * users to be irritated by just another PCI Device in the Win98 device
- * manager. (see the file prog/hotplug/README.p4b in the lm_sensors 
+ * manager. (see the file prog/hotplug/README.p4b in the lm_sensors
  * package 2.7.0 for details)
  *
- * The SMBus PCI Device can be activated by setting a bit in the ICH LPC 
- * bridge. Unfortunately, this device has no subvendor/subdevice ID. So it 
+ * The SMBus PCI Device can be activated by setting a bit in the ICH LPC
+ * bridge. Unfortunately, this device has no subvendor/subdevice ID. So it
  * becomes necessary to do this tweak in two steps -- the chosen trigger
  * is either the Host bridge (preferred) or on-board VGA controller.
  *
@@ -1253,7 +1253,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82815_CGC,	asu
 static void asus_hides_smbus_lpc(struct pci_dev *dev)
 {
 	u16 val;
-	
+
 	if (likely(!asus_hides_smbus))
 		return;
 
@@ -1640,8 +1640,8 @@ static void quirk_disable_intel_boot_interrupt(struct pci_dev *dev)
 	dev_info(&dev->dev, "disabled boot interrupts on device [%04x:%04x]\n",
 		 dev->vendor, dev->device);
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_ESB_10, 	quirk_disable_intel_boot_interrupt);
-DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_ESB_10, 	quirk_disable_intel_boot_interrupt);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_ESB_10,	quirk_disable_intel_boot_interrupt);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_ESB_10,	quirk_disable_intel_boot_interrupt);
 
 /*
  * disable boot interrupts on HT-1000
@@ -1673,8 +1673,8 @@ static void quirk_disable_broadcom_boot_interrupt(struct pci_dev *dev)
 	dev_info(&dev->dev, "disabled boot interrupts on device [%04x:%04x]\n",
 		 dev->vendor, dev->device);
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SERVERWORKS,   PCI_DEVICE_ID_SERVERWORKS_HT1000SB, 	quirk_disable_broadcom_boot_interrupt);
-DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_SERVERWORKS,   PCI_DEVICE_ID_SERVERWORKS_HT1000SB, 	quirk_disable_broadcom_boot_interrupt);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SERVERWORKS,   PCI_DEVICE_ID_SERVERWORKS_HT1000SB,	quirk_disable_broadcom_boot_interrupt);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_SERVERWORKS,   PCI_DEVICE_ID_SERVERWORKS_HT1000SB,	quirk_disable_broadcom_boot_interrupt);
 
 /*
  * disable boot interrupts on AMD and ATI chipsets
@@ -1730,8 +1730,8 @@ static void quirk_disable_amd_8111_boot_interrupt(struct pci_dev *dev)
 	dev_info(&dev->dev, "disabled boot interrupts on device [%04x:%04x]\n",
 		 dev->vendor, dev->device);
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD,   PCI_DEVICE_ID_AMD_8111_SMBUS, 	quirk_disable_amd_8111_boot_interrupt);
-DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD,   PCI_DEVICE_ID_AMD_8111_SMBUS, 	quirk_disable_amd_8111_boot_interrupt);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD,   PCI_DEVICE_ID_AMD_8111_SMBUS,	quirk_disable_amd_8111_boot_interrupt);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD,   PCI_DEVICE_ID_AMD_8111_SMBUS,	quirk_disable_amd_8111_boot_interrupt);
 #endif /* CONFIG_X86_IO_APIC */
 
 /*
@@ -2127,8 +2127,8 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_PLX, 0x8624, quirk_tile_plx_gen1);
 #ifdef CONFIG_PCI_MSI
 /* Some chipsets do not support MSI. We cannot easily rely on setting
  * PCI_BUS_FLAGS_NO_MSI in its bus flags because there are actually
- * some other busses controlled by the chipset even if Linux is not
- * aware of it.  Instead of setting the flag on all busses in the
+ * some other buses controlled by the chipset even if Linux is not
+ * aware of it.  Instead of setting the flag on all buses in the
  * machine, simply disable MSI globally.
  */
 static void quirk_disable_all_msi(struct pci_dev *dev)
@@ -2288,14 +2288,14 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA,
 			nvenet_msi_disable);
 
 /*
- * Some versions of the MCP55 bridge from nvidia have a legacy irq routing
- * config register.  This register controls the routing of legacy interrupts
- * from devices that route through the MCP55.  If this register is misprogramed
- * interrupts are only sent to the bsp, unlike conventional systems where the
- * irq is broadxast to all online cpus.  Not having this register set
- * properly prevents kdump from booting up properly, so lets make sure that
- * we have it set correctly.
- * Note this is an undocumented register.
+ * Some versions of the MCP55 bridge from Nvidia have a legacy IRQ routing
+ * config register.  This register controls the routing of legacy
+ * interrupts from devices that route through the MCP55.  If this register
+ * is misprogrammed, interrupts are only sent to the BSP, unlike
+ * conventional systems where the IRQ is broadcast to all online CPUs.  Not
+ * having this register set properly prevents kdump from booting up
+ * properly, so let's make sure that we have it set correctly.
+ * Note that this is an undocumented register.
  */
 static void nvbridge_check_legacy_irq_routing(struct pci_dev *dev)
 {
@@ -2626,7 +2626,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, 0xe091,
 /* Allow manual resource allocation for PCI hotplug bridges
  * via pci=hpmemsize=nnM and pci=hpiosize=nnM parameters. For
  * some PCI-PCI hotplug bridges, like PLX 6254 (former HINT HB6),
- * kernel fails to allocate resources when hotplug device is 
+ * kernel fails to allocate resources when hotplug device is
  * inserted and PCI bus is rescanned.
  */
 static void quirk_hotplug_bridge(struct pci_dev *dev)
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 8fc54b7327bc..1576851028db 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -7,7 +7,7 @@ static void pci_free_resources(struct pci_dev *dev)
 {
 	int i;
 
- 	msi_remove_pci_irq_vectors(dev);
+	msi_remove_pci_irq_vectors(dev);
 
 	pci_cleanup_rom(dev);
 	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index d0627fa9f368..3ff2ac7c14e2 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -1,5 +1,5 @@
 /*
- * 	PCI searching functions.
+ *	PCI searching functions.
  *
  *	Copyright (C) 1993 -- 1997 Drew Eckhardt, Frederic Potter,
  *					David Mosberger-Tang
@@ -96,12 +96,12 @@ struct pci_bus * pci_find_bus(int domain, int busnr)
  * pci_find_next_bus - begin or continue searching for a PCI bus
  * @from: Previous PCI bus found, or %NULL for new search.
  *
- * Iterates through the list of known PCI busses.  A new search is
+ * Iterates through the list of known PCI buses.  A new search is
  * initiated by passing %NULL as the @from argument.  Otherwise if
  * @from is not %NULL, searches continue from next device on the
  * global list.
  */
-struct pci_bus * 
+struct pci_bus *
 pci_find_next_bus(const struct pci_bus *from)
 {
 	struct list_head *n;
@@ -119,11 +119,11 @@ pci_find_next_bus(const struct pci_bus *from)
 /**
  * pci_get_slot - locate PCI device for a given PCI slot
  * @bus: PCI bus on which desired PCI device resides
- * @devfn: encodes number of PCI slot in which the desired PCI 
- * device resides and the logical device number within that slot 
+ * @devfn: encodes number of PCI slot in which the desired PCI
+ * device resides and the logical device number within that slot
  * in case of multi-function devices.
  *
- * Given a PCI bus and slot/function number, the desired PCI device 
+ * Given a PCI bus and slot/function number, the desired PCI device
  * is located in the list of PCI devices.
  * If the device is found, its reference count is increased and this
  * function returns a pointer to its data structure.  The caller must
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 4ce83b26ae9e..219a4106480a 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -292,8 +292,8 @@ static void assign_requested_resources_sorted(struct list_head *head,
 				      (!(res->flags & IORESOURCE_ROM_ENABLE))))
 					add_to_list(fail_head,
 						    dev_res->dev, res,
-						    0 /* dont care */,
-						    0 /* dont care */);
+						    0 /* don't care */,
+						    0 /* don't care */);
 			}
 			reset_resource(res);
 		}
@@ -667,9 +667,9 @@ static void pci_bridge_check_ranges(struct pci_bus *bus)
 	if (!io) {
 		pci_write_config_word(bridge, PCI_IO_BASE, 0xf0f0);
 		pci_read_config_word(bridge, PCI_IO_BASE, &io);
- 		pci_write_config_word(bridge, PCI_IO_BASE, 0x0);
- 	}
- 	if (io)
+		pci_write_config_word(bridge, PCI_IO_BASE, 0x0);
+	}
+	if (io)
 		b_res[0].flags |= IORESOURCE_IO;
 	/*  DECchip 21050 pass 2 errata: the bridge may miss an address
 	    disconnect boundary by one PCI data phase.
@@ -819,7 +819,7 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
 	resource_size_t min_align, align;
 
 	if (!b_res)
- 		return;
+		return;
 
 	min_align = window_alignment(bus, IORESOURCE_IO);
 	list_for_each_entry(dev, &bus->devices, bus_list) {
@@ -950,7 +950,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			if (realloc_head && i >= PCI_IOV_RESOURCES &&
 					i <= PCI_IOV_RESOURCE_END) {
 				r->end = r->start - 1;
-				add_to_list(realloc_head, dev, r, r_size, 0/* dont' care */);
+				add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
 				children_add_size += r_size;
 				continue;
 			}
@@ -1456,8 +1456,8 @@ static enum enable_type pci_realloc_detect(struct pci_bus *bus,
 
 /*
  * first try will not touch pci bridge res
- * second  and later try will clear small leaf bridge res
- * will stop till to the max  deepth if can not find good one
+ * second and later try will clear small leaf bridge res
+ * will stop till to the max depth if can not find good one
  */
 void pci_assign_unassigned_root_bus_resources(struct pci_bus *bus)
 {
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 07f2eddc09ce..83c4d3bc47ab 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -159,7 +159,7 @@ resource_size_t __weak pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx)
 	return 0;
 }
 
-static int pci_revert_fw_address(struct resource *res, struct pci_dev *dev, 
+static int pci_revert_fw_address(struct resource *res, struct pci_dev *dev,
 		int resno, resource_size_t size)
 {
 	struct resource *root, *conflict;
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index c1e9284a677b..448ca562d1f8 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -53,7 +53,7 @@ static ssize_t address_read_file(struct pci_slot *slot, char *buf)
 static const char *pci_bus_speed_strings[] = {
 	"33 MHz PCI",		/* 0x00 */
 	"66 MHz PCI",		/* 0x01 */
-	"66 MHz PCI-X", 	/* 0x02 */
+	"66 MHz PCI-X",		/* 0x02 */
 	"100 MHz PCI-X",	/* 0x03 */
 	"133 MHz PCI-X",	/* 0x04 */
 	NULL,			/* 0x05 */
diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c
index e1c1ec540893..24750a1b39b6 100644
--- a/drivers/pci/syscall.c
+++ b/drivers/pci/syscall.c
@@ -44,7 +44,7 @@ SYSCALL_DEFINE5(pciconfig_read, unsigned long, bus, unsigned long, dfn,
 	default:
 		err = -EINVAL;
 		goto error;
-	};
+	}
 
 	err = -EIO;
 	if (cfg_ret != PCIBIOS_SUCCESSFUL)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 87cce50bd121..009b02481436 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -26,11 +26,11 @@ struct msi_desc {
 	struct {
 		__u8	is_msix	: 1;
 		__u8	multiple: 3;	/* log2 number of messages */
-		__u8	maskbit	: 1; 	/* mask-pending bit supported ?   */
-		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit  */
-		__u8	pos;	 	/* Location of the msi capability */
-		__u16	entry_nr;    	/* specific enabled entry 	  */
-		unsigned default_irq;	/* default pre-assigned irq	  */
+		__u8	maskbit	: 1;	/* mask-pending bit supported ? */
+		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit */
+		__u8	pos;		/* Location of the msi capability */
+		__u16	entry_nr;	/* specific enabled entry */
+		unsigned default_irq;	/* default pre-assigned irq */
 	} msi_attrib;
 
 	u32 masked;			/* mask bits */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 835ec7bf6c05..1084a15175e0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -32,7 +32,6 @@
 #include <linux/irqreturn.h>
 #include <uapi/linux/pci.h>
 
-/* Include the ID list */
 #include <linux/pci_ids.h>
 
 /*
@@ -42,9 +41,10 @@
  *
  *	7:3 = slot
  *	2:0 = function
- * PCI_DEVFN(), PCI_SLOT(), and PCI_FUNC() are defined uapi/linux/pci.h
+ *
+ * PCI_DEVFN(), PCI_SLOT(), and PCI_FUNC() are defined in uapi/linux/pci.h.
  * In the interest of not exposing interfaces to user-space unnecessarily,
- * the following kernel only defines are being added here.
+ * the following kernel-only defines are being added here.
  */
 #define PCI_DEVID(bus, devfn)  ((((u16)bus) << 8) | devfn)
 /* return bus from PCI devid = ((u16)bus_number) << 8) | devfn */
@@ -153,10 +153,10 @@ enum pcie_reset_state {
 	/* Reset is NOT asserted (Use to deassert reset) */
 	pcie_deassert_reset = (__force pcie_reset_state_t) 1,
 
-	/* Use #PERST to reset PCI-E device */
+	/* Use #PERST to reset PCIe device */
 	pcie_warm_reset = (__force pcie_reset_state_t) 2,
 
-	/* Use PCI-E Hot Reset to reset device */
+	/* Use PCIe Hot Reset to reset device */
 	pcie_hot_reset = (__force pcie_reset_state_t) 3
 };
 
@@ -259,13 +259,13 @@ struct pci_dev {
 	unsigned int	class;		/* 3 bytes: (base,sub,prog-if) */
 	u8		revision;	/* PCI revision, low byte of class word */
 	u8		hdr_type;	/* PCI header type (`multi' flag masked out) */
-	u8		pcie_cap;	/* PCI-E capability offset */
+	u8		pcie_cap;	/* PCIe capability offset */
 	u8		msi_cap;	/* MSI capability offset */
 	u8		msix_cap;	/* MSI-X capability offset */
-	u8		pcie_mpss:3;	/* PCI-E Max Payload Size Supported */
+	u8		pcie_mpss:3;	/* PCIe Max Payload Size Supported */
 	u8		rom_base_reg;	/* which config register controls the ROM */
-	u8		pin;  		/* which interrupt pin this device uses */
-	u16		pcie_flags_reg;	/* cached PCI-E Capabilities Register */
+	u8		pin;		/* which interrupt pin this device uses */
+	u16		pcie_flags_reg;	/* cached PCIe Capabilities Register */
 
 	struct pci_driver *driver;	/* which driver has allocated this device */
 	u64		dma_mask;	/* Mask of the bits of bus address this
@@ -300,7 +300,7 @@ struct pci_dev {
 	unsigned int	d3cold_delay;	/* D3cold->D0 transition time in ms */
 
 #ifdef CONFIG_PCIEASPM
-	struct pcie_link_state	*link_state;	/* ASPM link state. */
+	struct pcie_link_state	*link_state;	/* ASPM link state */
 #endif
 
 	pci_channel_state_t error_state;	/* current connectivity state */
@@ -317,7 +317,7 @@ struct pci_dev {
 
 	bool match_driver;		/* Skip attaching driver */
 	/* These fields are used by common fixups */
-	unsigned int	transparent:1;	/* Transparent PCI bridge */
+	unsigned int	transparent:1;	/* Subtractive decode PCI bridge */
 	unsigned int	multifunction:1;/* Part of multi-function device */
 	/* keep track of device state */
 	unsigned int	is_added:1;
@@ -326,7 +326,7 @@ struct pci_dev {
 	unsigned int	block_cfg_access:1;	/* config space access is blocked */
 	unsigned int	broken_parity_status:1;	/* Device generates false positive parity */
 	unsigned int	irq_reroute_variant:2;	/* device needs IRQ rerouting variant */
-	unsigned int 	msi_enabled:1;
+	unsigned int	msi_enabled:1;
 	unsigned int	msix_enabled:1;
 	unsigned int	ari_enabled:1;	/* ARI forwarding */
 	unsigned int	is_managed:1;
@@ -371,7 +371,6 @@ static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
 	if (dev->is_virtfn)
 		dev = dev->physfn;
 #endif
-
 	return dev;
 }
 
@@ -456,7 +455,7 @@ struct pci_bus {
 	char		name[48];
 
 	unsigned short  bridge_ctl;	/* manage NO_ISA/FBB/et al behaviors */
-	pci_bus_flags_t bus_flags;	/* Inherited by child busses */
+	pci_bus_flags_t bus_flags;	/* inherited by child buses */
 	struct device		*bridge;
 	struct device		dev;
 	struct bin_attribute	*legacy_io; /* legacy I/O for this bus */
@@ -468,7 +467,7 @@ struct pci_bus {
 #define to_pci_bus(n)	container_of(n, struct pci_bus, dev)
 
 /*
- * Returns true if the pci bus is root (behind host-pci bridge),
+ * Returns true if the PCI bus is root (behind host-PCI bridge),
  * false otherwise
  *
  * Some code assumes that "bus->self == NULL" means that bus is a root bus.
@@ -510,7 +509,7 @@ static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev) { return false;
 #define PCIBIOS_BUFFER_TOO_SMALL	0x89
 
 /*
- * Translate above to generic errno for passing back through non-pci.
+ * Translate above to generic errno for passing back through non-PCI code.
  */
 static inline int pcibios_err_to_errno(int err)
 {
@@ -561,11 +560,12 @@ struct pci_dynids {
 	struct list_head list;      /* for IDs added at runtime */
 };
 
-/* ---------------------------------------------------------------- */
-/** PCI Error Recovery System (PCI-ERS).  If a PCI device driver provides
- *  a set of callbacks in struct pci_error_handlers, then that device driver
- *  will be notified of PCI bus errors, and will be driven to recovery
- *  when an error occurs.
+
+/*
+ * PCI Error Recovery System (PCI-ERS).  If a PCI device driver provides
+ * a set of callbacks in struct pci_error_handlers, that device driver
+ * will be notified of PCI bus errors, and will be driven to recovery
+ * when an error occurs.
  */
 
 typedef unsigned int __bitwise pci_ers_result_t;
@@ -609,7 +609,6 @@ struct pci_error_handlers {
 	void (*resume)(struct pci_dev *dev);
 };
 
-/* ---------------------------------------------------------------- */
 
 struct module;
 struct pci_driver {
@@ -713,10 +712,10 @@ extern enum pcie_bus_config_types pcie_bus_config;
 
 extern struct bus_type pci_bus_type;
 
-/* Do NOT directly access these two variables, unless you are arch specific pci
- * code, or pci core code. */
+/* Do NOT directly access these two variables, unless you are arch-specific PCI
+ * code, or PCI core code. */
 extern struct list_head pci_root_buses;	/* list of all known PCI buses */
-/* Some device drivers need know if pci is initiated */
+/* Some device drivers need know if PCI is initiated */
 int no_pci_devices(void);
 
 void pcibios_resource_survey_bus(struct pci_bus *bus);
@@ -724,7 +723,7 @@ void pcibios_add_bus(struct pci_bus *bus);
 void pcibios_remove_bus(struct pci_bus *bus);
 void pcibios_fixup_bus(struct pci_bus *);
 int __must_check pcibios_enable_device(struct pci_dev *, int mask);
-/* Architecture specific versions may override this (weak) */
+/* Architecture-specific versions may override this (weak) */
 char *pcibios_setup(char *str);
 
 /* Used only when drivers/pci/setup.c is used */
@@ -1258,7 +1257,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev);
 
 /*
  * PCI domain support.  Sometimes called PCI segment (eg by ACPI),
- * a PCI domain is defined to be a set of PCI busses which share
+ * a PCI domain is defined to be a set of PCI buses which share
  * configuration space.
  */
 #ifdef CONFIG_PCI_DOMAINS
@@ -1672,7 +1671,7 @@ extern u8 pci_cache_line_size;
 extern unsigned long pci_hotplug_io_size;
 extern unsigned long pci_hotplug_mem_size;
 
-/* Architecture specific versions may override these (weak) */
+/* Architecture-specific versions may override these (weak) */
 int pcibios_add_platform_entries(struct pci_dev *dev);
 void pcibios_disable_device(struct pci_dev *dev);
 void pcibios_set_master(struct pci_dev *dev);
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index 430dd963707b..a2e2f1d17e16 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -39,8 +39,8 @@
  * @hardware_test: Called to run a specified hardware test on the specified
  * slot.
  * @get_power_status: Called to get the current power status of a slot.
- * 	If this field is NULL, the value passed in the struct hotplug_slot_info
- * 	will be used when this value is requested by a user.
+ *	If this field is NULL, the value passed in the struct hotplug_slot_info
+ *	will be used when this value is requested by a user.
  * @get_attention_status: Called to get the current attention status of a slot.
  *	If this field is NULL, the value passed in the struct hotplug_slot_info
  *	will be used when this value is requested by a user.
@@ -191,4 +191,3 @@ static inline int pci_get_hp_params(struct pci_dev *dev,
 
 void pci_configure_slot(struct pci_dev *dev);
 #endif
-
diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h
index 9572669eea97..4f1089f2cc98 100644
--- a/include/linux/pcieport_if.h
+++ b/include/linux/pcieport_if.h
@@ -23,7 +23,7 @@
 #define PCIE_PORT_SERVICE_VC		(1 << PCIE_PORT_SERVICE_VC_SHIFT)
 
 struct pcie_device {
-	int 		irq;	    /* Service IRQ/MSI/MSI-X Vector */
+	int		irq;	    /* Service IRQ/MSI/MSI-X Vector */
 	struct pci_dev *port;	    /* Root/Upstream/Downstream Port */
 	u32		service;    /* Port service this device represents */
 	void		*priv_data; /* Service Private Data */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 0890556f779e..4a98e85438a7 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -13,10 +13,10 @@
  *	PCI to PCI Bridge Specification
  *	PCI System Design Guide
  *
- * 	For hypertransport information, please consult the following manuals
- * 	from http://www.hypertransport.org
+ *	For HyperTransport information, please consult the following manuals
+ *	from http://www.hypertransport.org
  *
- *	The Hypertransport I/O Link Specification
+ *	The HyperTransport I/O Link Specification
  */
 
 #ifndef LINUX_PCI_REGS_H
@@ -37,7 +37,7 @@
 #define  PCI_COMMAND_INVALIDATE	0x10	/* Use memory write and invalidate */
 #define  PCI_COMMAND_VGA_PALETTE 0x20	/* Enable palette snooping */
 #define  PCI_COMMAND_PARITY	0x40	/* Enable parity checking */
-#define  PCI_COMMAND_WAIT 	0x80	/* Enable address/data stepping */
+#define  PCI_COMMAND_WAIT	0x80	/* Enable address/data stepping */
 #define  PCI_COMMAND_SERR	0x100	/* Enable SERR */
 #define  PCI_COMMAND_FAST_BACK	0x200	/* Enable back-to-back writes */
 #define  PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */
@@ -45,7 +45,7 @@
 #define PCI_STATUS		0x06	/* 16 bits */
 #define  PCI_STATUS_INTERRUPT	0x08	/* Interrupt status */
 #define  PCI_STATUS_CAP_LIST	0x10	/* Support Capability List */
-#define  PCI_STATUS_66MHZ	0x20	/* Support 66 Mhz PCI 2.1 bus */
+#define  PCI_STATUS_66MHZ	0x20	/* Support 66 MHz PCI 2.1 bus */
 #define  PCI_STATUS_UDF		0x40	/* Support User Definable Features [obsolete] */
 #define  PCI_STATUS_FAST_BACK	0x80	/* Accept fast-back to back */
 #define  PCI_STATUS_PARITY	0x100	/* Detected parity error */
@@ -205,14 +205,14 @@
 #define  PCI_CAP_ID_CHSWP	0x06	/* CompactPCI HotSwap */
 #define  PCI_CAP_ID_PCIX	0x07	/* PCI-X */
 #define  PCI_CAP_ID_HT		0x08	/* HyperTransport */
-#define  PCI_CAP_ID_VNDR	0x09	/* Vendor specific */
+#define  PCI_CAP_ID_VNDR	0x09	/* Vendor-Specific */
 #define  PCI_CAP_ID_DBG		0x0A	/* Debug port */
 #define  PCI_CAP_ID_CCRC	0x0B	/* CompactPCI Central Resource Control */
-#define  PCI_CAP_ID_SHPC 	0x0C	/* PCI Standard Hot-Plug Controller */
+#define  PCI_CAP_ID_SHPC	0x0C	/* PCI Standard Hot-Plug Controller */
 #define  PCI_CAP_ID_SSVID	0x0D	/* Bridge subsystem vendor/device ID */
 #define  PCI_CAP_ID_AGP3	0x0E	/* AGP Target PCI-PCI bridge */
 #define  PCI_CAP_ID_SECDEV	0x0F	/* Secure Device */
-#define  PCI_CAP_ID_EXP 	0x10	/* PCI Express */
+#define  PCI_CAP_ID_EXP		0x10	/* PCI Express */
 #define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
 #define  PCI_CAP_ID_SATA	0x12	/* SATA Data/Index Conf. */
 #define  PCI_CAP_ID_AF		0x13	/* PCI Advanced Features */
@@ -268,8 +268,8 @@
 #define  PCI_AGP_COMMAND_RQ_MASK 0xff000000  /* Master: Maximum number of requests */
 #define  PCI_AGP_COMMAND_SBA	0x0200	/* Sideband addressing enabled */
 #define  PCI_AGP_COMMAND_AGP	0x0100	/* Allow processing of AGP transactions */
-#define  PCI_AGP_COMMAND_64BIT	0x0020 	/* Allow processing of 64-bit addresses */
-#define  PCI_AGP_COMMAND_FW	0x0010 	/* Force FW transfers */
+#define  PCI_AGP_COMMAND_64BIT	0x0020	/* Allow processing of 64-bit addresses */
+#define  PCI_AGP_COMMAND_FW	0x0010	/* Force FW transfers */
 #define  PCI_AGP_COMMAND_RATE4	0x0004	/* Use 4x rate */
 #define  PCI_AGP_COMMAND_RATE2	0x0002	/* Use 2x rate */
 #define  PCI_AGP_COMMAND_RATE1	0x0001	/* Use 1x rate */
@@ -321,7 +321,7 @@
 #define  PCI_MSIX_PBA_OFFSET	0xfffffff8 /* Offset into specified BAR */
 #define PCI_CAP_MSIX_SIZEOF	12	/* size of MSIX registers */
 
-/* MSI-X entry's format */
+/* MSI-X Table entry format */
 #define PCI_MSIX_ENTRY_SIZE		16
 #define  PCI_MSIX_ENTRY_LOWER_ADDR	0
 #define  PCI_MSIX_ENTRY_UPPER_ADDR	4
@@ -372,7 +372,7 @@
 #define  PCI_X_CMD_SPLIT_16	0x0060	/* Max 16 */
 #define  PCI_X_CMD_SPLIT_32	0x0070	/* Max 32 */
 #define  PCI_X_CMD_MAX_SPLIT	0x0070	/* Max Outstanding Split Transactions */
-#define  PCI_X_CMD_VERSION(x) 	(((x) >> 12) & 3) /* Version */
+#define  PCI_X_CMD_VERSION(x)	(((x) >> 12) & 3) /* Version */
 #define PCI_X_STATUS		4	/* PCI-X capabilities */
 #define  PCI_X_STATUS_DEVFN	0x000000ff	/* A copy of devfn */
 #define  PCI_X_STATUS_BUS	0x0000ff00	/* A copy of bus nr */
@@ -407,8 +407,8 @@
 
 /* PCI Bridge Subsystem ID registers */
 
-#define PCI_SSVID_VENDOR_ID     4	/* PCI-Bridge subsystem vendor id register */
-#define PCI_SSVID_DEVICE_ID     6	/* PCI-Bridge subsystem device id register */
+#define PCI_SSVID_VENDOR_ID     4	/* PCI Bridge subsystem vendor ID */
+#define PCI_SSVID_DEVICE_ID     6	/* PCI Bridge subsystem device ID */
 
 /* PCI Express capability registers */
 
@@ -484,12 +484,12 @@
 #define  PCI_EXP_LNKCTL_CLKREQ_EN 0x0100 /* Enable clkreq */
 #define  PCI_EXP_LNKCTL_HAWD	0x0200	/* Hardware Autonomous Width Disable */
 #define  PCI_EXP_LNKCTL_LBMIE	0x0400	/* Link Bandwidth Management Interrupt Enable */
-#define  PCI_EXP_LNKCTL_LABIE	0x0800	/* Lnk Autonomous Bandwidth Interrupt Enable */
+#define  PCI_EXP_LNKCTL_LABIE	0x0800	/* Link Autonomous Bandwidth Interrupt Enable */
 #define PCI_EXP_LNKSTA		18	/* Link Status */
 #define  PCI_EXP_LNKSTA_CLS	0x000f	/* Current Link Speed */
 #define  PCI_EXP_LNKSTA_CLS_2_5GB 0x0001 /* Current Link Speed 2.5GT/s */
 #define  PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 /* Current Link Speed 5.0GT/s */
-#define  PCI_EXP_LNKSTA_NLW	0x03f0	/* Nogotiated Link Width */
+#define  PCI_EXP_LNKSTA_NLW	0x03f0	/* Negotiated Link Width */
 #define  PCI_EXP_LNKSTA_NLW_SHIFT 4	/* start of NLW mask in link status */
 #define  PCI_EXP_LNKSTA_LT	0x0800	/* Link Training */
 #define  PCI_EXP_LNKSTA_SLC	0x1000	/* Slot Clock Configuration */
@@ -593,7 +593,7 @@
 #define PCI_EXT_CAP_ID_MFVC	0x08	/* Multi-Function VC Capability */
 #define PCI_EXT_CAP_ID_VC9	0x09	/* same as _VC */
 #define PCI_EXT_CAP_ID_RCRB	0x0A	/* Root Complex RB? */
-#define PCI_EXT_CAP_ID_VNDR	0x0B	/* Vendor Specific */
+#define PCI_EXT_CAP_ID_VNDR	0x0B	/* Vendor-Specific */
 #define PCI_EXT_CAP_ID_CAC	0x0C	/* Config Access - obsolete */
 #define PCI_EXT_CAP_ID_ACS	0x0D	/* Access Control Services */
 #define PCI_EXT_CAP_ID_ARI	0x0E	/* Alternate Routing ID */
@@ -602,12 +602,12 @@
 #define PCI_EXT_CAP_ID_MRIOV	0x11	/* Multi Root I/O Virtualization */
 #define PCI_EXT_CAP_ID_MCAST	0x12	/* Multicast */
 #define PCI_EXT_CAP_ID_PRI	0x13	/* Page Request Interface */
-#define PCI_EXT_CAP_ID_AMD_XXX	0x14	/* reserved for AMD */
-#define PCI_EXT_CAP_ID_REBAR	0x15	/* resizable BAR */
-#define PCI_EXT_CAP_ID_DPA	0x16	/* dynamic power alloc */
-#define PCI_EXT_CAP_ID_TPH	0x17	/* TPH request */
-#define PCI_EXT_CAP_ID_LTR	0x18	/* latency tolerance reporting */
-#define PCI_EXT_CAP_ID_SECPCI	0x19	/* Secondary PCIe */
+#define PCI_EXT_CAP_ID_AMD_XXX	0x14	/* Reserved for AMD */
+#define PCI_EXT_CAP_ID_REBAR	0x15	/* Resizable BAR */
+#define PCI_EXT_CAP_ID_DPA	0x16	/* Dynamic Power Allocation */
+#define PCI_EXT_CAP_ID_TPH	0x17	/* TPH Requester */
+#define PCI_EXT_CAP_ID_LTR	0x18	/* Latency Tolerance Reporting */
+#define PCI_EXT_CAP_ID_SECPCI	0x19	/* Secondary PCIe Capability */
 #define PCI_EXT_CAP_ID_PMUX	0x1A	/* Protocol Multiplexing */
 #define PCI_EXT_CAP_ID_PASID	0x1B	/* Process Address Space ID */
 #define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PASID
@@ -667,9 +667,9 @@
 #define PCI_ERR_ROOT_COR_RCV		0x00000001	/* ERR_COR Received */
 /* Multi ERR_COR Received */
 #define PCI_ERR_ROOT_MULTI_COR_RCV	0x00000002
-/* ERR_FATAL/NONFATAL Recevied */
+/* ERR_FATAL/NONFATAL Received */
 #define PCI_ERR_ROOT_UNCOR_RCV		0x00000004
-/* Multi ERR_FATAL/NONFATAL Recevied */
+/* Multi ERR_FATAL/NONFATAL Received */
 #define PCI_ERR_ROOT_MULTI_UNCOR_RCV	0x00000008
 #define PCI_ERR_ROOT_FIRST_FATAL	0x00000010	/* First Fatal */
 #define PCI_ERR_ROOT_NONFATAL_RCV	0x00000020	/* Non-Fatal Received */
@@ -678,7 +678,7 @@
 
 /* Virtual Channel */
 #define PCI_VC_PORT_REG1	4
-#define  PCI_VC_REG1_EVCC	0x7	/* extended vc count */
+#define  PCI_VC_REG1_EVCC	0x7	/* extended VC count */
 #define PCI_VC_PORT_REG2	8
 #define  PCI_VC_REG2_32_PHASE	0x2
 #define  PCI_VC_REG2_64_PHASE	0x4
@@ -711,7 +711,7 @@
 #define  PCI_VNDR_HEADER_LEN(x)	(((x) >> 20) & 0xfff)
 
 /*
- * Hypertransport sub capability types
+ * HyperTransport sub capability types
  *
  * Unfortunately there are both 3 bit and 5 bit capability types defined
  * in the HT spec, catering for that is a little messy. You probably don't
@@ -739,8 +739,8 @@
 #define HT_CAPTYPE_DIRECT_ROUTE	0xB0	/* Direct routing configuration */
 #define HT_CAPTYPE_VCSET	0xB8	/* Virtual Channel configuration */
 #define HT_CAPTYPE_ERROR_RETRY	0xC0	/* Retry on error configuration */
-#define HT_CAPTYPE_GEN3		0xD0	/* Generation 3 hypertransport configuration */
-#define HT_CAPTYPE_PM		0xE0	/* Hypertransport powermanagement configuration */
+#define HT_CAPTYPE_GEN3		0xD0	/* Generation 3 HyperTransport configuration */
+#define HT_CAPTYPE_PM		0xE0	/* HyperTransport power management configuration */
 #define HT_CAP_SIZEOF_LONG	28	/* slave & primary */
 #define HT_CAP_SIZEOF_SHORT	24	/* host & secondary */
 
@@ -777,14 +777,14 @@
 #define PCI_PRI_ALLOC_REQ	0x0c	/* PRI max reqs allowed */
 #define PCI_EXT_CAP_PRI_SIZEOF	16
 
-/* PASID capability */
+/* Process Address Space ID */
 #define PCI_PASID_CAP		0x04    /* PASID feature register */
 #define  PCI_PASID_CAP_EXEC	0x02	/* Exec permissions Supported */
-#define  PCI_PASID_CAP_PRIV	0x04	/* Priviledge Mode Supported */
+#define  PCI_PASID_CAP_PRIV	0x04	/* Privilege Mode Supported */
 #define PCI_PASID_CTRL		0x06    /* PASID control register */
 #define  PCI_PASID_CTRL_ENABLE	0x01	/* Enable bit */
 #define  PCI_PASID_CTRL_EXEC	0x02	/* Exec permissions Enable */
-#define  PCI_PASID_CTRL_PRIV	0x04	/* Priviledge Mode Enable */
+#define  PCI_PASID_CTRL_PRIV	0x04	/* Privilege Mode Enable */
 #define PCI_EXT_CAP_PASID_SIZEOF	8
 
 /* Single Root I/O Virtualization */
@@ -839,22 +839,22 @@
 #define PCI_ACS_CTRL		0x06	/* ACS Control Register */
 #define PCI_ACS_EGRESS_CTL_V	0x08	/* ACS Egress Control Vector */
 
-#define PCI_VSEC_HDR		4	/* extended cap - vendor specific */
+#define PCI_VSEC_HDR		4	/* extended cap - vendor-specific */
 #define  PCI_VSEC_HDR_LEN_SHIFT	20	/* shift for length field */
 
-/* sata capability */
+/* SATA capability */
 #define PCI_SATA_REGS		4	/* SATA REGs specifier */
 #define  PCI_SATA_REGS_MASK	0xF	/* location - BAR#/inline */
 #define  PCI_SATA_REGS_INLINE	0xF	/* REGS in config space */
 #define PCI_SATA_SIZEOF_SHORT	8
 #define PCI_SATA_SIZEOF_LONG	16
 
-/* resizable BARs */
+/* Resizable BARs */
 #define PCI_REBAR_CTRL		8	/* control register */
 #define  PCI_REBAR_CTRL_NBAR_MASK	(7 << 5)	/* mask for # bars */
 #define  PCI_REBAR_CTRL_NBAR_SHIFT	5	/* shift for # bars */
 
-/* dynamic power allocation */
+/* Dynamic Power Allocation */
 #define PCI_DPA_CAP		4	/* capability register */
 #define  PCI_DPA_CAP_SUBSTATE_MASK	0x1F	/* # substates - 1 */
 #define PCI_DPA_BASE_SIZEOF	16	/* size with 0 substates */
-- 
cgit v1.2.3


From 65c5189a2b57b9aa1d89e4b79da39928257c9505 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 15 Nov 2013 08:57:26 -0800
Subject: pkt_sched: fq: warn users using defrate

Commit 7eec4174ff29 ("pkt_sched: fq: fix non TCP flows pacing")
obsoleted TCA_FQ_FLOW_DEFAULT_RATE without notice for the users.

Suggested by David Miller

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  4 +---
 net/sched/sch_fq.c             | 10 ++++------
 2 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 307f293477e8..885001b62c83 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -763,9 +763,7 @@ enum {
 
 	TCA_FQ_RATE_ENABLE,	/* enable/disable rate limiting */
 
-	TCA_FQ_FLOW_DEFAULT_RATE,/* for sockets with unspecified sk_rate,
-				  * use the following rate
-				  */
+	TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */
 
 	TCA_FQ_FLOW_MAX_RATE,	/* per flow max rate */
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index d4fa38e4af80..26906ab51c52 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -88,7 +88,6 @@ struct fq_sched_data {
 	struct fq_flow	internal;	/* for non classified or high prio packets */
 	u32		quantum;
 	u32		initial_quantum;
-	u32		flow_default_rate;/* rate per flow : bytes per second */
 	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
 	struct rb_root	*fq_root;
@@ -650,7 +649,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
 
 	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
-		q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
+		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
+				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
 
 	if (tb[TCA_FQ_FLOW_MAX_RATE])
 		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
@@ -699,7 +699,6 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->flow_plimit		= 100;
 	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
-	q->flow_default_rate	= 0;
 	q->flow_max_rate	= ~0U;
 	q->rate_enable		= 1;
 	q->new_flows.first	= NULL;
@@ -726,9 +725,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (opts == NULL)
 		goto nla_put_failure;
 
-	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore,
-	 * do not bother giving its value
-	 */
+	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+
 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
-- 
cgit v1.2.3


From f52ed89971adbe79b6438c459814034707b8ab91 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 15 Nov 2013 08:58:14 -0800
Subject: pkt_sched: fq: fix pacing for small frames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For performance reasons, sch_fq tried hard to not setup timers for every
sent packet, using a quantum based heuristic : A delay is setup only if
the flow exhausted its credit.

Problem is that application limited flows can refill their credit
for every queued packet, and they can evade pacing.

This problem can also be triggered when TCP flows use small MSS values,
as TSO auto sizing builds packets that are smaller than the default fq
quantum (3028 bytes)

This patch adds a 40 ms delay to guard flow credit refill.

Fixes: afe4fd062416 ("pkt_sched: fq: Fair Queue packet scheduler")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  3 +++
 net/sched/sch_fq.c             | 22 ++++++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 885001b62c83..a806687ad98f 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -768,6 +768,9 @@ enum {
 	TCA_FQ_FLOW_MAX_RATE,	/* per flow max rate */
 
 	TCA_FQ_BUCKETS_LOG,	/* log2(number of buckets) */
+
+	TCA_FQ_FLOW_REFILL_DELAY,	/* flow credit refill delay in usec */
+
 	__TCA_FQ_MAX
 };
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 26906ab51c52..95d843961907 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -88,6 +88,7 @@ struct fq_sched_data {
 	struct fq_flow	internal;	/* for non classified or high prio packets */
 	u32		quantum;
 	u32		initial_quantum;
+	u32		flow_refill_delay;
 	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
 	struct rb_root	*fq_root;
@@ -114,6 +115,7 @@ static struct fq_flow detached, throttled;
 static void fq_flow_set_detached(struct fq_flow *f)
 {
 	f->next = &detached;
+	f->age = jiffies;
 }
 
 static bool fq_flow_is_detached(const struct fq_flow *f)
@@ -366,17 +368,20 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	f->qlen++;
-	flow_queue_add(f, skb);
 	if (skb_is_retransmit(skb))
 		q->stat_tcp_retrans++;
 	sch->qstats.backlog += qdisc_pkt_len(skb);
 	if (fq_flow_is_detached(f)) {
 		fq_flow_add_tail(&q->new_flows, f);
-		if (q->quantum > f->credit)
-			f->credit = q->quantum;
+		if (time_after(jiffies, f->age + q->flow_refill_delay))
+			f->credit = max_t(u32, f->credit, q->quantum);
 		q->inactive_flows--;
 		qdisc_unthrottled(sch);
 	}
+
+	/* Note: this overwrites f->age */
+	flow_queue_add(f, skb);
+
 	if (unlikely(f == &q->internal)) {
 		q->stat_internal_packets++;
 		qdisc_unthrottled(sch);
@@ -454,7 +459,6 @@ begin:
 			fq_flow_add_tail(&q->old_flows, f);
 		} else {
 			fq_flow_set_detached(f);
-			f->age = jiffies;
 			q->inactive_flows++;
 		}
 		goto begin;
@@ -608,6 +612,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
 	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
 };
 
 static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -664,6 +669,12 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 			err = -EINVAL;
 	}
 
+	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
+		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
+
+		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
+	}
+
 	if (!err)
 		err = fq_resize(q, fq_log);
 
@@ -699,6 +710,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->flow_plimit		= 100;
 	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
+	q->flow_refill_delay	= msecs_to_jiffies(40);
 	q->flow_max_rate	= ~0U;
 	q->rate_enable		= 1;
 	q->new_flows.first	= NULL;
@@ -733,6 +745,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
 	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
+			jiffies_to_usecs(q->flow_refill_delay)) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From c0f8bd146a8b3e630798561c605f5669823107af Mon Sep 17 00:00:00 2001
From: Aurelien Jarno <aurelien@aurel32.net>
Date: Thu, 14 Nov 2013 15:16:19 +1100
Subject: UAPI: include <asm/byteorder.h> in linux/raid/md_p.h

linux/raid/md_p.h is using conditionals depending on endianess and fails
with an error if neither of __BIG_ENDIAN, __LITTLE_ENDIAN or
__BYTE_ORDER are defined, but it doesn't include any header which can
define these constants. This make this header unusable alone.

This patch adds a #include <asm/byteorder.h> at the beginning of this
header to make it usable alone. This is needed to compile klibc on MIPS.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/uapi/linux/raid/md_p.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index fe1a5406d4d9..f7cf7f351144 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -16,6 +16,7 @@
 #define _MD_P_H
 
 #include <linux/types.h>
+#include <asm/byteorder.h>
 
 /*
  * RAID superblock.
-- 
cgit v1.2.3


From 2ecf7536b2787580616d23b6507005d930975ca0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Nov 2013 15:19:33 +0100
Subject: quota/genetlink: use proper genetlink multicast APIs

The quota code is abusing the genetlink API and is using
its family ID as the multicast group ID, which is invalid
and may belong to somebody else (and likely will.)

Make the quota code use the correct API, but since this
is already used as-is by userspace, reserve a family ID
for this code and also reserve that group ID to not break
userspace assumptions.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/quota/netlink.c             | 17 +++++++++++++++--
 include/uapi/linux/genetlink.h |  1 +
 net/netlink/genetlink.c        | 10 ++++++++--
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 16e8abb7709b..aa22fe03b76c 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -11,13 +11,23 @@
 
 /* Netlink family structure for quota */
 static struct genl_family quota_genl_family = {
-	.id = GENL_ID_GENERATE,
+	/*
+	 * Needed due to multicast group ID abuse - old code assumed
+	 * the family ID was also a valid multicast group ID (which
+	 * isn't true) and userspace might thus rely on it. Assign a
+	 * static ID for this group to make dealing with that easier.
+	 */
+	.id = GENL_ID_VFS_DQUOT,
 	.hdrsize = 0,
 	.name = "VFS_DQUOT",
 	.version = 1,
 	.maxattr = QUOTA_NL_A_MAX,
 };
 
+static struct genl_multicast_group quota_mcgrp = {
+	.name = "events",
+};
+
 /**
  * quota_send_warning - Send warning to userspace about exceeded quota
  * @type: The quota type: USRQQUOTA, GRPQUOTA,...
@@ -78,7 +88,7 @@ void quota_send_warning(struct kqid qid, dev_t dev,
 		goto attr_err_out;
 	genlmsg_end(skb, msg_head);
 
-	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+	genlmsg_multicast(skb, 0, quota_mcgrp.id, GFP_NOFS);
 	return;
 attr_err_out:
 	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
@@ -92,6 +102,9 @@ static int __init quota_init(void)
 	if (genl_register_family(&quota_genl_family) != 0)
 		printk(KERN_ERR
 		       "VFS: Failed to create quota netlink interface.\n");
+	if (genl_register_mc_group(&quota_genl_family, &quota_mcgrp))
+		printk(KERN_ERR
+		       "VFS: Failed to register quota mcast group.\n");
 	return 0;
 };
 
diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index c880a417d8a9..1af72d8228e0 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -27,6 +27,7 @@ struct genlmsghdr {
  */
 #define GENL_ID_GENERATE	0
 #define GENL_ID_CTRL		NLMSG_MIN_TYPE
+#define GENL_ID_VFS_DQUOT	(NLMSG_MIN_TYPE + 1)
 
 /**************************************************************************
  * Controller
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 353909d46dda..bee91a7527a5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -69,8 +69,11 @@ static struct list_head family_ht[GENL_FAM_TAB_SIZE];
  * abuses the API and thinks it can statically use group 1.
  * That group will typically conflict with other groups that
  * any proper users use.
+ * Bit 17 is marked as already used since the VFS quota code
+ * also abused this API and relied on family == group ID, we
+ * cater to that by giving it a static family and group ID.
  */
-static unsigned long mc_group_start = 0x3;
+static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_VFS_DQUOT);
 static unsigned long *mc_groups = &mc_group_start;
 static unsigned long mc_groups_longs = 1;
 
@@ -130,7 +133,8 @@ static u16 genl_generate_id(void)
 	int i;
 
 	for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
-		if (!genl_family_find_byid(id_gen_idx))
+		if (id_gen_idx != GENL_ID_VFS_DQUOT &&
+		    !genl_family_find_byid(id_gen_idx))
 			return id_gen_idx;
 		if (++id_gen_idx > GENL_MAX_ID)
 			id_gen_idx = GENL_MIN_ID;
@@ -169,6 +173,8 @@ int genl_register_mc_group(struct genl_family *family,
 		id = GENL_ID_CTRL;
 	else if (strcmp(family->name, "NET_DM") == 0)
 		id = 1;
+	else if (strcmp(family->name, "VFS_DQUOT") == 0)
+		id = GENL_ID_VFS_DQUOT;
 	else
 		id = find_first_zero_bit(mc_groups,
 					 mc_groups_longs * BITS_PER_LONG);
-- 
cgit v1.2.3


From 358f24704f2f016af7d504b357cdf32606091d07 Mon Sep 17 00:00:00 2001
From: Pali Rohár <pali.rohar@gmail.com>
Date: Tue, 26 Nov 2013 11:17:02 -0800
Subject: Input: add key code for ambient light sensor button
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Many notebooks have a special button for enabling/disabling ambient
light sensor.

Signed-off-by: Pali Rohár <pali.rohar@gmail.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/uapi/linux/input.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index a3726275876d..7bacdb514377 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -719,6 +719,8 @@ struct input_keymap_entry {
 #define BTN_DPAD_LEFT		0x222
 #define BTN_DPAD_RIGHT		0x223
 
+#define KEY_ALS_TOGGLE		0x230	/* Ambient light sensor */
+
 #define BTN_TRIGGER_HAPPY		0x2c0
 #define BTN_TRIGGER_HAPPY1		0x2c0
 #define BTN_TRIGGER_HAPPY2		0x2c1
-- 
cgit v1.2.3


From 1e31aa9270daab40c7aef9d5488982e3475b87ef Mon Sep 17 00:00:00 2001
From: Ashutosh Dixit <ashutosh.dixit@intel.com>
Date: Wed, 27 Nov 2013 08:58:41 -0800
Subject: misc: mic: Fix user space namespace pollution from mic_common.h.

Avoid declaring ALIGN() and __aligned() in
include/uapi/linux/mic_common.h since they pollute user space
namespace. Also, mic_aligned_size() can be simply replaced simply by
sizeof() since all structures where mic_aligned_size() is used are
declared using __attribute__ ((aligned(8)));

--
>From mail from H Peter Anvin about this:

On Fri, Nov 08, 2013 H Peter Anvin <h.peter.anvin@intel.com> wrote:
Subject: Namespace pollution in mic_common.h

This puts two macros, ALIGN() and __aligned(), into arbitrary user space
namespace.  This really isn't safe or acceptable, especially since those
symbols are highly generic.
...
When these structures are forced-aligned, they will in fact have padding
automatically added by the compiler to an 8-byte boundary anyway, so
mic_aligned_size() does nothing.
...

Reported-by: H Peter Anvin <h.peter.anvin@intel.com>
Reviewed-by: Sudeep Dutt <sudeep.dutt@intel.com>
Signed-off-by: Nikhil Rao <nikhil.rao@intel.com>
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/mic/mpssd/mpssd.c    |  4 ++--
 drivers/misc/mic/card/mic_virtio.c |  4 ++--
 drivers/misc/mic/card/mic_virtio.h |  7 +++----
 drivers/misc/mic/host/mic_virtio.c |  2 +-
 include/uapi/linux/mic_common.h    | 26 +++++++++-----------------
 5 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/mic/mpssd/mpssd.c b/Documentation/mic/mpssd/mpssd.c
index 5c7fdda56f0e..91f9e8e4ef7b 100644
--- a/Documentation/mic/mpssd/mpssd.c
+++ b/Documentation/mic/mpssd/mpssd.c
@@ -313,7 +313,7 @@ static struct mic_device_desc *get_device_desc(struct mic_info *mic, int type)
 	int i;
 	void *dp = get_dp(mic, type);
 
-	for (i = mic_aligned_size(struct mic_bootparam); i < PAGE_SIZE;
+	for (i = sizeof(struct mic_bootparam); i < PAGE_SIZE;
 		i += mic_total_desc_size(d)) {
 		d = dp + i;
 
@@ -520,7 +520,7 @@ static void *
 virtio_net(void *arg)
 {
 	static __u8 vnet_hdr[2][sizeof(struct virtio_net_hdr)];
-	static __u8 vnet_buf[2][MAX_NET_PKT_SIZE] __aligned(64);
+	static __u8 vnet_buf[2][MAX_NET_PKT_SIZE] __attribute__ ((aligned(64)));
 	struct iovec vnet_iov[2][2] = {
 		{ { .iov_base = vnet_hdr[0], .iov_len = sizeof(vnet_hdr[0]) },
 		  { .iov_base = vnet_buf[0], .iov_len = sizeof(vnet_buf[0]) } },
diff --git a/drivers/misc/mic/card/mic_virtio.c b/drivers/misc/mic/card/mic_virtio.c
index 4dce912f9ea6..20c567b1760b 100644
--- a/drivers/misc/mic/card/mic_virtio.c
+++ b/drivers/misc/mic/card/mic_virtio.c
@@ -520,8 +520,8 @@ static void mic_scan_devices(struct mic_driver *mdrv, bool remove)
 	struct device *dev;
 	int ret;
 
-	for (i = mic_aligned_size(struct mic_bootparam);
-		i < MIC_DP_SIZE; i += mic_total_desc_size(d)) {
+	for (i = sizeof(struct mic_bootparam); i < MIC_DP_SIZE;
+		i += mic_total_desc_size(d)) {
 		d = mdrv->dp + i;
 		dc = (void __iomem *)d + mic_aligned_desc_size(d);
 		/*
diff --git a/drivers/misc/mic/card/mic_virtio.h b/drivers/misc/mic/card/mic_virtio.h
index 2c5c22c93ba8..d0407ba53bb7 100644
--- a/drivers/misc/mic/card/mic_virtio.h
+++ b/drivers/misc/mic/card/mic_virtio.h
@@ -42,8 +42,8 @@
 
 static inline unsigned mic_desc_size(struct mic_device_desc __iomem *desc)
 {
-	return mic_aligned_size(*desc)
-		+ ioread8(&desc->num_vq) * mic_aligned_size(struct mic_vqconfig)
+	return sizeof(*desc)
+		+ ioread8(&desc->num_vq) * sizeof(struct mic_vqconfig)
 		+ ioread8(&desc->feature_len) * 2
 		+ ioread8(&desc->config_len);
 }
@@ -67,8 +67,7 @@ mic_vq_configspace(struct mic_device_desc __iomem *desc)
 }
 static inline unsigned mic_total_desc_size(struct mic_device_desc __iomem *desc)
 {
-	return mic_aligned_desc_size(desc) +
-		mic_aligned_size(struct mic_device_ctrl);
+	return mic_aligned_desc_size(desc) + sizeof(struct mic_device_ctrl);
 }
 
 int mic_devices_init(struct mic_driver *mdrv);
diff --git a/drivers/misc/mic/host/mic_virtio.c b/drivers/misc/mic/host/mic_virtio.c
index 945a3d0fe227..efe8da4401e9 100644
--- a/drivers/misc/mic/host/mic_virtio.c
+++ b/drivers/misc/mic/host/mic_virtio.c
@@ -467,7 +467,7 @@ static int mic_copy_dp_entry(struct mic_vdev *mvdev,
 	}
 
 	/* Find the first free device page entry */
-	for (i = mic_aligned_size(struct mic_bootparam);
+	for (i = sizeof(struct mic_bootparam);
 		i < MIC_DP_SIZE - mic_total_desc_size(dd_config);
 		i += mic_total_desc_size(devp)) {
 		devp = mdev->dp + i;
diff --git a/include/uapi/linux/mic_common.h b/include/uapi/linux/mic_common.h
index 17e7d95e4f53..d0ec46d88d2a 100644
--- a/include/uapi/linux/mic_common.h
+++ b/include/uapi/linux/mic_common.h
@@ -23,12 +23,7 @@
 
 #include <linux/virtio_ring.h>
 
-#ifndef __KERNEL__
-#define ALIGN(a, x)	(((a) + (x) - 1) & ~((x) - 1))
-#define __aligned(x)	__attribute__ ((aligned(x)))
-#endif
-
-#define mic_aligned_size(x) ALIGN(sizeof(x), 8)
+#define __mic_align(a, x) (((a) + (x) - 1) & ~((x) - 1))
 
 /**
  * struct mic_device_desc: Virtio device information shared between the
@@ -49,7 +44,7 @@ struct mic_device_desc {
 	__u8 config_len;
 	__u8 status;
 	__u64 config[0];
-} __aligned(8);
+} __attribute__ ((aligned(8)));
 
 /**
  * struct mic_device_ctrl: Per virtio device information in the device page
@@ -74,7 +69,7 @@ struct mic_device_ctrl {
 	__u8 used_address_updated;
 	__s8 c2h_vdev_db;
 	__s8 h2c_vdev_db;
-} __aligned(8);
+} __attribute__ ((aligned(8)));
 
 /**
  * struct mic_bootparam: Virtio device independent information in device page
@@ -93,7 +88,7 @@ struct mic_bootparam {
 	__s8 h2c_config_db;
 	__u8 shutdown_status;
 	__u8 shutdown_card;
-} __aligned(8);
+} __attribute__ ((aligned(8)));
 
 /**
  * struct mic_device_page: High level representation of the device page
@@ -119,7 +114,7 @@ struct mic_vqconfig {
 	__u64 address;
 	__u64 used_address;
 	__u16 num;
-} __aligned(8);
+} __attribute__ ((aligned(8)));
 
 /*
  * The alignment to use between consumer and producer parts of vring.
@@ -173,15 +168,13 @@ struct mic_vring {
 	int len;
 };
 
-#define mic_aligned_desc_size(d) ALIGN(mic_desc_size(d), 8)
+#define mic_aligned_desc_size(d) __mic_align(mic_desc_size(d), 8)
 
 #ifndef INTEL_MIC_CARD
 static inline unsigned mic_desc_size(const struct mic_device_desc *desc)
 {
-	return mic_aligned_size(*desc)
-		+ desc->num_vq * mic_aligned_size(struct mic_vqconfig)
-		+ desc->feature_len * 2
-		+ desc->config_len;
+	return sizeof(*desc) + desc->num_vq * sizeof(struct mic_vqconfig)
+		+ desc->feature_len * 2 + desc->config_len;
 }
 
 static inline struct mic_vqconfig *
@@ -201,8 +194,7 @@ static inline __u8 *mic_vq_configspace(const struct mic_device_desc *desc)
 }
 static inline unsigned mic_total_desc_size(struct mic_device_desc *desc)
 {
-	return mic_aligned_desc_size(desc) +
-		mic_aligned_size(struct mic_device_ctrl);
+	return mic_aligned_desc_size(desc) + sizeof(struct mic_device_ctrl);
 }
 #endif
 
-- 
cgit v1.2.3


From 173c07278763850bfee57eec442dce38855d6f13 Mon Sep 17 00:00:00 2001
From: Ashutosh Dixit <ashutosh.dixit@intel.com>
Date: Wed, 27 Nov 2013 08:58:42 -0800
Subject: misc: mic: Fix endianness issues.

Endianness issues are now consistent as per the documentation in
host/mic_virtio.h. Sparse warnings related to endianness are also fixed.
Note that the MIC driver implementation assumes that the host can be
both BE or LE whereas the card is always LE.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Sudeep Dutt <sudeep.dutt@intel.com>
Reviewed-by: Nikhil Rao <nikhil.rao@intel.com>
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/mic/mpssd/mpssd.c    |  8 ++++----
 drivers/misc/mic/card/mic_virtio.c | 14 +++++++-------
 drivers/misc/mic/host/mic_boot.c   |  2 +-
 drivers/misc/mic/host/mic_virtio.c | 14 +++++++-------
 drivers/misc/mic/host/mic_x100.c   |  4 ++--
 include/uapi/linux/mic_common.h    | 14 +++++++-------
 6 files changed, 28 insertions(+), 28 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/mic/mpssd/mpssd.c b/Documentation/mic/mpssd/mpssd.c
index 91f9e8e4ef7b..4d17487d5ad9 100644
--- a/Documentation/mic/mpssd/mpssd.c
+++ b/Documentation/mic/mpssd/mpssd.c
@@ -445,8 +445,8 @@ init_vr(struct mic_info *mic, int fd, int type,
 		__func__, mic->name, vr0->va, vr0->info, vr_size,
 		vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN));
 	mpsslog("magic 0x%x expected 0x%x\n",
-		vr0->info->magic, MIC_MAGIC + type);
-	assert(vr0->info->magic == MIC_MAGIC + type);
+		le32toh(vr0->info->magic), MIC_MAGIC + type);
+	assert(le32toh(vr0->info->magic) == MIC_MAGIC + type);
 	if (vr1) {
 		vr1->va = (struct mic_vring *)
 			&va[MIC_DEVICE_PAGE_END + vr_size];
@@ -458,8 +458,8 @@ init_vr(struct mic_info *mic, int fd, int type,
 			__func__, mic->name, vr1->va, vr1->info, vr_size,
 			vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN));
 		mpsslog("magic 0x%x expected 0x%x\n",
-			vr1->info->magic, MIC_MAGIC + type + 1);
-		assert(vr1->info->magic == MIC_MAGIC + type + 1);
+			le32toh(vr1->info->magic), MIC_MAGIC + type + 1);
+		assert(le32toh(vr1->info->magic) == MIC_MAGIC + type + 1);
 	}
 done:
 	return va;
diff --git a/drivers/misc/mic/card/mic_virtio.c b/drivers/misc/mic/card/mic_virtio.c
index 20c567b1760b..ca0445fa3d83 100644
--- a/drivers/misc/mic/card/mic_virtio.c
+++ b/drivers/misc/mic/card/mic_virtio.c
@@ -248,17 +248,16 @@ static struct virtqueue *mic_find_vq(struct virtio_device *vdev,
 	/* First assign the vring's allocated in host memory */
 	vqconfig = mic_vq_config(mvdev->desc) + index;
 	memcpy_fromio(&config, vqconfig, sizeof(config));
-	_vr_size = vring_size(config.num, MIC_VIRTIO_RING_ALIGN);
+	_vr_size = vring_size(le16_to_cpu(config.num), MIC_VIRTIO_RING_ALIGN);
 	vr_size = PAGE_ALIGN(_vr_size + sizeof(struct _mic_vring_info));
-	va = mic_card_map(mvdev->mdev, config.address, vr_size);
+	va = mic_card_map(mvdev->mdev, le64_to_cpu(config.address), vr_size);
 	if (!va)
 		return ERR_PTR(-ENOMEM);
 	mvdev->vr[index] = va;
 	memset_io(va, 0x0, _vr_size);
-	vq = vring_new_virtqueue(index,
-				config.num, MIC_VIRTIO_RING_ALIGN, vdev,
-				false,
-				va, mic_notify, callback, name);
+	vq = vring_new_virtqueue(index, le16_to_cpu(config.num),
+				 MIC_VIRTIO_RING_ALIGN, vdev, false, va,
+				 mic_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto unmap;
@@ -273,7 +272,8 @@ static struct virtqueue *mic_find_vq(struct virtio_device *vdev,
 
 	/* Allocate and reassign used ring now */
 	mvdev->used_size[index] = PAGE_ALIGN(sizeof(__u16) * 3 +
-			sizeof(struct vring_used_elem) * config.num);
+					     sizeof(struct vring_used_elem) *
+					     le16_to_cpu(config.num));
 	used = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 					get_order(mvdev->used_size[index]));
 	if (!used) {
diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c
index 7558d9186438..b75c6b5cc20f 100644
--- a/drivers/misc/mic/host/mic_boot.c
+++ b/drivers/misc/mic/host/mic_boot.c
@@ -62,7 +62,7 @@ void mic_bootparam_init(struct mic_device *mdev)
 {
 	struct mic_bootparam *bootparam = mdev->dp;
 
-	bootparam->magic = MIC_MAGIC;
+	bootparam->magic = cpu_to_le32(MIC_MAGIC);
 	bootparam->c2h_shutdown_db = mdev->shutdown_db;
 	bootparam->h2c_shutdown_db = -1;
 	bootparam->h2c_config_db = -1;
diff --git a/drivers/misc/mic/host/mic_virtio.c b/drivers/misc/mic/host/mic_virtio.c
index efe8da4401e9..453d7409da9c 100644
--- a/drivers/misc/mic/host/mic_virtio.c
+++ b/drivers/misc/mic/host/mic_virtio.c
@@ -293,8 +293,8 @@ static void mic_virtio_init_post(struct mic_vdev *mvdev)
 			continue;
 		}
 		mvdev->mvr[i].vrh.vring.used =
-			mvdev->mdev->aper.va +
-			le64_to_cpu(vqconfig[i].used_address);
+			mvdev->mdev->aper.va
+			+ le64_to_cpu(vqconfig[i].used_address);
 	}
 
 	mvdev->dc->used_address_updated = 0;
@@ -525,6 +525,7 @@ int mic_virtio_add_device(struct mic_vdev *mvdev,
 	char irqname[10];
 	struct mic_bootparam *bootparam = mdev->dp;
 	u16 num;
+	dma_addr_t vr_addr;
 
 	mutex_lock(&mdev->mic_mutex);
 
@@ -559,17 +560,16 @@ int mic_virtio_add_device(struct mic_vdev *mvdev,
 		}
 		vr->len = vr_size;
 		vr->info = vr->va + vring_size(num, MIC_VIRTIO_RING_ALIGN);
-		vr->info->magic = MIC_MAGIC + mvdev->virtio_id + i;
-		vqconfig[i].address = mic_map_single(mdev,
-			vr->va, vr_size);
-		if (mic_map_error(vqconfig[i].address)) {
+		vr->info->magic = cpu_to_le32(MIC_MAGIC + mvdev->virtio_id + i);
+		vr_addr = mic_map_single(mdev, vr->va, vr_size);
+		if (mic_map_error(vr_addr)) {
 			free_pages((unsigned long)vr->va, get_order(vr_size));
 			ret = -ENOMEM;
 			dev_err(mic_dev(mvdev), "%s %d err %d\n",
 				__func__, __LINE__, ret);
 			goto err;
 		}
-		vqconfig[i].address = cpu_to_le64(vqconfig[i].address);
+		vqconfig[i].address = cpu_to_le64(vr_addr);
 
 		vring_init(&vr->vr, num, vr->va, MIC_VIRTIO_RING_ALIGN);
 		ret = vringh_init_kern(&mvr->vrh,
diff --git a/drivers/misc/mic/host/mic_x100.c b/drivers/misc/mic/host/mic_x100.c
index 81e9541b784c..0dfa8a81436e 100644
--- a/drivers/misc/mic/host/mic_x100.c
+++ b/drivers/misc/mic/host/mic_x100.c
@@ -397,8 +397,8 @@ mic_x100_load_ramdisk(struct mic_device *mdev)
 	 * so copy over the ramdisk @ 128M.
 	 */
 	memcpy_toio(mdev->aper.va + (mdev->bootaddr << 1), fw->data, fw->size);
-	iowrite32(cpu_to_le32(mdev->bootaddr << 1), &bp->hdr.ramdisk_image);
-	iowrite32(cpu_to_le32(fw->size), &bp->hdr.ramdisk_size);
+	iowrite32(mdev->bootaddr << 1, &bp->hdr.ramdisk_image);
+	iowrite32(fw->size, &bp->hdr.ramdisk_size);
 	release_firmware(fw);
 error:
 	return rc;
diff --git a/include/uapi/linux/mic_common.h b/include/uapi/linux/mic_common.h
index d0ec46d88d2a..6eb40244e019 100644
--- a/include/uapi/linux/mic_common.h
+++ b/include/uapi/linux/mic_common.h
@@ -43,7 +43,7 @@ struct mic_device_desc {
 	__u8 feature_len;
 	__u8 config_len;
 	__u8 status;
-	__u64 config[0];
+	__le64 config[0];
 } __attribute__ ((aligned(8)));
 
 /**
@@ -61,7 +61,7 @@ struct mic_device_desc {
  * @h2c_vdev_db: The doorbell number to be used by host. Set by guest.
  */
 struct mic_device_ctrl {
-	__u64 vdev;
+	__le64 vdev;
 	__u8 config_change;
 	__u8 vdev_reset;
 	__u8 guest_ack;
@@ -82,7 +82,7 @@ struct mic_device_ctrl {
  * @shutdown_card: Set to 1 by the host when a card shutdown is initiated
  */
 struct mic_bootparam {
-	__u32 magic;
+	__le32 magic;
 	__s8 c2h_shutdown_db;
 	__s8 h2c_shutdown_db;
 	__s8 h2c_config_db;
@@ -111,9 +111,9 @@ struct mic_device_page {
  * @num: The number of entries in the virtio_ring
  */
 struct mic_vqconfig {
-	__u64 address;
-	__u64 used_address;
-	__u16 num;
+	__le64 address;
+	__le64 used_address;
+	__le16 num;
 } __attribute__ ((aligned(8)));
 
 /*
@@ -149,7 +149,7 @@ struct mic_vqconfig {
  */
 struct _mic_vring_info {
 	__u16 avail_idx;
-	int magic;
+	__le32 magic;
 };
 
 /**
-- 
cgit v1.2.3


From 31e20bad8d5871c6f8673c28ea26ab7a0e520086 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 28 Nov 2013 18:31:05 +0100
Subject: diag: warn about missing first netlink attribute

The first netlink attribute (value 0) must always be defined as none/unspec.
This is correctly done in inet_diag.h, but other diag interfaces are wrong.

Because we cannot change an existing API, I add a comment to point the mistake
and avoid to propagate it in a new diag API in the future.

CC: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netlink_diag.h | 1 +
 include/uapi/linux/packet_diag.h  | 1 +
 include/uapi/linux/unix_diag.h    | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
index 4e31db4eea41..f2159d30d1f5 100644
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -33,6 +33,7 @@ struct netlink_diag_ring {
 };
 
 enum {
+	/* NETLINK_DIAG_NONE, standard nl API requires this attribute!  */
 	NETLINK_DIAG_MEMINFO,
 	NETLINK_DIAG_GROUPS,
 	NETLINK_DIAG_RX_RING,
diff --git a/include/uapi/linux/packet_diag.h b/include/uapi/linux/packet_diag.h
index b2cc0cd9c4d9..d08c63f3dd6f 100644
--- a/include/uapi/linux/packet_diag.h
+++ b/include/uapi/linux/packet_diag.h
@@ -29,6 +29,7 @@ struct packet_diag_msg {
 };
 
 enum {
+	/* PACKET_DIAG_NONE, standard nl API requires this attribute!  */
 	PACKET_DIAG_INFO,
 	PACKET_DIAG_MCLIST,
 	PACKET_DIAG_RX_RING,
diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h
index b9e2a6a7446f..1eb0b8dd1830 100644
--- a/include/uapi/linux/unix_diag.h
+++ b/include/uapi/linux/unix_diag.h
@@ -31,6 +31,7 @@ struct unix_diag_msg {
 };
 
 enum {
+	/* UNIX_DIAG_NONE, standard nl API requires this attribute!  */
 	UNIX_DIAG_NAME,
 	UNIX_DIAG_VFS,
 	UNIX_DIAG_PEER,
-- 
cgit v1.2.3


From 5e53e689b737526308db2b5c9f56e9d0371a1676 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Sun, 24 Nov 2013 21:09:26 +0100
Subject: genetlink/pmcraid: use proper genetlink multicast API

The pmcraid driver is abusing the genetlink API and is using its
family ID as the multicast group ID, which is invalid and may
belong to somebody else (and likely will.)

Make it use the correct API, but since this may already be used
as-is by userspace, reserve a family ID for this code and also
reserve that group ID to not break userspace assumptions.

My previous patch broke event delivery in the driver as I missed
that it wasn't using the right API and forgot to update it later
in my series.

While changing this, I noticed that the genetlink code could use
the static group ID instead of a strcmp(), so also do that for
the VFS_DQUOT family.

Cc: Anil Ravindranath <anil_ravindranath@pmc-sierra.com>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/pmcraid.c         | 20 +++++++++++++++-----
 include/uapi/linux/genetlink.h |  1 +
 net/netlink/genetlink.c        | 11 +++++++++--
 3 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index bd6f743d87a7..892ea6161376 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -1404,11 +1404,22 @@ enum {
 };
 #define PMCRAID_AEN_CMD_MAX (__PMCRAID_AEN_CMD_MAX - 1)
 
+static struct genl_multicast_group pmcraid_mcgrps[] = {
+	{ .name = "events", /* not really used - see ID discussion below */ },
+};
+
 static struct genl_family pmcraid_event_family = {
-	.id = GENL_ID_GENERATE,
+	/*
+	 * Due to prior multicast group abuse (the code having assumed that
+	 * the family ID can be used as a multicast group ID) we need to
+	 * statically allocate a family (and thus group) ID.
+	 */
+	.id = GENL_ID_PMCRAID,
 	.name = "pmcraid",
 	.version = 1,
-	.maxattr = PMCRAID_AEN_ATTR_MAX
+	.maxattr = PMCRAID_AEN_ATTR_MAX,
+	.mcgrps = pmcraid_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(pmcraid_mcgrps),
 };
 
 /**
@@ -1511,9 +1522,8 @@ static int pmcraid_notify_aen(
 		return result;
 	}
 
-	result =
-		genlmsg_multicast(&pmcraid_event_family, skb, 0,
-				  pmcraid_event_family.id, GFP_ATOMIC);
+	result = genlmsg_multicast(&pmcraid_event_family, skb,
+				   0, 0, GFP_ATOMIC);
 
 	/* If there are no listeners, genlmsg_multicast may return non-zero
 	 * value.
diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index 1af72d8228e0..c3363ba1ae05 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -28,6 +28,7 @@ struct genlmsghdr {
 #define GENL_ID_GENERATE	0
 #define GENL_ID_CTRL		NLMSG_MIN_TYPE
 #define GENL_ID_VFS_DQUOT	(NLMSG_MIN_TYPE + 1)
+#define GENL_ID_PMCRAID		(NLMSG_MIN_TYPE + 2)
 
 /**************************************************************************
  * Controller
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 803206e82450..713671ae45af 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -74,9 +74,12 @@ static struct list_head family_ht[GENL_FAM_TAB_SIZE];
  * Bit 17 is marked as already used since the VFS quota code
  * also abused this API and relied on family == group ID, we
  * cater to that by giving it a static family and group ID.
+ * Bit 18 is marked as already used since the PMCRAID driver
+ * did the same thing as the VFS quota code (maybe copied?)
  */
 static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
-				      BIT(GENL_ID_VFS_DQUOT);
+				      BIT(GENL_ID_VFS_DQUOT) |
+				      BIT(GENL_ID_PMCRAID);
 static unsigned long *mc_groups = &mc_group_start;
 static unsigned long mc_groups_longs = 1;
 
@@ -139,6 +142,7 @@ static u16 genl_generate_id(void)
 
 	for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
 		if (id_gen_idx != GENL_ID_VFS_DQUOT &&
+		    id_gen_idx != GENL_ID_PMCRAID &&
 		    !genl_family_find_byid(id_gen_idx))
 			return id_gen_idx;
 		if (++id_gen_idx > GENL_MAX_ID)
@@ -236,9 +240,12 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
 	} else if (strcmp(family->name, "NET_DM") == 0) {
 		first_id = 1;
 		BUG_ON(n_groups != 1);
-	} else if (strcmp(family->name, "VFS_DQUOT") == 0) {
+	} else if (family->id == GENL_ID_VFS_DQUOT) {
 		first_id = GENL_ID_VFS_DQUOT;
 		BUG_ON(n_groups != 1);
+	} else if (family->id == GENL_ID_PMCRAID) {
+		first_id = GENL_ID_PMCRAID;
+		BUG_ON(n_groups != 1);
 	} else {
 		groups_allocated = true;
 		err = genl_allocate_reserve_groups(n_groups, &first_id);
-- 
cgit v1.2.3


From 98bf8362220af717862b8262b21348774890b7b4 Mon Sep 17 00:00:00 2001
From: Arvid Brodin <arvid.brodin@alten.se>
Date: Fri, 29 Nov 2013 23:38:16 +0100
Subject: net/hsr: Support iproute print_opt ('ip -details ...')

This implements the rtnl_link_ops fill_info routine for HSR.

Signed-off-by: Arvid Brodin <arvid.brodin@alten.se>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h |  4 +++-
 net/hsr/hsr_netlink.c        | 28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b78566f59aba..6db460121f84 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -488,7 +488,9 @@ enum {
 	IFLA_HSR_UNSPEC,
 	IFLA_HSR_SLAVE1,
 	IFLA_HSR_SLAVE2,
-	IFLA_HSR_MULTICAST_SPEC,
+	IFLA_HSR_MULTICAST_SPEC,	/* Last byte of supervision addr */
+	IFLA_HSR_SUPERVISION_ADDR,	/* Supervision frame multicast addr */
+	IFLA_HSR_SEQ_NR,
 	__IFLA_HSR_MAX,
 };
 
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 5325af85eea6..01a5261ac7a5 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -23,6 +23,8 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
 	[IFLA_HSR_SLAVE1]		= { .type = NLA_U32 },
 	[IFLA_HSR_SLAVE2]		= { .type = NLA_U32 },
 	[IFLA_HSR_MULTICAST_SPEC]	= { .type = NLA_U8 },
+	[IFLA_HSR_SUPERVISION_ADDR]	= { .type = NLA_BINARY, .len = ETH_ALEN },
+	[IFLA_HSR_SEQ_NR]		= { .type = NLA_U16 },
 };
 
 
@@ -59,6 +61,31 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
 	return hsr_dev_finalize(dev, link, multicast_spec);
 }
 
+static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct hsr_priv *hsr_priv;
+
+	hsr_priv = netdev_priv(dev);
+
+	if (hsr_priv->slave[0])
+		if (nla_put_u32(skb, IFLA_HSR_SLAVE1, hsr_priv->slave[0]->ifindex))
+			goto nla_put_failure;
+
+	if (hsr_priv->slave[1])
+		if (nla_put_u32(skb, IFLA_HSR_SLAVE2, hsr_priv->slave[1]->ifindex))
+			goto nla_put_failure;
+
+	if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN,
+		    hsr_priv->sup_multicast_addr) ||
+	    nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr_priv->sequence_nr))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static struct rtnl_link_ops hsr_link_ops __read_mostly = {
 	.kind		= "hsr",
 	.maxtype	= IFLA_HSR_MAX,
@@ -66,6 +93,7 @@ static struct rtnl_link_ops hsr_link_ops __read_mostly = {
 	.priv_size	= sizeof(struct hsr_priv),
 	.setup		= hsr_dev_setup,
 	.newlink	= hsr_newlink,
+	.fill_info	= hsr_fill_info,
 };
 
 
-- 
cgit v1.2.3


From 95f19f658ce177387345b51bd48fed7b9cd7d4e8 Mon Sep 17 00:00:00 2001
From: Amit Pundir <amit.pundir@linaro.org>
Date: Fri, 15 Nov 2013 12:56:31 +0530
Subject: epoll: drop EPOLLWAKEUP if PM_SLEEP is disabled

Drop EPOLLWAKEUP from epoll events mask if CONFIG_PM_SLEEP is disabled.

Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 fs/eventpoll.c                 |  3 +--
 include/uapi/linux/eventpoll.h | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 79b65c3b9e87..8b5e2584c840 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,8 +1852,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		goto error_tgt_fput;
 
 	/* Check if EPOLLWAKEUP is allowed */
-	if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
-		epds.events &= ~EPOLLWAKEUP;
+	ep_take_care_of_epollwakeup(&epds);
 
 	/*
 	 * We have to check that the file structure underneath the file descriptor
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 2c267bcbb85c..bc81fb2e1f0e 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -61,5 +61,16 @@ struct epoll_event {
 	__u64 data;
 } EPOLL_PACKED;
 
-
+#ifdef CONFIG_PM_SLEEP
+static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
+{
+	if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
+		epev->events &= ~EPOLLWAKEUP;
+}
+#else
+static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
+{
+	epev->events &= ~EPOLLWAKEUP;
+}
+#endif
 #endif /* _UAPI_LINUX_EVENTPOLL_H */
-- 
cgit v1.2.3


From 3b5a7ab40c2a1bfd89dd5b4bbfe98f8692d5fd62 Mon Sep 17 00:00:00 2001
From: Ping Cheng <pinglinux@gmail.com>
Date: Thu, 5 Dec 2013 12:42:09 -0800
Subject: Input: add SW_MUTE_DEVICE switch definition

Some devices, such as new Intuos series tablets, have a hardware switch to
turn touch data on/off. To report the state, SW_MUTE_DEVICE is added
in include/uapi/linux/input.h.

Reviewed_by: Chris Bagwell <chris@cnpbagwell.com>
Acked-by: Peter Hutterer <peter.hutterer@who-t.net>
Tested-by: Jason Gerecke <killertofu@gmail.com>
Signed-off-by: Ping Cheng <pingc@wacom.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/uapi/linux/input.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 7bacdb514377..ecc88592ecbe 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -858,6 +858,7 @@ struct input_keymap_entry {
 #define SW_FRONT_PROXIMITY	0x0b  /* set = front proximity sensor active */
 #define SW_ROTATE_LOCK		0x0c  /* set = rotate locked/disabled */
 #define SW_LINEIN_INSERT	0x0d  /* set = inserted */
+#define SW_MUTE_DEVICE		0x0e  /* set = device disabled */
 #define SW_MAX			0x0f
 #define SW_CNT			(SW_MAX+1)
 
-- 
cgit v1.2.3


From 2a4d81547b88b4ff566d4b4f9c0e36285ed07634 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Sun, 15 Dec 2013 21:00:48 -0800
Subject: Input: define KEY_WWAN for Wireless WAN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some devices with support for mobile networks may have buttons for
enabling/disabling such connection. An example can be Linksys router 54G3G.
We already have KEY_BLUETOOTH, KEY_WLAN and KEY_UWB so it makes sense to
add KEY_WWAN as well.  As we already have KEY_WIMAX, use it's value for
KEY_WWAN and make it an alias.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/uapi/linux/input.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index ecc88592ecbe..bd24470d24a2 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -464,7 +464,8 @@ struct input_keymap_entry {
 #define KEY_BRIGHTNESS_ZERO	244	/* brightness off, use ambient */
 #define KEY_DISPLAY_OFF		245	/* display device to off state */
 
-#define KEY_WIMAX		246
+#define KEY_WWAN		246	/* Wireless WAN (LTE, UMTS, GSM, etc.) */
+#define KEY_WIMAX		KEY_WWAN
 #define KEY_RFKILL		247	/* Key that controls all radios */
 
 #define KEY_MICMUTE		248	/* Mute / unmute the microphone */
-- 
cgit v1.2.3


From 189b84fb54490ae24111124346a8e63f8e019385 Mon Sep 17 00:00:00 2001
From: Vince Weaver <vince@deater.net>
Date: Fri, 13 Dec 2013 15:52:25 -0500
Subject: perf: Document the new transaction sample type

Commit fdfbbd07e91f8fe3871 ("perf: Add generic transaction flags")
added support for PERF_SAMPLE_TRANSACTION but forgot to add documentation
for the sample type to include/uapi/linux/perf_event.h

Signed-off-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1312131548450.10372@pianoman.cluster.toy
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e1802d6153ae..959d454f76a1 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -679,6 +679,7 @@ enum perf_event_type {
 	 *
 	 *	{ u64			weight;   } && PERF_SAMPLE_WEIGHT
 	 *	{ u64			data_src; } && PERF_SAMPLE_DATA_SRC
+	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
-- 
cgit v1.2.3